In [1]:
#importations
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import csv

In [70]:
#function definition
def clean_tweet(tweet):
    """Normalise one tweet/comment into lowercase text without links, punctuation or digits.

    Args:
        tweet: The raw text. Any value is accepted because it is converted
            with ``str()`` first.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so removed tokens
        may leave consecutive spaces behind.
    """
    tweet = str(tweet)
    # Strip links up to the next space. Both http:// and https:// are handled;
    # otherwise https links would survive and be mashed into a single junk
    # token once punctuation is removed below.
    tweet = re.sub(r"https?://[^ ]*", "", tweet)
    # Remove the HTML-escaped ampersand before punctuation stripping would
    # turn it into the stray word "amp".
    tweet = tweet.replace("&amp;", "")
    # Keep only word characters and whitespace, then drop digits.
    tweet = re.sub(r"[^\w\s]", "", tweet)
    tweet = re.sub(r"\d", "", tweet)
    return tweet.lower()
def remove_rare_words(tweet, rar):
    """Drop every space-separated token of ``tweet`` that is contained in ``rar``.

    Args:
        tweet: Text to filter; it is split on single spaces.
        rar: Container of words to remove. Membership is tested with ``in``,
            so for a pandas Series (which is what clean_dataset passes) the
            lookup is against the index labels, not the values.

    Returns:
        str: The surviving tokens, each followed by one space (so a non-empty
        result always ends with a space).
    """
    kept = [token for token in tweet.split(" ") if token not in rar]
    return "".join(token + " " for token in kept)
def remove_common_words(tweet, com):
    """Drop every space-separated token of ``tweet`` that is contained in ``com``.

    Args:
        tweet: Text to filter; it is split on single spaces.
        com: Container of words to remove (membership is tested with ``in``).

    Returns:
        str: The surviving tokens, each followed by one space.
    """
    pieces = []
    for token in tweet.split(" "):
        if token in com:
            continue
        pieces.append(token)
        pieces.append(" ")
    return "".join(pieces)
def remove_single_letter_words(tweet):
    """Drop the tokens of ``tweet`` that are exactly one character long ('m', 'n', 'I', ...).

    Args:
        tweet: Text to filter; it is split on single spaces.

    Returns:
        str: The surviving tokens, each followed by one space. Empty tokens
        produced by consecutive spaces are kept because their length is 0.
    """
    return "".join(
        token + " "
        for token in tweet.split(" ")
        if len(token) != 1
    )
def clean_dataset(dataset):
    """Clean a whole column of tweets/comments.

    Steps, in order:
      1. normalise every entry with clean_tweet;
      2. count word frequencies over the whole cleaned column and take the
         last 10 entries of value_counts() (sorted most-frequent first by
         default, so these are the least frequent words);
      3. remove those rare words from every entry;
      4. remove single-letter words from every entry.

    Removal of the most common words is currently disabled.

    Args:
        dataset: pandas Series of raw texts.

    Returns:
        pandas Series of cleaned texts (a new Series; the input is not modified).
    """
    cleaned = dataset.apply(clean_tweet)
    word_counts = pd.Series(' '.join(cleaned).split()).value_counts()
    rare_words = word_counts[-10:]  # tail of the frequency table = rarest words
    without_rare = cleaned.apply(lambda text: remove_rare_words(text, rare_words))
    return without_rare.apply(remove_single_letter_words)

In [71]:
# Load the training and testing sets from CSV files located in the current
# working directory. Later cells read the 'comment_text' column of both frames
# and the 'toxic' column of the training frame.
train_data = pd.read_csv('train_preprocessed.csv')
test_data = pd.read_csv('test_preprocessed.csv')

In [72]:
# Clean the 'comment_text' column of both sets, overwriting the column in place
# (the raw text is no longer available afterwards without reloading the CSVs).
# NOTE(review): each set is cleaned independently, so the rare words removed by
# clean_dataset are computed separately for train and test.
train_data['comment_text'] = clean_dataset(train_data['comment_text'])
test_data['comment_text'] = clean_dataset(test_data['comment_text'])

In [73]:
# Fit a bag-of-words vocabulary on the cleaned training comments and encode
# them as a term-count matrix.
#   ngram_range=(1,2): use single words and pairs of consecutive words
#   min_df=3:          ignore terms that appear in fewer than 3 comments
#   max_df=0.9:        ignore terms that appear in more than 90% of comments
count_vect = CountVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9)
train_data_counts = count_vect.fit_transform(train_data['comment_text'])

In [74]:
# Re-weight the training term counts with TF-IDF.
# smooth_idf and sublinear_tf are boolean switches: pass real booleans, because
# recent scikit-learn releases validate parameter types and reject the integer 1.
tfidf_transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True)
train_data_tfidf = tfidf_transformer.fit_transform(train_data_counts)

In [75]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the
# training comments. Only the 'toxic' column is used as the target.
# Debug prints and an alternative combined target, kept for reference:
#print(train_data_tfidf.shape)
#print(train_data['toxic'].shape)
#Y = train_data['insult'] + train_data['threat'] + train_data['toxic']
Y = train_data['toxic']
clf = MultinomialNB().fit(train_data_tfidf, Y)

In [76]:
# Encode the cleaned test comments with the vocabulary and IDF weights that
# were fitted on the training set (transform only -- nothing is re-fitted here).
test_data_counts = count_vect.transform(test_data['comment_text'])
test_data_tfidf = tfidf_transformer.transform(test_data_counts)

In [77]:
# Predict one label per test comment; `predicted` is aligned row-by-row with
# test_data and is consumed by the display and CSV export cells.
predicted = clf.predict(test_data_tfidf)

In [69]:
# Print every cleaned test comment (as its repr) next to its predicted label.
# NOTE(review): this prints one line per test row, which floods the notebook
# output for a large test set -- consider slicing before sharing.
for tweet, category in zip(test_data['comment_text'], predicted):
        print('%r => %s' % (tweet, category))

'yo bitch ja rule is more succesful then you ll ever be whats up with you and hating you sad mo  fuck  as should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me  ja rule is about pride in da music man  dont diss that  shit  on him  and nothin is wrong bein like tupac he was brother too fuck  in white boys get things right next time    ' => 0.0


In [78]:
# Write the submission file: a header row followed by one
# (comment_text, predicted label) row per test comment, every field quoted.
header = ["comment_text","toxic"]
rows = zip(test_data['comment_text'],predicted)
# newline='' is what the csv module requires for files it writes to: without it
# the '\n' line terminator is translated to '\r\n' on Windows, including inside
# quoted fields. The explicit encoding keeps non-ASCII comments from failing
# or changing with the platform's default code page.
with open('sample_submission.csv', 'w', newline='', encoding='utf-8') as submission:
    wr = csv.writer(submission, delimiter=',',lineterminator='\n', quoting=csv.QUOTE_ALL)
    wr.writerow(header)
    wr.writerows(rows)

In [56]:
# Persist the fitted classifier, count vectorizer and TF-IDF transformer so the
# whole pipeline can be restored later with joblib.load(<filename>).
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone joblib package (a scikit-learn dependency) and fall
# back to the bundled copy only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

filename = 'finalized_model.sav'
joblib.dump(clf, filename)

filename = 'finalized_countvectorizer.sav'
joblib.dump(count_vect, filename)

filename = 'finalized_tfidftransformer.sav'
joblib.dump(tfidf_transformer, filename)

['finalized_tfidftransformer.sav']

In [53]:
# Sanity check: run one hand-written comment through the trained pipeline.
# Dedicated names are used on purpose: reusing test_data_counts,
# test_data_tfidf and predicted would overwrite the full test-set results, and
# re-running the CSV export afterwards would then pair the whole test set with
# a single prediction.
sample_counts = count_vect.transform([clean_tweet("adorable")])
sample_tfidf = tfidf_transformer.transform(sample_counts)
sample_predicted = clf.predict(sample_tfidf)
print(sample_predicted)

#use comment model.py

[0.]
