## Libraries

In [1]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle

## Loading Data

In [2]:
# The raw CSV ships without a header row, so header=None stops pandas from
# treating the first tweet as column names.
# names= supplies our own column labels instead. Only 'label' and 'comments'
# are used further down in this notebook; the integer-named columns (1-4) are
# just placeholders for fields we do not need.

# Point the path below at wherever the downloaded file lives on your machine.

tweets = pd.read_csv(
    '/Users/rishabh/Downloads/training.1600000.processed.noemoticon.csv',
    names=['label', 1, 2, 3, 4, 'comments'],
    header=None,
    encoding='latin-1',
)

In [3]:
# looking at the data

tweets.head()

Unnamed: 0,label,1,2,3,4,comments
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Cleaning the data

In [4]:
# Tweets contain stop words (e.g. "is", "are", "the") and punctuation.
# The function below removes both from a comment.

# Shared, lazily built set of tokens to discard. It is built once instead of on
# every call because tweet_to_words is applied to every row of the data set.
_STOP_TOKENS = None


def _stop_tokens():
    """Return the cached set of English stop words plus punctuation characters."""
    global _STOP_TOKENS
    if _STOP_TOKENS is None:
        # string.punctuation is iterated character by character, so every
        # punctuation mark becomes its own entry next to the NLTK stop words.
        # A frozenset makes each membership test constant-time.
        _STOP_TOKENS = frozenset(stopwords.words("english")) | frozenset(string.punctuation)
    return _STOP_TOKENS


def tweet_to_words(text):
    """Lower-case and tokenize ``text``, dropping stop words and punctuation.

    Parameters
    ----------
    text : str
        The raw comment / tweet.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when
        every token was discarded).
    """
    stops = _stop_tokens()
    # the tokenizer separates the lower-cased sentence into words
    words = word_tokenize(text.lower())
    # keep only the tokens that are neither stop words nor punctuation
    return " ".join(w for w in words if w not in stops)

In [5]:
# The labels are taken straight from the 'label' column
target = tweets['label']

# Clean every entry of the 'comments' column with tweet_to_words.
# This is the slow step of the notebook: the data set has roughly 1.6 million
# rows and the function runs once per row, so expect it to take a while
# (the original author reports around 30 - 40 minutes).
data = tweets['comments'].apply(tweet_to_words)

## Splitting the data ( training and testing )

In [6]:
# Hold out part of the cleaned tweets for evaluation. random_state pins the
# shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=0,
)

## Converting data for training

In [28]:
# CountVectorizer converts the tweets into a matrix of token counts: one row
# per tweet, one column per vocabulary entry.
# max_features = integer x caps the number of features used for training
# instead of taking the whole vocabulary.
# ngram_range=(1,2) keeps both single words and two-word combinations as
# features. Combinations help because, for example, in "what a good devil you
# are" the word 'good' alone looks positive, while 'good devil' may make the
# whole comment negative.
v = CountVectorizer(max_features = 20000, ngram_range=(1,2))
train_features = v.fit_transform(x_train)

# Store the fitted vectorizer with pickle so that next time the whole training
# set does not have to be rebuilt just to obtain the vocabulary.
# The `with` block closes the file, so everything is flushed to disk before it
# is read back below.
with open("vectorizerpickle.pickle", "wb") as vectorizer_file:
    pickle.dump(v, vectorizer_file)

# From here on the stored object can be loaded at any time, even if the steps
# above were not run in this session.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("vectorizerpickle.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

test_features = vectorizer.transform(x_test)

In [29]:
# We will use Multinomial naive bayes as our model for classification
clf = MultinomialNB()
clf.fit(train_features, y_train)

# Cleaning and training take a lot of time, so the fitted model is saved to
# disk with pickle.
# The `with` block closes the file, so the model is fully written before it is
# loaded again.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

# some time later...

# load the model from disk
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# score() of the reloaded model on the held-out features and labels
result = loaded_model.score(test_features, y_test)
print(result)

0.76673


## Analysing the predictions

In [30]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted labels for the held-out tweets, from the classifier fitted above
Y_pred = clf.predict(test_features)

# Compare the true labels with the predictions
print("Confusion matrix : ")
print(confusion_matrix(y_test, Y_pred))

# Per-class summary report for the same predictions
print("Classification report : ")
print(classification_report(y_test, Y_pred))

Confusion matrix : 
[[156527  43207]
 [ 50101 150165]]
Classification report : 
             precision    recall  f1-score   support

          0       0.76      0.78      0.77    199734
          4       0.78      0.75      0.76    200266

avg / total       0.77      0.77      0.77    400000



## Prediction function for a sentence

In [42]:
# defining function for prediction

def predict(s):
    """Classify a comment and ask for confirmation before posting a negative one.

    The comment is cleaned with ``tweet_to_words`` (the same preprocessing the
    training tweets went through), vectorized with the pickled ``vectorizer``
    and classified by ``loaded_model``. When the predicted label is 0 the user
    is asked on stdin whether to post anyway; otherwise the comment is posted
    directly.

    Parameters
    ----------
    s : str
        The comment as typed by the user.

    Returns
    -------
    None
        The outcome is only printed.
    """
    # The model was fitted on cleaned text, so the input must be cleaned the
    # same way; vectorizing the raw sentence would produce words and word
    # pairs the model never saw during training.
    sentence = vectorizer.transform([tweet_to_words(s)])
    if loaded_model.predict(sentence)[0] == 0:
        print("Your comment might be negative/abusive. It may hurt someone. Are you sure you want to post it ?(y/n) : ")
        k = input()
        if k in ('y', 'Y'):
            # the original, uncleaned comment is what gets posted
            print("Comment posted : "+s)
        else:
            print("Thank you !")
    else:
        print("Comment posted : "+s)

s = input('Enter comment : ')
predict(s)

Enter comment : it's gonna be fine. this world is full of negativity but i hope you see the good part as well. i am with you
Comment posted : it's gonna be fine. this world is full of negativity but i hope you see the good part as well. i am with you
