In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies


Pre-Processing

In [5]:
# Removing stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [6]:
# Removing Hashtags, Mentions and Punctuations
df['tweet'] = df['tweet'].str.replace('#', '')
df['tweet'] = df['tweet'].str.replace('@', '')
df['tweet'] = df['tweet'].str.replace('[^\w\s]','')

# Removing Numbers
df['tweet'] = df['tweet'].str.replace('[0-9]','')

# Removing Emojis
import re
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
df['tweet'] = df['tweet'].apply(lambda x: remove_emoji(x))


In [7]:
df

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT mayasolovely: As woman complain cleanin...
1,3,0,3,0,1,!!!!! RT mleew17: boy dats cold...tyga dwn bad...
2,3,0,3,0,1,!!!!!!! RT UrKindOfBrand Dawg!!!! RT 80sbaby4l...
3,3,0,2,1,1,!!!!!!!!! RT C_G_Anderson: viva_based look lik...
4,6,0,6,0,1,!!!!!!!!!!!!! RT ShenikaRoberts: The shit hear...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you's muthaf***in lie &8220;LifeAsKing: 20_Pea...
24779,3,0,1,2,2,"gone broke wrong heart baby, drove redneck crazy"
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin lies


In [8]:
# tokenization
from nltk.tokenize import word_tokenize
df['tweet'] = df['tweet'].apply(lambda x: word_tokenize(x))

# Stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
df['tweet'] = df['tweet'].apply(lambda x: " ".join([st.stem(word) for word in x]))

# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['tweet'] = df['tweet'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
df

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,! ! ! rt mayasolov : a woman complain clean ho...
1,3,0,3,0,1,! ! ! ! ! rt mleew17 : boy dat cold ... tyga d...
2,3,0,3,0,1,! ! ! ! ! ! ! rt urkindofbrand dawg ! ! ! ! rt...
3,3,0,2,1,1,! ! ! ! ! ! ! ! ! rt c_g_anderson : viva_bas l...
4,6,0,6,0,1,! ! ! ! ! ! ! ! ! ! ! ! ! rt shenikarobert : t...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you 's muthaf * * * in lie & 8220 ; lifeask : ...
24779,3,0,1,2,2,"gone broke wrong heart babi , drove redneck crazi"
24780,3,0,3,0,1,young buck wan na eat ! ! .. dat nigguh like i...
24781,6,0,6,0,1,youu got wild bitch tellin lie


In [9]:
# removing punctuation at start and end of words
df['tweet'] = df['tweet'].str.replace(r'^\W+|\W+$', '')
df

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,! ! ! rt mayasolov : a woman complain clean ho...
1,3,0,3,0,1,! ! ! ! ! rt mleew17 : boy dat cold ... tyga d...
2,3,0,3,0,1,! ! ! ! ! ! ! rt urkindofbrand dawg ! ! ! ! rt...
3,3,0,2,1,1,! ! ! ! ! ! ! ! ! rt c_g_anderson : viva_bas l...
4,6,0,6,0,1,! ! ! ! ! ! ! ! ! ! ! ! ! rt shenikarobert : t...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you 's muthaf * * * in lie & 8220 ; lifeask : ...
24779,3,0,1,2,2,"gone broke wrong heart babi , drove redneck crazi"
24780,3,0,3,0,1,young buck wan na eat ! ! .. dat nigguh like i...
24781,6,0,6,0,1,youu got wild bitch tellin lie


In [11]:
# Splitting the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['class'], test_size = 0.2, random_state = 42)

# Vectorizing the dataset
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [12]:
# Training the model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

# Predicting the test set results
y_pred = model.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm


array([[  11,  253,   26],
       [  12, 3773,   47],
       [   3,  351,  481]])

In [13]:
# Training SVC model
from sklearn.svm import SVC
model2 = SVC(kernel='linear')
model2.fit(X_train, y_train)

# Predicting the test set results
y_pred2 = model2.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm2 = confusion_matrix(y_test, y_pred2)
cm2

array([[  80,  186,   24],
       [ 107, 3617,  108],
       [  25,   85,  725]])

In [14]:
# Training Random Forest model
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier(n_estimators=1000, random_state=0)
model3.fit(X_train, y_train)

# Predicting the test set results
y_pred3 = model3.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm3 = confusion_matrix(y_test, y_pred3)
print(cm3)

# Calculating the accuracy score
from sklearn.metrics import accuracy_score
print("Accuracy for Naive Bayes: ", accuracy_score(y_test, y_pred))
print("Accuracy for SVC: ", accuracy_score(y_test, y_pred2))
print("Accuracy for Random Forest: ", accuracy_score(y_test, y_pred3))


[[  64  204   22]
 [  52 3686   94]
 [  10  136  689]]
Accuracy for Naive Bayes:  0.8603994351422232
Accuracy for SVC:  0.892071817631632
Accuracy for Random Forest:  0.8955013112769821
