In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('punkt')
stemmer = nltk.SnowballStemmer("english")
import string
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/ml/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ml/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read the data sets using python.

In [2]:
df = pd.read_csv("labeled_data.csv")

### Get the shape of data.

In [3]:
print("Number of rows in dataframe 1 : {}".format(df.shape[0]))
print("Number of columns in dataframe 1: {}".format(df.shape[1]))


Number of rows in dataframe 1 : 24783
Number of columns in dataframe 1: 7


### Show first 8 rows of data using data head

In [4]:
df.head(8)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...


### Show last 7 rows of data using data tail.

In [5]:
df.tail(7)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
24776,25289,3,3,0,0,0,you're all niggers
24777,25290,3,2,1,0,0,you're such a retard i hope you get type 2 dia...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies
24782,25296,3,0,0,3,2,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...


### Pre-processing

In [6]:
df = df.drop(['Unnamed: 0','count','hate_speech','offensive_language','neither'],axis=1)

In [7]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [8]:
filtered_df = df[df['class'].isin([1, 2])] #label "2" = normal tweet, lable "1" = Offensive/Abusive tweet
filtered_df['class'].replace(2, 0, inplace=True) # Replace 2 with 0 in the "class" column of filtered_df
filtered_df['tweet_processed']=filtered_df['tweet'].apply(clean_text)
filtered_df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['class'].replace(2, 0, inplace=True) # Replace 2 with 0 in the "class" column of filtered_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['tweet_processed']=filtered_df['tweet'].apply(clean_text)


Unnamed: 0,class,tweet,tweet_processed
0,0,!!! RT @mayasolovely: As a woman you shouldn't...,rt mayasolov woman shouldnt complain clean ho...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dat coldtyga dwn bad cuffin dat hoe ...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt urkindofbrand dawg rt ever fuck bitch sta...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt cganderson vivabas look like tranni
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shenikarobert shit hear might true might f...


In [9]:
filtered_df['class'].value_counts()

1    19190
0     4163
Name: class, dtype: int64

In [10]:
grouped = filtered_df.groupby('class')

# Sample 4000 rows from each group
sampled_df = grouped.apply(lambda x: x.sample(n=4163, replace=True) if len(x) > 4163 else x)
sampled_df = sampled_df.drop("tweet",axis=1)
# Reset the index of the sampled DataFrame
sampled_df.reset_index(drop=True, inplace=True)
sampled_df.head()

Unnamed: 0,class,tweet_processed
0,0,rt mayasolov woman shouldnt complain clean ho...
1,0,momma said pussi cat insid doghous
2,0,simplyaddictedtoguy woof woof hot scalli lad
3,0,allaboutmanfeet woof woof hot sole
4,0,allyhaaaaa lemmi eat oreo amp dish one oreo lol


In [11]:
sampled_df['class'].value_counts()

0    4163
1    4163
Name: class, dtype: int64

In [12]:
y = sampled_df['class']

In [13]:
# Creating the TFIDF model
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(sampled_df['tweet_processed']).toarray()

In [15]:
# Train Test Split
# 66 % for training and 345 for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.34, random_state = 0)


# Naive Bayes Multinomial

In [16]:
# Create a MultinomialNB Classifier
naive_bayes_model = MultinomialNB().fit(X_train, y_train)
#prediction on test data
y_pred_na =naive_bayes_model.predict(X_test)
# accuracy score
score_na=accuracy_score(y_test,y_pred_na)
print(f"Accuracy score for Random Forest Model is {score_na}")
print("Confusion matrix for Naive Bayes Multinomial model \n\n",confusion_matrix(y_pred_na,y_test))

Accuracy score for Random Forest Model is 0.9081596608972095
Confusion matrix for Naive Bayes Multinomial model 

 [[1245   85]
 [ 175 1326]]


# Random Forest

In [17]:

# Create a Random Forest Classifier
random_forest_model = RandomForestClassifier()

# Fit the model to the training data
random_forest_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = random_forest_model.predict(X_test)
score_rf=accuracy_score(y_test,y_pred_rf)
print(f"Accuracy score for Random Forest Model is {score_rf}")
print("Confusion matrix for Random forest model \n\n",confusion_matrix(y_pred_rf,y_test))

Accuracy score for Random Forest Model is 0.9516072059342988
Confusion matrix for Random forest model 

 [[1380   97]
 [  40 1314]]


# Decision Tree

In [18]:

# Create a Decision Tree Classifier
decision_tree_model = DecisionTreeClassifier()

# Fit the model to the training data
decision_tree_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_dt = decision_tree_model.predict(X_test)
score_dt=accuracy_score(y_test,y_pred_dt)
print(f"Accuracy score for Decision Tree is {score_dt}")
print("Confusion matrix for Decision Tree \n\n",confusion_matrix(y_pred_dt,y_test))

Accuracy score for Decision Tree is 0.9339456022606852
Confusion matrix for Decision Tree 

 [[1329   96]
 [  91 1315]]


In [20]:
import pickle

In [21]:
pickle.dump(tv, open("tfidf.pickle", "wb"))

In [22]:
pickle.dump(random_forest_model, open("rf_model.pickle", "wb"))