# Project

## Importing Libraries

In [21]:
import string,re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier,VotingClassifier,AdaBoostClassifier,ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

np.random.seed(42)

## Importing Dataset

In [2]:
train_dataset=pd.read_csv("hate speech/train.csv")


In [3]:
train_dataset.head(100)

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
95,3,0,3,0,1,"""@CauseWereGuys: Going back to school sucks mo..."
96,3,0,3,0,1,"""@CauseWereGuys: On my way to fuck yo bitch ht..."
97,3,0,3,0,1,"""@CeleyNichole: @white_thunduh how come you ne..."
98,3,0,3,0,1,"""@ChadMFVerbeck: If Richnow doesn't show up wi..."


In [4]:
temp=train_dataset['tweet'][0]

In [5]:
train_dataset.isna().sum()

count                       0
hate_speech_count           0
offensive_language_count    0
neither_count               0
class                       0
tweet                       0
dtype: int64

In [6]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   count                     24783 non-null  int64 
 1   hate_speech_count         24783 non-null  int64 
 2   offensive_language_count  24783 non-null  int64 
 3   neither_count             24783 non-null  int64 
 4   class                     24783 non-null  int64 
 5   tweet                     24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


In [7]:
temp

"!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."

In [8]:
temp_words=word_tokenize(temp)


In [9]:
stop_words=stopwords.words('english')
punctuations=string.punctuation
def preprocess(obj):
    words=obj.split()
    for word in words:
        if word.startswith(('@','!')):
            index_to_delete=words.index(word)
            words.pop(index_to_delete)
    words=' '.join(words)
    words=word_tokenize(words)
    corpus=[word.lower() for word in words if word not in punctuations and word not in stop_words and word !='RT' and word!='`' and word!='~' and word!='"' and  word!='...' and word!='.']
    corpus=' '.join(corpus)
    print(corpus)
    return corpus

In [10]:
train_dataset['tweet']=train_dataset['tweet'].apply(preprocess)

as woman n't complain cleaning house amp man always take trash
boy dats cold tyga dwn bad cuffin dat hoe 1st place
dawg you ever fuck bitch start cry you confused shit
viva_based look like tranny
the shit hear might true might faker bitch told ya 57361
the shit blows .. claim faithful somebody still fucking hoes 128514 128514 128514 ''
i sit hate another bitch .. i got much shit going ''
cause i 'm tired big bitches coming us skinny girls 8221
`` amp might get ya bitch back amp thats ``
`` hobbies include fighting mariam '' bitch
`` keeks bitch curves everyone `` lol i walked conversation like smh
`` murda gang bitch gang land ``
`` so hoes smoke losers `` yea go ig
`` bad bitches thing like ``
`` bitch get ``
`` bitch nigga miss ``
`` bitch plz whatever ``
`` bitch love ``
`` bitches get cut everyday b ``
`` black bottle amp bad bitch ``
`` broke bitch cant tell nothing ``
`` cancel bitch like nino ``
`` cant see hoes wont change ``
`` fuck bitch dont even suck dick `` 128514 128514 1

In [11]:
train_dataset['tweet']=train_dataset['tweet'].apply(lambda x: re.sub('1-9','',x))

In [12]:
train_dataset['tweet']

0        as woman n't complain cleaning house amp man a...
1        boy dats cold tyga dwn bad cuffin dat hoe 1st ...
2        dawg you ever fuck bitch start cry you confuse...
3                              viva_based look like tranny
4        the shit hear might true might faker bitch tol...
                               ...                        
24778    's muthaf lie 8220 lifeasking corey_emanuel ri...
24779    've gone broke wrong heart baby drove redneck ...
24780    young buck wan na eat .. dat nigguh like i ain...
24781                    youu got wild bitches tellin lies
24782    ~~ruffled ntac eileen dahlia beautiful color c...
Name: tweet, Length: 24783, dtype: object

the tweets are much cleaner than before, tho there is scope of improvement 

## Data Transformation

In [13]:
c_vectorizer=CountVectorizer()
tfidf=TfidfVectorizer()
c_vectors=c_vectorizer.fit_transform(train_dataset['tweet'])
tfidf_vectors=tfidf.fit_transform(train_dataset['tweet'])
tfidf_vectors=tfidf_vectors.toarray()
c_vectors=c_vectors.toarray()

In [14]:
estimators=int(len(c_vectors)*0.8)

## Training models

In [15]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import BaggingClassifier,VotingClassifier,AdaBoostClassifier,ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier
# from sklearn.svm import SVC
# clfs={
    # 'LogisticRegression':LogisticRegression(),
    # 'Bagging' :BaggingClassifier(),
    # 'Ada':AdaBoostClassifier(),
    # 'ExtraTrees':ExtraTreesClassifier(),
    # 'SVC':SVC(),
    # 'RandomForest':RandomForestClassifier(),
    # 'GradientBoosting':GradientBoostingClassifier(),
    # 'HistGradientBoosting':HistGradientBoostingClassifier(),
    # 'mlp':MLPClassifier(hidden_layer_sizes=[1,2,3,3,2,1],learning_rate='adaptive')
# }

In [16]:
y=train_dataset['count']
x=tfidf_vectors
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)

In [17]:
# for name,clf in clfs.items():
#     print(f'{name} being trained...')
#     clf.fit(xtrain,ytrain)
#     ypred=clf.predict(xtest)
#     print(f'accuracy attained: {accuracy_score(ytest,ypred)}')


In [18]:
# mnb=MultinomialNB()
# gnb=GaussianNB()
# bnb=BernoulliNB()

# mnb.fit(xtrain,ytrain)
# gnb.fit(xtrain,ytrain)
# bnb.fit(xtrain,ytrain)

# ypred_mnb=mnb.predict(xtest)
# ypred_gnb=gnb.predict(xtest)
# ypred_bnb=bnb.predict(xtest)
# print(f'MultinomialNB accuracy: {accuracy_score(ytest,ypred_mnb)}')
# print(f'GaussianNB accuracy: {accuracy_score(ytest,ypred_gnb)}')
# print(f'BernoulliNB accuracy: {accuracy_score(ytest,ypred_bnb)}')

In [20]:
mlp=MLPClassifier(hidden_layer_sizes=[2],learning_rate='adaptive')
mlp.fit(xtest,ytest)
ypred_mlp=mlp.predict(xtest)

print(f'MLP accuracy: {accuracy_score(ytest,ypred_mlp)}')



MLP accuracy: 0.9834577365341941


In [23]:
pickle.dump(mlp,open('mlp_model.pkl','wb'))