In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
import re
import nltk
from nltk.util import pr
# English Snowball stemmer. NOTE(review): no cell visible in this notebook calls it;
# it is kept because other code may rely on the name.
stemmer=nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string

# English stop-word set consulted by the tweet-cleaning step.
# On a fresh environment the NLTK "stopwords" corpus may not be installed, in which
# case the lookup raises LookupError; fetch the corpus once and retry so the notebook
# survives Restart & Run All.
try:
  stopword=set(stopwords.words("english"))
except LookupError:
  nltk.download("stopwords", quiet=True)
  stopword=set(stopwords.words("english"))

In [3]:
# Read the labelled tweets from the CSV in the working directory,
# then print the first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="HateSpeechData.csv")
print(df.iloc[:5])

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [4]:
# Add a human-readable `labels` column derived from the numeric `class` codes.
# Codes missing from the mapping come out as NaN (Series.map semantics).
df['labels'] = df['class'].map(
    {
        0: "Hate Speech detected",
        1: "Offensive language detected",
        2: "No hate and offensive speech",
    }
)
print(df.iloc[:5])

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  
0  No hate and offensive speech  
1   Offensive language detected  
2   Offensive language detected  
3   Offensive language detected  
4   Offensive language detected  


In [5]:
# Keep only the raw text and its readable label; every other column is dropped.
df = df.loc[:, ['tweet', 'labels']]
df.head()

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No hate and offensive speech
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive language detected
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive language detected
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive language detected
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive language detected


In [6]:
def clean(text):
  """Normalise one tweet into a space-separated string of lowercase tokens.

  Steps, in order: lowercase; drop ``[...]`` spans; drop http(s)/www URLs;
  drop ``<...>`` tags; delete ASCII punctuation; drop any token containing a
  digit; split on whitespace and discard tokens found in the module-level
  ``stopword`` set.

  Args:
    text: Value to clean. It is converted with ``str()`` first, so non-string
      input (e.g. NaN) is cleaned as its string form.

  Returns:
    The remaining tokens joined by single spaces (possibly an empty string).
  """
  text = str(text).lower()
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>+', '', text)
  # Punctuation is deleted (not replaced by a space), so "shouldn't" -> "shouldnt".
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  # Split on any whitespace run: newlines and tabs act as word separators rather
  # than being deleted (which glued neighbouring words together), and no empty
  # tokens are produced, so the result has no leading/trailing/double spaces.
  tokens = [word for word in text.split() if word not in stopword]
  return " ".join(tokens)

# Run every tweet through clean() and overwrite the column with the result,
# then print the first five rows to eyeball the cleaned text.
df["tweet"] = df["tweet"].map(clean)
print(df.iloc[:5])

                                               tweet  \
0   rt mayasolovely woman shouldnt complain clean...   
1   rt  boy dats coldtyga dwn bad cuffin dat hoe ...   
2   rt urkindofbrand dawg rt  ever fuck bitch sta...   
3           rt cganderson vivabased look like tranny   
4   rt shenikaroberts shit hear might true might ...   

                         labels  
0  No hate and offensive speech  
1   Offensive language detected  
2   Offensive language detected  
3   Offensive language detected  
4   Offensive language detected  


In [7]:
# Features: bag-of-words counts over the cleaned tweets. Targets: the label strings.
x=np.array(df["tweet"])
y=np.array(df["labels"])
cv=CountVectorizer()
x=cv.fit_transform(x)
# Hold out 33% of the rows; the split is seeded so it is the same on every run.
# NOTE(review): X_test / y_test are not used by any cell visible in this notebook.
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=42)
# Seed the tree as well: an unseeded DecisionTreeClassifier can pick different splits
# between runs, which would make the predictions in the cells below irreproducible.
clf=DecisionTreeClassifier(random_state=42)
clf.fit(X_train,y_train)

In [8]:
# Classify one ad-hoc sentence with the fitted vectorizer and tree.
# The text goes through clean() so it is normalised the same way as the training
# tweets, and the feature matrix gets its own name so the `df` DataFrame stays
# available for re-running earlier cells.
test_data="I will kill you"
features=cv.transform([clean(test_data)]).toarray()
print(clf.predict(features))

['Hate Speech detected']


In [9]:
# Classify one ad-hoc sentence with the fitted vectorizer and tree.
# The text goes through clean() so it is normalised the same way as the training
# tweets, and the feature matrix gets its own name so the `df` DataFrame stays
# available for re-running earlier cells.
test_data="You are awesome"
features=cv.transform([clean(test_data)]).toarray()
print(clf.predict(features))

['No hate and offensive speech']


In [10]:
# Classify one ad-hoc sentence with the fitted vectorizer and tree.
# The text goes through clean() so it is normalised the same way as the training
# tweets, and the feature matrix gets its own name so the `df` DataFrame stays
# available for re-running earlier cells.
test_data="You are awesome bitch"
features=cv.transform([clean(test_data)]).toarray()
print(clf.predict(features))

['Offensive language detected']


In [11]:
# Classify one ad-hoc sentence with the fitted vectorizer and tree.
# The text goes through clean() so it is normalised the same way as the training
# tweets (e.g. "don't" becomes "dont", matching the training vocabulary), and the
# feature matrix gets its own name so the `df` DataFrame stays available for
# re-running earlier cells.
test_data="You are bad i don't like you"
features=cv.transform([clean(test_data)]).toarray()
print(clf.predict(features))

['Offensive language detected']
