In [2]:
from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
from nltk.corpus import stopwords
import string

In [3]:
#stemming algorithm that reduces words to their root form (e.g., "running" becomes "run")
stemmer = nltk.SnowballStemmer("english")

In [4]:
#stopwords.words('english'): This part of the code retrieves a list of common English stop words (words like "the", "a", "is", etc.) from the NLTK corpus.
nltk.download('stopwords')
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/twitter.csv")

In [6]:
print(data.head)

<bound method NDFrame.head of        Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0               0      3            0                   0        3      2   
1               1      3            0                   3        0      1   
2               2      3            0                   3        0      1   
3               3      3            0                   2        1      1   
4               4      6            0                   6        0      1   
...           ...    ...          ...                 ...      ...    ...   
24778       25291      3            0                   2        1      1   
24779       25292      3            0                   1        2      2   
24780       25294      3            0                   3        0      1   
24781       25295      6            0                   6        0      1   
24782       25296      3            0                   0        3      2   

                                             

In [7]:
data["labels"] = data["class"].map({
    0:"Hate Speech",
    1:"Offensive Language",
    2:"No Hate and Offensive"
})
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet                 labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language  


In [8]:
data = data[["tweet", "labels"]]
print(data.head())

                                               tweet                 labels
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language


In [9]:
def clean(text):
  text = str(text).lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\w*\d\w*', '', text)
  text = [word for word in text.split(' ') if word not in stopword]
  text=" ".join(text)
  text = [stemmer.stem(word) for word in text.split(' ')]
  text=" ".join(text)
  return text

In [10]:
data["tweet"] = data["tweet"].apply(clean)


In [11]:
print(data["tweet"].head())

0     rt mayasolov woman shouldnt complain clean ho...
1     rt  boy dat coldtyga dwn bad cuffin dat hoe  ...
2     rt urkindofbrand dawg rt  ever fuck bitch sta...
3               rt cganderson vivabas look like tranni
4     rt shenikarobert shit hear might true might f...
Name: tweet, dtype: object


In [12]:
x = np.array(data["tweet"])
y = np.array(data["labels"])

cv = CountVectorizer()
X = cv.fit_transform(x) # Fit and transform the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

In [13]:
sample = input("Ente a comment: ")
data = cv.transform([sample]).toarray()
print(clf.predict(data))

Ente a comment: Let's unite and kill all the people who are protesting against the government
['Hate Speech']


In [15]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

                       precision    recall  f1-score   support

          Hate Speech       0.36      0.31      0.33       465
No Hate and Offensive       0.81      0.82      0.81      1379
   Offensive Language       0.92      0.93      0.92      6335

             accuracy                           0.87      8179
            macro avg       0.70      0.69      0.69      8179
         weighted avg       0.87      0.87      0.87      8179

