## Import libs

In [1]:
import pandas as pd
import numpy as np
# Tokenizer
from sklearn.feature_extraction. text import CountVectorizer
from sklearn.model_selection import train_test_split
# ML classifier
from sklearn.tree import DecisionTreeClassifier
# NLP lib
import nltk
nltk.download('all')
import re
# words to ignore
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

Mounted at /content/drive


# Main

In [2]:
# Read Data
data = pd.read_csv("/content/drive/My Drive/labeled_data.csv")
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [3]:
# Map data to speech
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
data = data[["tweet", "labels"]]
data.head()

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive Speech
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Speech
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Speech
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Speech
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Speech


In [16]:
# Def cleaning preprocessing function
def clean(text):
  text = str(text).lower()
  # sub . and ?
  text = re.sub('[.?]', '', text) 
  # Remove https
  text = re.sub('https?://\S+|www.\S+', '', text)
  # remove <>and .? inside them
  text = re.sub('<.?>+', '', text)
  # Remove punctuation
  text = re.sub('[%s] %re.escape(string.punctuation)', '', text)
  # sub newline
  text = re.sub('\n', '', text)
  # sub word-number-word cases 
  text = re.sub('\w\d\w', '', text)
  # filter via stopword
  text = [word for word in text.split(' ') if word not in stopword]
  text = " ".join(text)
  # Replacing with stem words
  text = [stemmer.stem(word) for word in text.split(' ')]
  text = " ".join(text)
  return text


In [5]:
# apply clean
data["tweet"] = data["tweet"].apply(clean)
data.head()

Unnamed: 0,tweet,labels
0,!!! rt @mayasolovely: woman complain clean hou...,No Hate and Offensive Speech
1,!!!!! rt @mlee: boy dat coldtyga dwn bad cuffi...,Offensive Speech
2,!!!!!!! rt @urkindofbrand dawg!!!! rt @babife:...,Offensive Speech
3,!!!!!!!!! rt @c_g_anderson: @viva_bas look lik...,Offensive Speech
4,!!!!!!!!!!!!! rt @shenikaroberts: shit hear mi...,Offensive Speech


In [6]:
# Tokenize data
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x)
# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
# Model building
model = DecisionTreeClassifier()
# Training the model
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [8]:
# Testing the model
y_pred = model. predict (X_test)
# Accuracy Score of our model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.8831152952683702


In [15]:
string1 = "This is a string to try some stuff out for dimichatzakis@gmail.com. \n You can find me at https://docs.python.org/3/library/re.html. See u 2 around. U2around <.hi??>"
string2 = clean(string1)
print(string2)

string tri stuff dimichatzakis@gmailcom  find  see u 2 around round <hi>


In [17]:
string1 = "This is a string to try some stuff out for dimichatzakis@gmail.com. \n You can find me at https://docs.python.org/3/library/re.html. See u 2 around. U2around <.hi??>"
string2 = clean(string1)
print(string2)

string tri stuff dimichatzakis@gmailcom  find  see u 2 around round <hi>
