In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics import f1_score, accuracy_score
import time
from gensim.utils import simple_preprocess
import numpy as np
from sklearn.model_selection import train_test_split
from nlpaug.augmenter.word import SynonymAug
import nlpaug.augmenter.word as naw
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

Loading the data

In [2]:
df = pd.read_csv('re_dataset.csv', encoding='cp1252')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13169 entries, 0 to 13168
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet          13169 non-null  object
 1   HS             13169 non-null  int64 
 2   Abusive        13169 non-null  int64 
 3   HS_Individual  13169 non-null  int64 
 4   HS_Group       13169 non-null  int64 
 5   HS_Religion    13169 non-null  int64 
 6   HS_Race        13169 non-null  int64 
 7   HS_Physical    13169 non-null  int64 
 8   HS_Gender      13169 non-null  int64 
 9   HS_Other       13169 non-null  int64 
 10  HS_Weak        13169 non-null  int64 
 11  HS_Moderate    13169 non-null  int64 
 12  HS_Strong      13169 non-null  int64 
dtypes: int64(12), object(1)
memory usage: 1.3+ MB


In [4]:
df.duplicated().sum()

125

In [5]:
df = df.drop_duplicates()
df.duplicated().sum()

0

In [6]:
df.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


Turning the Multi-labeled data into Single-Labeled data

In [7]:
df = df.drop(columns=['Abusive'])
sums = ['HS', 'HS_Individual', 'HS_Group', 'HS_Religion', 'HS_Race', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong']
df['Hate_Speech'] = df[sums].sum(axis=1)
df['Hate_Speech'] = df['Hate_Speech'].apply(lambda x: 1 if x > 0 else 0)
df = df.drop(columns=sums)
df.head()

Unnamed: 0,Tweet,Hate_Speech
0,- disaat semua cowok berusaha melacak perhatia...,1
1,RT USER: USER siapa yang telat ngasih tau elu?...,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13044 entries, 0 to 13168
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Tweet        13044 non-null  object
 1   Hate_Speech  13044 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 305.7+ KB


Pre-Processing

In [9]:
def data_preprocessing(tweet):
    tweet = re.sub(r'\b\\x\S\S', '',tweet)
    tweet = re.sub(r'\B\\x\S+', '',tweet)
    tweet = tweet.lower()
    tweet = re.sub(r'ð', '', tweet)
    tweet = re.sub('[^0-9a-zA-Z]+', ' ', tweet)
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub('\n',' ',tweet)
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ', tweet)
    tweet = re.sub('  +', ' ', tweet)
    tweet = re.sub(r'pic.twitter.com.[\w]+', '', tweet)
    tweet = re.sub('gue','saya', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)
    if tweet.startswith("rt user"): return False
    tweet = re.sub('user',' ', tweet)
    return tweet

In [10]:
df.Tweet = df['Tweet'].apply(data_preprocessing)
df.head(10)

Unnamed: 0,Tweet,Hate_Speech
0,disaat semua cowok berusaha melacak perhatian...,1
1,False,0
2,41 kadang aku berfikir kenapa aku tetap percay...,0
3,aku itu aku n nku tau matamu sipit tapi di...,0
4,kaum cebong kapir udah keliatan dongoknya ...,1
5,ya bani taplak dkk,1
6,deklarasi pilkada 2018 aman dan anti hoax warg...,0
7,saya baru aja kelar re watch aldnoah zero pali...,0
8,nah admin belanja satu lagi port terbaik nak m...,0
9,enak lg klo smbil ngewe,0


In [11]:
df.drop(df[df['Tweet'] == False].index, inplace=True)
df.head(10)

Unnamed: 0,Tweet,Hate_Speech
0,disaat semua cowok berusaha melacak perhatian...,1
2,41 kadang aku berfikir kenapa aku tetap percay...,0
3,aku itu aku n nku tau matamu sipit tapi di...,0
4,kaum cebong kapir udah keliatan dongoknya ...,1
5,ya bani taplak dkk,1
6,deklarasi pilkada 2018 aman dan anti hoax warg...,0
7,saya baru aja kelar re watch aldnoah zero pali...,0
8,nah admin belanja satu lagi port terbaik nak m...,0
9,enak lg klo smbil ngewe,0
10,setidaknya gw punya jari tengah buat lu sebelo...,1


In [12]:
df[df.duplicated(['Tweet'], keep=False)]

Unnamed: 0,Tweet,Hate_Speech
40,gapernah mendalami al quran ya bang pantesan m...,1
144,ppp jangan buru buru tolak wacana pilkada lewa...,0
239,monyet turun ke pemukiman warga cibadak sukabu...,0
288,,1
318,,0
...,...,...
12597,warga baduy setelah selesai memperingati tradi...,0
12653,jatim anti berita hoax pilkada aman dan damai,0
12680,sekali dalam setahun selama ratusan tahun mas...,0
12875,lengserkan jokowi sekarang juga,1


In [13]:
df = df.drop_duplicates(subset=['Tweet'])
df.duplicated().sum()

0

In [14]:
df[df.duplicated(['Tweet'], keep=False)]

Unnamed: 0,Tweet,Hate_Speech


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12066 entries, 0 to 13168
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Tweet        12066 non-null  object
 1   Hate_Speech  12066 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 282.8+ KB


Spelling Corrections

In [16]:
kamus_alay = pd.read_csv('new_kamusalay.csv', encoding='cp1252', header=None)
kamus_alay.columns = ["Original", "Corrected"]
alay_dict = dict(zip(kamus_alay['Original'], kamus_alay['Corrected']))
print(kamus_alay)

                  Original                          Corrected
0      anakjakartaasikasik           anak jakarta asyik asyik
1             pakcikdahtua                  pak cik sudah tua
2           pakcikmudalagi                  pak cik muda lagi
3              t3tapjokowi                       tetap jokowi
4                       3x                          tiga kali
...                    ...                                ...
15162            mendikbud  menteri pendidikan dan kebudayaan
15163               mendag                menteri perdagangan
15164              menaker               menteri tenaga kerja
15165             memetwit                            mentwit
15166             megangin                           memegang

[15167 rows x 2 columns]


In [17]:
def normalize_text(tweet):
    for word in kamus_alay['Original']:
        return ' '.join([alay_dict[word] if word in alay_dict else word for word in tweet.split(' ')])

In [18]:
df.Tweet = df['Tweet'].apply(normalize_text)

In [19]:
df.head(10)

Unnamed: 0,Tweet,Hate_Speech
0,di saat semua cowok berusaha melacak perhatia...,1
2,41 kadang aku berpikir kenapa aku tetap percay...,0
3,aku itu aku dan ku tau matamu sipit tapi d...,0
4,kaum cebong kafir sudah kelihatan dongokny...,1
5,ya bani taplak dan kawan kawan,1
6,deklarasi pilihan kepala daerah 2018 aman dan ...,0
7,saya baru saja selesai re watch aldnoah zero p...,0
8,nah admin belanja satu lagi port terbaik nak m...,0
9,enak lagi kalau sambil ngewe,0
10,setidaknya gue punya jari tengah buat kamu seb...,1


Stop words removal

In [20]:
stop_words = pd.read_json('stopwords-id.json') # https://github.com/stopwords-iso/stopwords-id/tree/master

In [21]:
stop_words.columns = ['Words']
stop_words['Words'].head()
stopwords = stop_words['Words'].tolist()

In [22]:
def remove_stopwords(tweet):
    tweet_tokens = word_tokenize(tweet)
    filtered_tweets = [w for w in tweet_tokens if not w in stopwords]
    return " ".join(filtered_tweets)
print(df['Tweet'].iloc[0])
print(remove_stopwords(df['Tweet'].iloc[0]))

 di saat semua cowok berusaha melacak perhatian saya kamu lantas remehkan perhatian yang saya kasih khusus ke kamu basic kamu cowok bego 
cowok berusaha melacak perhatian lantas remehkan perhatian kasih khusus basic cowok bego


In [23]:
df.Tweet = df['Tweet'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,Tweet,Hate_Speech
0,cowok berusaha melacak perhatian lantas remehk...,1
2,41 kadang berpikir percaya tuhan jatuh berkali...,0
3,ku tau matamu sipit,0
4,kaum cebong kafir dongoknya dungu haha,1
5,ya bani taplak kawan kawan,1
6,deklarasi pilihan kepala daerah 2018 aman anti...,0
7,selesai re watch aldnoah zero kampret 2 karakt...,0
8,admin belanja port terbaik nak makan ais kepal...,0
9,enak ngewe,0
10,gue jari gue ukur nyali bacot,1


In [24]:
df['Hate_Speech'].value_counts()

0    7137
1    4929
Name: Hate_Speech, dtype: int64

Tokenizing data for word embedding

In [30]:
tokenized_data = df
tokenized_data['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in tokenized_data['Tweet']] 
print(tokenized_data['tokenized_text'].head(10))

0     [cowok, berusaha, melacak, perhatian, lantas, ...
2     [kadang, berpikir, percaya, tuhan, jatuh, berk...
3                              [ku, tau, matamu, sipit]
4         [kaum, cebong, kafir, dongoknya, dungu, haha]
5                      [ya, bani, taplak, kawan, kawan]
6     [deklarasi, pilihan, kepala, daerah, aman, ant...
7     [selesai, re, watch, aldnoah, zero, kampret, k...
8     [admin, belanja, port, terbaik, nak, makan, ai...
9                                         [enak, ngewe]
10                 [gue, jari, gue, ukur, nyali, bacot]
Name: tokenized_text, dtype: object


Training the Word2Vec model for word embedding

In [725]:
size = 1000
window = 3
min_count = 1
workers = 3
sg = 1

word2vec_model_file = 'F:\\Univ\Sem 4\\Research Methodology in Computer Science\\ML\\' + 'word2vec_' + str(size) + '.model'
start_time = time.time()
tokenized_tweets = pd.Series(tokenized_data['tokenized_text']).values
w2v_model = Word2Vec(tokenized_tweets, min_count=min_count, vector_size=size, workers=workers, window=window, sg=sg)
print(f"Time taken to train word2Vec model: {time.time() - start_time}")
w2v_model.save(word2vec_model_file)

Time taken to train word2Vec model: 10.836922883987427


In [25]:
#Run from here if you already trained the model and put the clean data into a csv file
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics import f1_score, accuracy_score
import time
from gensim.utils import simple_preprocess
import numpy as np
from sklearn.model_selection import train_test_split
from nlpaug.augmenter.word import SynonymAug
import nlpaug.augmenter.word as naw
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

Loading the model from the model file

In [26]:
size = 1000
window = 3
min_count = 1
workers = 3
sg = 1
word2vec_model_file = 'F:\\Univ\Sem 4\\Research Methodology in Computer Science\\ML\\' + 'word2vec_' + str(size) + '.model'
sg_w2v_model = Word2Vec.load(word2vec_model_file)
vocab = sg_w2v_model.wv.key_to_index
print(sg_w2v_model.wv.get_vector(vocab['indonesia']))

[ 1.78887889e-01  4.23283093e-02  1.43088102e-01  1.25633791e-01
 -5.86368814e-02 -1.22436456e-01  7.84967393e-02  9.91348550e-02
 -1.27535522e-01  2.76888628e-02  7.61692412e-03  7.96046183e-02
 -1.70560673e-01 -4.46393108e-03  1.17428355e-01  5.62819354e-02
  3.11888922e-02  1.50347417e-02  9.60466787e-02 -2.43760765e-01
  2.58212467e-03  7.29314983e-03  1.07549161e-01  3.23767662e-02
  1.76293086e-02 -5.37098125e-02  8.60500038e-02 -3.88684794e-02
 -2.11335257e-01  1.00229152e-01 -3.24914954e-03 -2.54530367e-02
  4.94705997e-02 -2.49173772e-02  1.42667562e-01  2.91130356e-02
  1.12307094e-01 -8.34547430e-02  4.22253311e-02 -2.45892987e-01
 -1.49299860e-01  7.34978616e-02 -1.77445978e-01  7.51134902e-02
 -1.30436078e-01  1.00485094e-01 -2.02671722e-01  1.57555908e-01
 -1.24444589e-01  1.36581987e-01 -6.22395650e-02 -6.22601807e-02
  1.13660924e-01 -1.70841068e-01  4.66413274e-02  8.70665833e-02
  5.06300405e-02 -5.64831495e-03 -1.10396817e-01 -1.11593334e-02
 -2.35627249e-01 -3.09876

Function to turn the words in the sentence into vectors

In [27]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [sg_w2v_model.wv[word] for word in words if word in sg_w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(1000)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

Loading the clean data

In [28]:
df = pd.read_csv('cleaned_tweet.csv')

Split data into testing data and training data

In [29]:
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Hate_Speech'], test_size=0.2, random_state=42)

Oversampling of training data

In [30]:
d = {'Tweet':[str(i) for i in X_train], 'Hate_Speech':[i for i in y_train]}
train_data = pd.DataFrame(data=d)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9652 entries, 0 to 9651
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Tweet        9652 non-null   object
 1   Hate_Speech  9652 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 150.9+ KB


In [31]:
train_data['Hate_Speech'].value_counts() #Shows imbalance in the training data

0    5668
1    3984
Name: Hate_Speech, dtype: int64

In [32]:
train_data.tail()

Unnamed: 0,Tweet,Hate_Speech
9647,kadar penderitaan orang beda beda komparasi me...,0
9648,khusus pemilihan berharap 2 calon presiden sit...,0
9649,teras saksi bisu ya maddie duh khawatir dicka nih,0
9650,lebaran ya kang,0
9651,sah survei terbaru mayoritas warga jokowi pres...,1


In [33]:
def augment_text(df,samples,pr=0.2):
    aug = naw.SynonymAug(aug_src='wordnet',lang='ind')

    new_text=[]

    df_n=df[df["Hate_Speech"]==0].reset_index(drop=True)

    for i in np.random.randint(0,len(df_n),samples):
        text = df_n.iloc[i]['Tweet']
        augmented_text = aug.augment(text)
        new_text.append(augmented_text)

    new=pd.DataFrame({'Tweet':new_text,'Hate_Speech':1})
    df = pd.concat([df, new],axis=0).reset_index(drop=True)
    return df
max_size = train_data['Hate_Speech'].value_counts().max() - train_data['Hate_Speech'].value_counts().min()
train = augment_text(train_data, max_size)

In [34]:
train.tail(20)

Unnamed: 0,Tweet,Hate_Speech
11316,[batalyon buta nilai buta hati salam jumat peo...,1
11317,[media sosial bilang bom kemarin fiksi contoh ...,1
11318,[barusan saudara jarang ketemu mampir kediaman...,1
11319,[sesungguhnya percintaan tunanetra melumpuhkan...,1
11320,[untung untungan mengakui komunis mengakui ateis],1
11321,[pekikan asing asing ya],1
11322,[pengalaman urus surat administrasi pernikahan...,1
11323,[seram om bud sih prabowo republik indonesia 1...,1
11324,[kedaerahan jimat khusus tarik investor pengem...,1
11325,[presiden joko widodo memutuskan realisitis me...,1


In [35]:
#Since the oversampled data has brackets, we need to remove it first
def remove_brackets(df):
    new_df = df.copy()
    new_df['Tweet'] = new_df['Tweet'].apply(lambda x: " ".join(str(y) for y in x) if isinstance(x, list) else x)
    return new_df
train = remove_brackets(train)

In [36]:
train.tail()

Unnamed: 0,Tweet,Hate_Speech
11331,bacot doang berani,1
11332,sontak terkejut menyergap untungnya pelajaran ...,1
11333,kursi republik indonesia 1 kursi listrik calon...,1
11334,waria sih harganya bencong pegang kelulusan te...,1
11335,gubernur indonesia woooow salah putra terbaik ...,1


In [37]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11336 entries, 0 to 11335
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Tweet        11336 non-null  object
 1   Hate_Speech  11336 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 177.3+ KB


In [38]:
train['Hate_Speech'].value_counts() #Now the training data is balanced

1    5668
0    5668
Name: Hate_Speech, dtype: int64

In [39]:
train.tail()

Unnamed: 0,Tweet,Hate_Speech
11331,bacot doang berani,1
11332,sontak terkejut menyergap untungnya pelajaran ...,1
11333,kursi republik indonesia 1 kursi listrik calon...,1
11334,waria sih harganya bencong pegang kelulusan te...,1
11335,gubernur indonesia woooow salah putra terbaik ...,1


In [40]:
train = train.sample(frac=1).reset_index(drop=True) #Shuffle the data

In [41]:
train.tail()

Unnamed: 0,Tweet,Hate_Speech
11331,kayak tai era mas jokowi,1
11332,henry tampil konser amal masyarakat cina,0
11333,ateis mengenal tuhan,0
11334,mimpi gue bangkai,0
11335,salah memilih gubernur diusung isu sara,1


In [42]:
train.isna().any() #Check if there are any null values, just in case

Tweet          False
Hate_Speech    False
dtype: bool

In [43]:
print(train.iloc[351])

Tweet          komunisme nya cina kapitalisme nya us pancasil...
Hate_Speech                                                    1
Name: 351, dtype: object


In [44]:
train.to_csv('train_oversampled.csv')

In [45]:
#Re-assign the training data variable
X_train = train['Tweet']
y_train = train['Hate_Speech']

In [46]:
#Vectorize the data
X_train = np.array([vectorize(str(sentence)) for sentence in X_train])
X_test = np.array([vectorize(str(sentence)) for sentence in X_test])

Training ML models

In [47]:
#Logistic Regression Model
clf_lr = LogisticRegression(random_state=42)
clf_lr.fit(X_train, y_train)

In [48]:
#Logistic Regression Score
y_pred = clf_lr.predict(X_test)
print(classification_report(y_test, y_pred))
print(classification_report(y_train, clf_lr.predict(X_train)))

              precision    recall  f1-score   support

           0       0.81      0.74      0.78      1469
           1       0.65      0.73      0.69       945

    accuracy                           0.74      2414
   macro avg       0.73      0.74      0.73      2414
weighted avg       0.75      0.74      0.74      2414

              precision    recall  f1-score   support

           0       0.64      0.73      0.68      5668
           1       0.69      0.59      0.64      5668

    accuracy                           0.66     11336
   macro avg       0.66      0.66      0.66     11336
weighted avg       0.66      0.66      0.66     11336



In [49]:
#Decision Tree Model
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(X_train, y_train)

In [50]:
#Decision Tree Score
y_pred = clf_dt.predict(X_test)
print(classification_report(y_test, y_pred))
print(classification_report(y_train, clf_dt.predict(X_train)))

              precision    recall  f1-score   support

           0       0.77      0.66      0.71      1469
           1       0.56      0.69      0.62       945

    accuracy                           0.67      2414
   macro avg       0.66      0.67      0.66      2414
weighted avg       0.69      0.67      0.67      2414

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      5668
           1       0.99      0.94      0.97      5668

    accuracy                           0.97     11336
   macro avg       0.97      0.97      0.97     11336
weighted avg       0.97      0.97      0.97     11336



In [51]:
#SVM Model
clf_svm = svm.SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train, y_train)

In [52]:
#SVM Score
y_pred = clf_svm.predict(X_test)
print(classification_report(y_test, y_pred))
print(classification_report(y_train, clf_svm.predict(X_train)))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79      1469
           1       0.68      0.64      0.66       945

    accuracy                           0.74      2414
   macro avg       0.73      0.72      0.73      2414
weighted avg       0.74      0.74      0.74      2414

              precision    recall  f1-score   support

           0       0.62      0.81      0.70      5668
           1       0.72      0.51      0.60      5668

    accuracy                           0.66     11336
   macro avg       0.67      0.66      0.65     11336
weighted avg       0.67      0.66      0.65     11336



In [53]:
#KNN Model
clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn.fit(X_train, y_train)

In [54]:
#KNN Score
y_pred = clf_knn.predict(X_test)
print(classification_report(y_test, y_pred))
print(classification_report(y_train, clf_knn.predict(X_train)))

              precision    recall  f1-score   support

           0       0.79      0.65      0.71      1469
           1       0.57      0.73      0.64       945

    accuracy                           0.68      2414
   macro avg       0.68      0.69      0.68      2414
weighted avg       0.71      0.68      0.69      2414

              precision    recall  f1-score   support

           0       0.77      0.80      0.78      5668
           1       0.79      0.76      0.78      5668

    accuracy                           0.78     11336
   macro avg       0.78      0.78      0.78     11336
weighted avg       0.78      0.78      0.78     11336



In [55]:
#Random Forest Model
# clf_rf = RandomForestClassifier(random_state=69, bootstrap=True, max_depth=50, max_features='log2', min_samples_split=11, n_estimators=2000, n_jobs=40, min_samples_leaf=3, max_leaf_nodes=7, criterion='log_loss')
# clf_rf = RandomForestClassifier(random_state=420, n_jobs=200, max_depth=37, max_features='sqrt', bootstrap=True, min_samples_leaf=7, criterion='log_loss', n_estimators=1000, max_leaf_nodes=20, min_samples_split=11)
clf_rf = RandomForestClassifier(random_state=42, n_jobs=200)
clf_rf.fit(X_train, y_train)

In [56]:
#Random Forest Score
y_pred = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(classification_report(y_train, clf_rf.predict(X_train)))

              precision    recall  f1-score   support

           0       0.82      0.74      0.78      1469
           1       0.65      0.75      0.70       945

    accuracy                           0.75      2414
   macro avg       0.74      0.75      0.74      2414
weighted avg       0.76      0.75      0.75      2414

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      5668
           1       0.96      0.97      0.97      5668

    accuracy                           0.97     11336
   macro avg       0.97      0.97      0.97     11336
weighted avg       0.97      0.97      0.97     11336



Cross Evaluation with other dataset (Translated Dataset)
To make it balanced, we decided to try on 500 labeled '0' data and 500 labeled '1' data for both cross evaluations

In [57]:
trans_df = pd.read_csv('translated_dataset.csv')

In [58]:
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        2556 non-null   int64 
 1   tweet             2556 non-null   object
 2   Hate_Speech       2556 non-null   int64 
 3   translated_tweet  2549 non-null   object
 4   tokenized_text    2556 non-null   object
dtypes: int64(2), object(3)
memory usage: 100.0+ KB


In [59]:
negative_trans_df = trans_df[trans_df['Hate_Speech'] == 0].sample(500, random_state=42)
positive_trans_df = trans_df[trans_df['Hate_Speech'] == 1].sample(500, random_state=42)

In [60]:
negative_trans_df.head()

Unnamed: 0.1,Unnamed: 0,tweet,Hate_Speech,translated_tweet,tokenized_text
409,409,i criticise australia more than most but it...,0,Saya mengkritik Australia kebanyakan shithole t,"['saya', 'mengkritik', 'australia', 'kebanyaka..."
841,841,probably the same reason it's okay to be white,0,alasan -apa putih,"['alasan', 'apa', 'putih']"
2273,2273,like father like son happy birthday mon negro ...,0,ayah putra selamat ulang negro et mon filneul ...,"['ayah', 'putra', 'selamat', 'ulang', 'negro',..."
2535,2535,instead of saying immigrants are a a threat to...,0,Alih -alih imigran ancaman keamanan nasional m...,"['alih', 'alih', 'imigran', 'ancaman', 'keaman..."
2405,2405,bitch shut yo retarded ass up looking like a g...,0,jalang tutup yo retarded ass up nilai penuh ti...,"['jalang', 'tutup', 'yo', 'retarded', 'ass', '..."


In [61]:
positive_trans_df.head()

Unnamed: 0.1,Unnamed: 0,tweet,Hate_Speech,translated_tweet,tokenized_text
438,438,why would it be an automatic car in 1911 you t...,1,Mengapa mobil otomatis 1911 Anda total mongolo...,"['mengapa', 'mobil', 'otomatis', 'anda', 'tota..."
869,869,good morning thing thong,1,selamat pagi thong,"['selamat', 'pagi', 'thong']"
2311,2311,thanks shithole countries,1,Terima kasih negara -negara shithole,"['terima', 'kasih', 'negara', 'negara', 'shith..."
2543,2543,could be the via knows more than you do it wou...,1,Bisa via Anda lakukan kalinya ekstremis muslin,"['bisa', 'via', 'anda', 'lakukan', 'kalinya', ..."
2444,2444,what a twat,1,Sungguh twat,"['sungguh', 'twat']"


In [62]:
test_trans_df = pd.concat([negative_trans_df, positive_trans_df])
test_trans_df.head()

Unnamed: 0.1,Unnamed: 0,tweet,Hate_Speech,translated_tweet,tokenized_text
409,409,i criticise australia more than most but it...,0,Saya mengkritik Australia kebanyakan shithole t,"['saya', 'mengkritik', 'australia', 'kebanyaka..."
841,841,probably the same reason it's okay to be white,0,alasan -apa putih,"['alasan', 'apa', 'putih']"
2273,2273,like father like son happy birthday mon negro ...,0,ayah putra selamat ulang negro et mon filneul ...,"['ayah', 'putra', 'selamat', 'ulang', 'negro',..."
2535,2535,instead of saying immigrants are a a threat to...,0,Alih -alih imigran ancaman keamanan nasional m...,"['alih', 'alih', 'imigran', 'ancaman', 'keaman..."
2405,2405,bitch shut yo retarded ass up looking like a g...,0,jalang tutup yo retarded ass up nilai penuh ti...,"['jalang', 'tutup', 'yo', 'retarded', 'ass', '..."


In [63]:
test_trans_df = shuffle(test_trans_df)
test_trans_df.head()

Unnamed: 0.1,Unnamed: 0,tweet,Hate_Speech,translated_tweet,tokenized_text
1363,1363,negro if that ain't bars ink what is,1,negro tinta,"['negro', 'tinta']"
1046,1046,keep your wheels spic n span year round with t...,0,Simpan roda Anda spic n span tips trik,"['simpan', 'roda', 'anda', 'spic', 'span', 'ti..."
500,500,camerroonians don't do celebritiessome people...,1,Camerroonians Dont Do Celebritiessome Petugas ...,"['camerroonians', 'dont', 'do', 'celebritiesso..."
521,521,did this negro really call me a broke bitch be...,0,Apakah negro -benar menyebut pelacur patah ism...,"['apakah', 'negro', 'benar', 'menyebut', 'pela..."
1157,1157,"[""how do hoi polloi listen to that korea rat i...",0,Bagaimana Hoi Polloi Mendengarkan Korea Rat It...,"['bagaimana', 'hoi', 'polloi', 'mendengarkan',..."


In [64]:
test_trans_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1363 to 101
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   tweet             1000 non-null   object
 2   Hate_Speech       1000 non-null   int64 
 3   translated_tweet  997 non-null    object
 4   tokenized_text    1000 non-null   object
dtypes: int64(2), object(3)
memory usage: 46.9+ KB


In [72]:
X_test = test_trans_df['translated_tweet']
X_test = np.array([vectorize(str(sentence)) for sentence in X_test])
y_test = test_trans_df['Hate_Speech']

In [67]:
y_pred = clf_lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.84      0.64       500
           1       0.57      0.21      0.30       500

    accuracy                           0.53      1000
   macro avg       0.54      0.53      0.47      1000
weighted avg       0.54      0.53      0.47      1000



In [68]:
y_pred = clf_knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.59      0.56       500
           1       0.53      0.46      0.49       500

    accuracy                           0.53      1000
   macro avg       0.53      0.53      0.52      1000
weighted avg       0.53      0.53      0.52      1000



In [69]:
y_pred = clf_svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.91      0.66       500
           1       0.63      0.16      0.26       500

    accuracy                           0.53      1000
   macro avg       0.58      0.53      0.46      1000
weighted avg       0.58      0.53      0.46      1000



In [70]:
y_pred = clf_dt.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.69      0.58       500
           1       0.51      0.32      0.39       500

    accuracy                           0.51      1000
   macro avg       0.51      0.51      0.49      1000
weighted avg       0.51      0.51      0.49      1000



In [71]:
y_pred = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.66      0.59       500
           1       0.54      0.40      0.46       500

    accuracy                           0.53      1000
   macro avg       0.53      0.53      0.52      1000
weighted avg       0.53      0.53      0.52      1000

