In [1]:
import pandas as pd
import numpy as np


# 1:  READ DATA FILE & MAKE LIST OF EACH TYPE

In [2]:
path_slang = "datafile/abusive_slang_stemmed.txt"
path_religious = "datafile/abusive_religious_stemmed.txt"
path_personal_attack = "datafile/abusive_personal_attack_stemmed.txt"
path_antifaminism = "datafile/abusive_antifaminism_stemmed.txt"
path_political =  "datafile/abusive_political_stemmed.txt"


def read_file(path):
    sent =[]
    sent.clear()
    with open(path, "r",encoding="utf-8") as f:
        for line in f:
            sent.append(line)
    return sent
slang_sent = read_file(path_slang)
religious_sent = read_file(path_religious)
personal_attack_sent = read_file(path_personal_attack)
political_sent = read_file(path_political)
antifaminism_sent = read_file(path_antifaminism)

In [3]:

for i in range(3):
    print(religious_sent[i])

﻿শয়তান অনেক বড় আলেম  আবেদ ছিল

মাসুদ ছোট আলেম ছোট শয়তান

তুই যে ইহুদিদ পয়দা ক কুকু



# 2. REMOVE ALL BAD CHARECTER

In [4]:
import re
def digit_remover(text):
    text = re.sub(r'[\d]','',text)
    text = re.sub(r'[?\!\t\‛\’\(\)\.\।]+','',text)
    text = re.sub(r'[/\-\,]',' ',text)
    text = text.rstrip("\n\r")
    return text


In [5]:
print(digit_remover("হত্যাকাণ্ড/খারাপ,অদৃশ্য-কাল্পনিক,ধর্মের/ধার্মিকদের,0"))

হত্যাকাণ্ড খারাপ অদৃশ্য কাল্পনিক ধর্মের ধার্মিকদের 


In [6]:
def remover(sent):
    for i in range(len(sent)):
        sent[i] = digit_remover(sent[i])
    return sent
slang_sent = remover(slang_sent)
religious_sent = remover(religious_sent)
personal_attack_sent = remover(personal_attack_sent)
political_sent = remover(political_sent)
antifaminism_sent = remover(antifaminism_sent)


In [7]:
for i in range(5):
    print(political_sent[i])


﻿খালেদা ধ্বংস প্রতিক
ভাইয়ে সুনেন জেরকম নেন্ত্রী সেরকম বকতব্য পগল মুগি নেএরি
সবাই নাইছা হাছিনা আপা বাতাস দেন আপা খালি গরম লাগে
আওয়ামী লীগ মানেই পাগরেল দল
হাছিনা বসতি মহিলাদ চেয়ে  নিচু মন মানুষ


# ADD COLUMN AND LABEL EACH TYPE OF DATA

In [8]:
slang_df = pd.DataFrame({'col':slang_sent})
slang_df['label'] = 1
religious_df = pd.DataFrame({'col':religious_sent})
religious_df['label'] = 2
personal_attack_df = pd.DataFrame({'col':personal_attack_sent})
personal_attack_df['label'] = 3
political_df = pd.DataFrame({'col':political_sent})
political_df['label'] = 4
antifaminism_df = pd.DataFrame({'col':antifaminism_sent})
antifaminism_df['label'] = 5

In [9]:
#attack_df.head()

# CONCATENATE EACH TYPE OF DATA AND SUFFLE THEM

In [10]:
frames = [slang_df, religious_df,personal_attack_df,political_df,antifaminism_df]
result = pd.concat(frames,ignore_index=True)
#suffle the data frame
result = result.sample(frac=1)

In [11]:
#result.head(5)

# SPLIT DATA INTO TRAIN AND TEST SET

In [12]:
from sklearn.model_selection import train_test_split
X = result['col']
y = result['label']
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 1,test_size=.2)

In [13]:
#print(X_train.head())

In [14]:
result.shape

(2498, 2)

In [15]:
y.value_counts()

3    1051
1     419
2     415
5     338
4     275
Name: label, dtype: int64


#  REPRESENTING TEXT AS NUMERICAL DATA

In [16]:
# import and instantiate tfidfVECTORIZER (with the default parameters)
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(tokenizer=lambda a: a.split())

In [17]:
# learn the 'vocabulary' of the training data (occurs in-place)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [18]:
from imblearn.over_sampling import SMOTE
smote = SMOTE('minority')
X_sm,y_sm = smote.fit_sample(X_train_dtm,y_train)
print(X_sm.shape,y_sm.shape)

(2632, 5261) (2632,)


In [19]:
#vect.get_feature_names()

# APPLY MULTINOMIAL NAIVE BAYES ALGORITHM

In [20]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB(alpha=0.1)

In [21]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

Wall time: 11.9 ms


MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [22]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [23]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.642

In [24]:
input_sentence = ["ভাইজান কপালে সুধু অপু বুবলি মত ফ্লপ নায়িকা"]
input_sentence_dtm = vect.transform(input_sentence)

In [25]:
typo = nb.predict(input_sentence_dtm)
def get_result(result):
    if(typo==0):
        return ("slang sentence")
    
    elif(typo==1):
        return("religious_abusive_sentence")
    else:
        return("celebrity_abusive_sentence")

print(get_result(typo))

celebrity_abusive_sentence


# APPLY GRID SEARCH ON MULTINOMIAL NB

In [83]:
from sklearn.model_selection import GridSearchCV

In [84]:
alphas = [0.0001,0.001,0.01,1]
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(alpha= alphas)
print(param_grid)

{'alpha': [0.0001, 0.001, 0.01, 1]}


In [85]:
X_dtm = vect.fit_transform(X)
# instantiate and fit the grid
grid = GridSearchCV(nb, param_grid, cv=10, scoring='accuracy', return_train_score=False)
grid.fit(X_dtm, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.0001, 0.001, 0.01, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [86]:
# examine the best model
print(grid.best_score_)
print(grid.best_params_)

0.6285028022417934
{'alpha': 0.01}


# APPLY LINEAR SVC CLASSIFIER

In [26]:
from sklearn.svm import LinearSVC
svc = LinearSVC(multi_class='ovr',max_iter=800,tol=0.0001,random_state=1)


In [27]:
svc.fit(X_train_dtm, y_train)
y_pred_class_svc = svc.predict(X_test_dtm)

In [28]:
metrics.accuracy_score(y_test, y_pred_class_svc)

0.694

In [29]:
typo_svc = svc.predict(input_sentence_dtm)

def get_result(result):
    if(typo==0):
        return ("slang sentence")
    
    elif(typo==1):
        return("religious_abusive_sentence")
    else:
        return("celebrity_abusive_sentence")

print(get_result(typo_svc))

celebrity_abusive_sentence


In [30]:
# print the classification report
print(metrics.classification_report(y_test, y_pred_class_svc))

              precision    recall  f1-score   support

           1       0.63      0.42      0.51        90
           2       0.82      0.70      0.75        79
           3       0.64      0.85      0.73       204
           4       0.98      0.73      0.83        62
           5       0.62      0.55      0.59        65

   micro avg       0.69      0.69      0.69       500
   macro avg       0.74      0.65      0.68       500
weighted avg       0.71      0.69      0.69       500



In [109]:
svc.get_params

<bound method BaseEstimator.get_params of LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=800,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
     verbose=0)>

# APPLY GRID SEARCH ON LINEARSVC

In [110]:
from sklearn.model_selection import GridSearchCV

In [111]:
#penalties = ['l1','l2']
#losses = ['hinge','squared_hinge']
multi_classes = ['ovr','crammer_singer']
max_iters = [800,1000,1300]
tols = [0.0001,0.001,0.01]


In [112]:
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(multi_class = multi_classes ,max_iter=max_iters,tol=tols)
print(param_grid)

{'multi_class': ['ovr', 'crammer_singer'], 'max_iter': [800, 1000, 1300], 'tol': [0.0001, 0.001, 0.01]}


In [113]:
X_dtm = vect.fit_transform(X)
# instantiate and fit the grid
grid = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', return_train_score=False)
grid.fit(X_dtm, y)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=800,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
     verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'multi_class': ['ovr', 'crammer_singer'], 'max_iter': [800, 1000, 1300], 'tol': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [117]:
# view the results
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.682946,0.02481,"{'max_iter': 800, 'multi_class': 'ovr', 'tol':..."
1,0.682946,0.02481,"{'max_iter': 800, 'multi_class': 'ovr', 'tol':..."
2,0.682546,0.024863,"{'max_iter': 800, 'multi_class': 'ovr', 'tol':..."
3,0.681345,0.030183,"{'max_iter': 800, 'multi_class': 'crammer_sing..."
4,0.681345,0.030183,"{'max_iter': 800, 'multi_class': 'crammer_sing..."
5,0.681345,0.030183,"{'max_iter': 800, 'multi_class': 'crammer_sing..."
6,0.682946,0.02481,"{'max_iter': 1000, 'multi_class': 'ovr', 'tol'..."
7,0.682946,0.02481,"{'max_iter': 1000, 'multi_class': 'ovr', 'tol'..."
8,0.682546,0.024863,"{'max_iter': 1000, 'multi_class': 'ovr', 'tol'..."
9,0.681345,0.030183,"{'max_iter': 1000, 'multi_class': 'crammer_sin..."


In [118]:
# examine the best model
print(grid.best_score_)
print(grid.best_params_)

0.6829463570856685
{'max_iter': 800, 'multi_class': 'ovr', 'tol': 0.0001}


# APPLY LOGISTIC REGRESSION

In [98]:
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()


In [99]:
logit.fit(X_train_dtm, y_train)
y_pred_class_logit = logit.predict(X_test_dtm)



In [100]:
metrics.accuracy_score(y_test, y_pred_class_logit)

0.582

In [101]:
typo_logit = logit.predict(input_sentence_dtm)
print(get_result(typo_logit))

celebrity_abusive_sentence


# APPLY RANDOM FOREST CLASSIFIER

In [102]:
from sklearn.ensemble import RandomForestClassifier
randomf = RandomForestClassifier(n_estimators=1000,max_depth=50)
randomf.fit(X_train_dtm, y_train)
y_pred_class_randomf = randomf.predict(X_test_dtm)

In [103]:
metrics.accuracy_score(y_test, y_pred_class_logit)

0.582

In [104]:
typo_randomf = logit.predict(input_sentence_dtm)
print(get_result(typo_randomf))

celebrity_abusive_sentence


# APPLY STOCHASTIC GRADIENT DECENT