# Importing necessary libraries

In [1]:
import pandas as pd

import re

from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

# Loading data

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic
0,e617e2489abe9bca,"""\r\n\r\n A barnstar for you! \r\n\r\n The De...",0
1,9250cf637294e09d,"""\r\n\r\nThis seems unbalanced. whatever I ha...",0
2,ce1aa4592d5240ca,"Marya Dzmitruk was born in Minsk, Belarus in M...",0
3,48105766ff7f075b,"""\r\n\r\nTalkback\r\n\r\n Dear Celestia... """,0
4,0543d4f82e5470b6,New Categories \r\n\r\nI honestly think that w...,0


In [3]:
# Shape of data
train.shape

(5000, 3)

In [4]:
# Different value count of target variable
train.toxic.value_counts()

0    4563
1     437
Name: toxic, dtype: int64

Data is imbalanced with just 437 observations for toxic and 4563 observation for non-toxic comments.

# Getting the comments into a list for easy text cleanup and manipulation



In [5]:
comments = list(train.comment_text.values)

# Cleanup :



*   **IP addresses validation using regular expressions for removing IP addresses :**  
IP Address generally have two versions :  

> - IPV4  
> - IPV6


  
An IPv4 address consists of four numbers, each of which contains one to three digits, with a single dot (.) separating each number or set of digits. IPv4 IP address might look like:  

25.59.209.224  

An IPv6 address consists of eight groups of four hexadecimal digits. Here’s an example IPv6 address:

3001:0da8:75a3:0000:0000:8a2e:0370:7334

In [6]:
# regex for IPV4 IP addresses
[re.search('(\d+\.){1,3}\d+', IPadd) for IPadd in ['IP address 1 is 25.59.209.224', 'IP address 2 is 192.168.0.0', 
                                                 'IP address 3 is 192.168.255.255']]

[<re.Match object; span=(16, 29), match='25.59.209.224'>,
 <re.Match object; span=(16, 27), match='192.168.0.0'>,
 <re.Match object; span=(16, 31), match='192.168.255.255'>]

In [7]:
[re.search('(\d+\.){1,3}\d+', IPadd) for IPadd in comments]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(158, 172), match='71.127.137.171'>,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(42, 53), match='72.75.20.29'>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(61, 75), match='89.241.146.140'>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(61, 73), match='42.60.139.23'>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(384, 387), match='2.3'>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 No

In [8]:
# regex for IPV6 IP address
[re.search('([0-9a-fA-F]+\:){7}[0-9a-fA-F]+',IPadd) for IPadd in ['IP address 1 is 3001:0da8:75a3:0000:0000:8a2e:0370:7334',
                                                                 'IP address 2 is 2001:db8:0:0:0:0:2:1',
                                                                  'IP address 3 is FF01:0:0:0:0:0:0:1']]

[<re.Match object; span=(16, 55), match='3001:0da8:75a3:0000:0000:8a2e:0370:7334'>,
 <re.Match object; span=(16, 36), match='2001:db8:0:0:0:0:2:1'>,
 <re.Match object; span=(16, 34), match='FF01:0:0:0:0:0:0:1'>]

In [9]:
[re.search('([0-9a-fA-F]+\:){7}[0-9a-fA-F]+',sent) for sent in comments]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

Data does not have any IPV6 IP address and only IPV4 IP address is present.

*   **URL validation using regular expressions for removing URLs :**

In [10]:
# regex for URLs
[re.search('http[s]?://\S+',sent) for sent in comments]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(55, 94), match='http://en.wikipedia.org/wiki/Mutilation'>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(165, 229), match='http://spanky.thehawkeye.com/features/IAAP/breaki>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(241, 286), match='http://www.valerosos.com/PreludetoInchon.html'>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <re.Match object; span=(711, 801), match='http://eiga.wikia.com/wiki/Talk:Humanity_and_Pape>,
 None,
 None,
 None,
 None,
 None,
 None,
 Non

 *  Defining a function to :  
    
  - Using regular expressions, remove IP addresses
  - Using regular expressions, remove URLs
  - Normalize the casing
  - Tokenize using word_tokenize from NLTK
  - Remove stop words
  - Remove punctuation


In [11]:
def cleanup(text):
    no_ip=[re.sub('(\d+.){1,3}\d+','',sent) for sent in text]
    no_url=[re.sub('http[s]?://\S+','',sent) for sent in no_ip]
    
    # Normalising text
    norm_case = []
    for sent in no_url:
        norm_case.append(sent.lower())
        
    # Tokenising using word_tokenize using NLTK
    Word_Tokens = [word_tokenize(str(sent)) for sent in norm_case]
    
    stop_final = stopwords.words('english') + list(punctuation)+["'s","'t","'d","'ll","'re","'ve","'m","n't",'must']
    
    # Removing stopwords and punctuation marks
    No_sw = []
    for sent in Word_Tokens:
        No_sw.append([token for token in sent if token not in stop_final])
        
    Cleaned_text = [re.findall('\w+',str(token)) for token in No_sw]
        
    return Cleaned_text

In [12]:
clean_comments = cleanup(comments)



*   Using a counter to find the top terms in the data



In [13]:
term_list = []
for sent in clean_comments:
  term_list.extend(tokens for tokens in sent)

In [14]:
top_term = Counter(term_list)
top_term.most_common(20)

[('article', 1735),
 ('page', 1526),
 ('wikipedia', 1445),
 ('talk', 1230),
 ('please', 1042),
 ('would', 1009),
 ('ass', 990),
 ('fuck', 908),
 ('one', 881),
 ('like', 845),
 ('also', 658),
 ('see', 632),
 ('think', 630),
 ('know', 597),
 ('edit', 582),
 ('people', 553),
 ('use', 552),
 ('name', 542),
 ('may', 534),
 ('articles', 494)]

Contextual stop words - "article", "page", "wikipedia", "talk", "articles", "pages","edit"

In [15]:
Contextual_sw = ["article", "page", "wikipedia", "talk", "articles", "pages", "edit"]



*   Dropping contextual stop words


In [16]:
Cleaned_data = []
for sent in clean_comments:
  Cleaned_data.append([token for token in sent if token not in Contextual_sw])

In [17]:
# Top terms after removing contextual stop words
term_list = []
for sent in Cleaned_data:
  term_list.extend(tokens for tokens in sent)

top_term = Counter(term_list)
top_term.most_common(20)

[('please', 1042),
 ('would', 1009),
 ('ass', 990),
 ('fuck', 908),
 ('one', 881),
 ('like', 845),
 ('also', 658),
 ('see', 632),
 ('think', 630),
 ('know', 597),
 ('people', 553),
 ('use', 552),
 ('name', 542),
 ('may', 534),
 ('time', 489),
 ('thanks', 439),
 ('user', 429),
 ('even', 405),
 ('well', 403),
 ('could', 394)]

In [18]:
Cleaned_data = [" ".join(sent) for sent in Cleaned_data]

In [19]:
Cleaned_data[1]

'seems unbalanced whatever said mathsci said far extreme unpleasant things mention others much greater frequency happy reign like ruth told trying get mathsci pay attention stop uncivil would expect issue request mathsci intentionally unbalanced whatever reason please let know voluntarily close account move things like lot contribute way point contributing project editors administrative leave aggressively rude good editor really deserve people riding ass every time try certain things happily leave hands drama prone think best ludwigs2'

# Seperating data into train & test dataset

In [20]:
X = Cleaned_data
Y = train.toxic

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state=42)



*   Document term matrix using TF-IDF vectorizer


In [22]:
vectorizer = TfidfVectorizer(max_features = 4000)

In [23]:
len(X_train), len(X_test)

(3500, 1500)

In [24]:
X_train_bow = vectorizer.fit_transform(X_train)

In [25]:
X_test_bow = vectorizer.transform(X_test)

In [26]:
X_train_bow.shape, X_test_bow.shape

((3500, 4000), (1500, 4000))

# Model building: Support Vector Machine

In [27]:
SVM_Classifier = svm.SVC(kernel='linear')

In [28]:
SVM_Classifier.fit(X_train_bow, Y_train)

SVC(kernel='linear')

In [29]:
Y_train_pred = SVM_Classifier.predict(X_train_bow)

In [30]:
Y_train_pred[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [31]:
Y_test_pred = SVM_Classifier.predict(X_test_bow)

In [32]:
Y_test_pred[:5]

array([0, 0, 0, 0, 0], dtype=int64)

# Model evaluation: Accuracy, recall, and f1_score



*   Train data evaluation


In [33]:
print("Accuracy Score {}".format(accuracy_score(Y_train,Y_train_pred)))
print("Recall Score {}".format(recall_score(Y_train,Y_train_pred)))
print("F-1 Score {}".format(f1_score(Y_train,Y_train_pred)))

Accuracy Score 0.9685714285714285
Recall Score 0.6447368421052632
F-1 Score 0.7808764940239044


*   Test data evaluation




In [34]:
print("Accuracy Score {}".format(accuracy_score(Y_test,Y_test_pred)))
print("Recall Score {}".format(recall_score(Y_test,Y_test_pred)))
print("F-1 Score {}".format(f1_score(Y_test,Y_test_pred)))

Accuracy Score 0.9506666666666667
Recall Score 0.45112781954887216
F-1 Score 0.6185567010309279


Adjusting the appropriate parameter in the SVC module as the target variable is imbalanced

In [35]:
SVM_Classifier = svm.SVC(kernel='linear', class_weight="balanced")

In [36]:
SVM_Classifier.fit(X_train_bow, Y_train)

SVC(class_weight='balanced', kernel='linear')

In [37]:
Y_train_pred = SVM_Classifier.predict(X_train_bow)

In [38]:
Y_test_pred = SVM_Classifier.predict(X_test_bow)

In [39]:
# Train data evaluation after adjusting the parameters
print("Accuracy Score {}".format(accuracy_score(Y_train,Y_train_pred)))
print("Recall Score {}".format(recall_score(Y_train,Y_train_pred)))
print("F-1 Score {}".format(f1_score(Y_train,Y_train_pred)))

Accuracy Score 0.986
Recall Score 0.9835526315789473
F-1 Score 0.9242658423493044


In [40]:
# Test data evaluation after adjusting the parameters
print("Accuracy Score {}".format(accuracy_score(Y_test,Y_test_pred)))
print("Recall Score {}".format(recall_score(Y_test,Y_test_pred)))
print("F-1 Score {}".format(f1_score(Y_test,Y_test_pred)))

Accuracy Score 0.9393333333333334
Recall Score 0.6766917293233082
F-1 Score 0.6642066420664207


# Hyperparameter Tuning

In [41]:
# Create the parameter grid based on the results of random search 
Param_Grid = {
    'C': [0.1, 1, 10,1000, 10000, 100000]
}

In [42]:
SVM_Classifier = svm.SVC(random_state=42, class_weight="balanced", kernel="linear")

In [43]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [44]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = SVM_Classifier, param_grid = Param_Grid, 
                          cv = kfold, n_jobs = -1, verbose = 1, scoring = "recall" )

In [45]:
grid_search.fit(X_train_bow, Y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.0s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             estimator=SVC(class_weight='balanced', kernel='linear',
                           random_state=42),
             n_jobs=-1, param_grid={'C': [0.1, 1, 10, 1000, 10000, 100000]},
             scoring='recall', verbose=1)

In [46]:
# Identify the best parameters

final_clf = svm.LinearSVC(loss='hinge', C=grid_search.best_params_['C'])
final_clf.fit(X_train_bow, Y_train);

In [47]:
grid_search.best_estimator_

SVC(C=1000, class_weight='balanced', kernel='linear', random_state=42)

# Predicting and evaluating using the best estimator

In [48]:
Y_train_pred = final_clf.predict(X_train_bow)

In [49]:
# Train data evaluation after Hyperparameter tuning
print("Accuracy Score {}".format(accuracy_score(Y_train,Y_train_pred)))
print("Recall Score {}".format(recall_score(Y_train,Y_train_pred)))
print("F-1 Score {}".format(f1_score(Y_train,Y_train_pred)))

Accuracy Score 0.9985714285714286
Recall Score 0.9835526315789473
F-1 Score 0.9917081260364843


In [50]:
# Test data evaluation after Hyperparameter tuning
print("Accuracy Score {}".format(accuracy_score(Y_test,Y_test_pred)))
print("Recall Score {}".format(recall_score(Y_test,Y_test_pred)))
print("F-1 Score {}".format(f1_score(Y_test,Y_test_pred)))

Accuracy Score 0.9393333333333334
Recall Score 0.6766917293233082
F-1 Score 0.6642066420664207


# Most prominent terms in the toxic comments

In [51]:
Y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [52]:
toxic_comments = pd.Series(X_test)[Y_test_pred == 1].values

In [53]:
term_list = []
for comment in toxic_comments:
    term_list.extend(word_tokenize(comment))

In [54]:
Counter(term_list).most_common(15)

[('nigger', 184),
 ('die', 157),
 ('jim', 157),
 ('wales', 156),
 ('cuntbag', 126),
 ('fucking', 97),
 ('hate', 85),
 ('jews', 80),
 ('niggers', 80),
 ('spics', 79),
 ('minorities', 79),
 ('like', 24),
 ('go', 22),
 ('people', 20),
 ('fuck', 16)]

In [55]:
def toxic_filter(message):
    if final_clf.predict(vectorizer.transform([message])):
        return 'toxic'
    else:
        return 'not toxic'

In [56]:
toxic_filter("Shut up")

'toxic'