In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns 
import math
import warnings

In [2]:
warnings.filterwarnings('ignore') # Hides warning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)

In [3]:
train = pd.read_csv('train.csv')
print("Training Set:"% train.columns, train.shape, len(train))
test = pd.read_csv('test.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


In [4]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [6]:
train.label.unique()

array([0, 1], dtype=int64)

In [7]:
train.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
test.isnull().sum()

id       0
tweet    0
dtype: int64

# Data Cleaning

In [9]:
train.tweet

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object

In [10]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


# stop_words = set(stopwords.words('english'))

# function for text cleaning 
def pre_process(text):
    text = re.sub("(\\d|\\W)+"," ",text)# remove special characters and digits
    text = re.sub('[^a-zA-Z]', ' ' , str(text))
    text = text.lower()
    text = text.strip()
    text = text.split() #Convert to list from string
    # remove stopwords
    all_stopwords = stopwords.words('english')
    text = [word for word in text if word not in set(all_stopwords)]
    # lemmatize
    lmtzr  = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    return ' '.join(text)

train_clean  = train['tweet'].apply(lambda x: pre_process(x))
test_clean  = test['tweet'].apply(lambda x: pre_process(x))

In [11]:
# import re
# def  clean_text(df, text_field):
#     df[text_field] = df[text_field].str.lower()
#     df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
#     return df
# test_clean = clean_text(test, "tweet")
# train_clean = clean_text(train, "tweet")

In [12]:
len(train_clean)

31962

In [13]:
len(test_clean)

17197

In [14]:
train_clean[0]

'user father dysfunctional selfish drag kid dysfunction run'

In [15]:
test_clean[0]

'studiolife aislife requires passion dedication willpower find newmaterials'

In [16]:
test['tweet'] = test_clean
test_clean.head()

0    studiolife aislife requires passion dedication...
1    user white supremacist want everyone see new b...
2     safe way heal acne altwaystoheal healthy healing
3    hp cursed child book reservation already yes h...
4    rd bihday amazing hilarious nephew eli ahmir u...
Name: tweet, dtype: object

In [17]:
train['tweet'] = train_clean

In [18]:
train

Unnamed: 0,id,label,tweet
0,1,0,user father dysfunctional selfish drag kid dys...
1,2,0,user user thanks lyft credit use cause offer w...
2,3,0,bihday majesty
3,4,0,model love u take u time ur
4,5,0,factsguide society motivation
...,...,...,...
31957,31958,0,ate user isz youuu
31958,31959,0,see nina turner airwave trying wrap mantle gen...
31959,31960,0,listening sad song monday morning otw work sad
31960,31961,1,user sikh temple vandalised calgary wso condem...


In [19]:
print(train.isnull().sum())
print(test.isnull().sum())

id       0
label    0
tweet    0
dtype: int64
id       0
tweet    0
dtype: int64


In [20]:
test.shape

(17197, 2)

# Handling imbalanced data using Resampling

In [21]:
# you can find that the tweets regarding hate speeches are comparatively lesser than others, 
# so this is a situation of an unbalanced data.
train.label.value_counts()#Proportion of Minority Class[1] is 7%

0    29720
1     2242
Name: label, dtype: int64

In [22]:
train_majority = train[train.label==0]
len(train_majority)

29720

In [23]:
from sklearn.utils import resample
#set the minority class to a seperate dataframe
train_minority = train[train.label==1]
train_majority = train[train.label==0]
#upsample the minority class
train_minority_upsampled = resample(train_minority ,
                                   replace=True,
                                   n_samples=len(train_majority),
                                   random_state=42)
#concatenate the upsampled dataframe
train_upsampled = pd.concat([train_minority_upsampled,train_majority])
train_upsampled['label'].value_counts()

0    29720
1    29720
Name: label, dtype: int64

In [24]:
train_upsampled

Unnamed: 0,id,label,tweet
12213,12214,1,free sticker stop sexism stopsexism womensrigh...
18858,18859,1,user thought assistant gladshesgone nohope
16237,16238,1,user user stupid idea trump fascist destroy us...
15641,15642,1,user allahsoil riyadh renowned deadliest traff...
23486,23487,1,user mar bar user charging guy entry girl like...
...,...,...,...
31956,31957,0,fishing tomorrow user carnt wait first time year
31957,31958,0,ate user isz youuu
31958,31959,0,see nina turner airwave trying wrap mantle gen...
31959,31960,0,listening sad song monday morning otw work sad


In [25]:
train_upsampled.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

# Creating a Pipeline

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [27]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

In [28]:
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', RandomForestClassifier()),])

In [29]:
pipeline_sgd

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('nb', SGDClassifier())])

In [43]:
pipeline_rf

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('nb', RandomForestClassifier())])

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'], train_upsampled['label'], test_size = 0.20, random_state = 42, shuffle=False)

In [31]:
print(train_upsampled.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(59440, 3)
(47552,)
(11888,)
(47552,)
(11888,)


# Training the Model

In [32]:
model_sgd = pipeline_sgd.fit(X_train, y_train)
y_predict = model_sgd.predict(X_test)

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_predict)
print(cm)
accuracy_score(y_test, y_predict)

[[10996   892]
 [    0     0]]


0.9249663526244953

In [34]:
model_rf = pipeline_rf.fit(X_train, y_train)
y_predict = model_rf.predict(X_test)

In [35]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_predict)
print(cm)
accuracy_score(y_test, y_predict)

[[11431   457]
 [    0     0]]


0.9615578734858681

In [36]:
X_test_df = test['tweet']

In [37]:
len(test['tweet'])

17197

In [38]:
#Test data prediction 
X_test_df = X_test_df[:11888]
len(X_test_df)

11888

In [39]:
y_pred = model_sgd.predict(X_test_df)

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[10354  1534]
 [    0     0]]


0.8709623149394348

In [41]:
y_pred = model_rf.predict(X_test_df)

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[10830  1058]
 [    0     0]]


0.9110026917900403