**Dataset**
labeled datasset collected from twitter

**Objective**
classify tweets containing hate speech from other tweets.
0 -> no hate speech
1 -> contains hate speech

**Total Estimated Time = 90 Mins**

### Import Libraries

In [99]:
import pandas as pd

### Load Dataset

In [100]:
Data_set = pd.read_csv("dataset.csv")

### EDA

- check NaNs

In [101]:
Data_set.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

- check duplicates

In [102]:
Data_set[["tweet" ,"label"]].duplicated().sum()

2432

- show samples of data texts to find out required preprocessing steps

In [103]:
Data_set.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


- check dataset balancing

In [104]:
Data_set["label"].value_counts()

0    29720
1     2242
Name: label, dtype: int64

- Cleaning and Preprocessing are:
    - 1
    - 2
    - 3

### Cleaning and Preprocessing

In [105]:
duplicated_df = Data_set[Data_set[["tweet" ,"label"]].duplicated()]

In [106]:
Data_without_dup = Data_set.drop(duplicated_df.index)

In [107]:
pd.set_option('display.max_colwidth', 100000)

In [108]:
Data_without_dup.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [109]:
from sklearn.model_selection import train_test_split
Train_tweets , Test_tweets , Train_target , Test_target = train_test_split(Data_set["tweet"] 
                                                                           , Data_set["label"] , 
                                                                           test_size = 0.2
                                                                          ,stratify = Data_set["label"]
                                                                          , random_state = 42)

In [110]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions# load spacy model, can be "en_core_web_sm" as well
nlp = spacy.load('en_core_web_md')

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [111]:
# nltk.download('stopwords')
stop_words = stopwords.words('english')
for w in stop_words:
    nlp.vocab[w].is_stop = False

In [116]:
import re

In [117]:
def process_tweet(tweet):
    return " ".join(re.sub("([\@|\#][A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

def text_preprocessing(text):
    text = process_tweet(text)
    
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text(separator=" ")

    text = text.strip()
    text =  " ".join(text.split())

    text = unidecode.unidecode(text)

    text = contractions.fix(text)

    text = text.lower()

    doc = nlp(text) 
    clean_text = []
    
    for token in doc:
        flag = True
        edit = token.text
        
        if token.is_stop and token.pos_ != 'NUM': 
            flag = False
        if token.pos_ == 'PUNCT' and flag == True: 
            flag = False
        if token.pos_ == 'SYM' and flag == True: 
            flag = False
        if (token.pos_ == 'NUM' or token.text.isnumeric()) \
        and flag == True:
            flag = False
        if token.pos_ == 'NUM' and flag == True:
            edit = w2n.word_to_num(token.text)
        elif token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        if edit != "" and flag == True:
            clean_text.append(edit)        
    return clean_text

In [113]:
Train_tweets = pd.DataFrame(Train_tweets)
Train_target = pd.DataFrame(Train_target)

In [114]:
print(Train_tweets.shape , Train_target.shape)
Train_data = pd.concat([Train_tweets , Train_target] , axis=1)

(25569, 1) (25569, 1)


In [118]:
for i in range(Train_tweets.shape[0]):
    Train_tweets.iloc[i , 0] = ' '.join(text_preprocessing(Train_tweets.iloc[i , 0]))

In [119]:
Train_tweets.head()

Unnamed: 0,tweet
26247,do my order at black amp sexy from s collection
13681,it s there it s I m tell that andrew jackson be and
25676,video on have have of this crap
14544,after monaco a podium this time guy maybe
25411,wow open amateur hour on fox just the golf and be people walk in of commentator on air


**If it takes 60 Mins till here, you are doing Great** <br>
**If not! You also are doing Great**

### Modelling

In [120]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import eli5

In [121]:
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))
clf = LinearSVC()
pipe_tfidf = make_pipeline(vec, clf)
pipe_tfidf.fit(Train_data["tweet"], Train_data["label"])

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))),
                ('linearsvc', LinearSVC())])

#### Evaluation

In [122]:
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

In [123]:
print_report(pipe_tfidf, Test_tweets, Test_target)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5945
           1       0.89      0.65      0.75       448

    accuracy                           0.97      6393
   macro avg       0.93      0.82      0.87      6393
weighted avg       0.97      0.97      0.97      6393

accuracy: 0.970


In [124]:
eli5.show_weights(clf, vec=vec, top=20)



Weight?,Feature
+1.953,â¦
+1.787,¦
+1.787,â¦
+1.656,ism
+1.488,white
+1.417,hite
+1.322,whit
+1.302,whit
+1.228,cis
+1.188,hit


### Enhancement

- Using different N-grams
- Using different text representation technique

#### Done!