**Dataset**
labeled datasset collected from twitter

**Objective**
classify tweets containing hate speech from other tweets.
0 -> no hate speech
1 -> contains hate speech

**Total Estimated Time = 90 Mins**

### Import Libraries

In [67]:
import pandas as pd

### Load Dataset

In [68]:
Data_set = pd.read_csv("dataset.csv")

### EDA

- check NaNs

In [69]:
Data_set.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

- check duplicates

In [70]:
Data_set[["tweet" ,"label"]].duplicated().sum()

2432

- show samples of data texts to find out required preprocessing steps

In [71]:
Data_set.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


- check dataset balancing

In [72]:
Data_set["label"].value_counts()

0    29720
1     2242
Name: label, dtype: int64

- Cleaning and Preprocessing are:
    - 1
    - 2
    - 3

### Cleaning and Preprocessing

In [73]:
duplicated_df = Data_set[Data_set[["tweet" ,"label"]].duplicated()]

In [74]:
Data_without_dup = Data_set.drop(duplicated_df.index)

In [75]:
pd.set_option('display.max_colwidth', 100000)

In [76]:
Data_without_dup.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [77]:
from sklearn.model_selection import train_test_split
Train_tweets , Test_tweets , Train_target , Test_target = train_test_split(Data_set["tweet"] 
                                                                           , Data_set["label"] , 
                                                                           test_size = 0.2
                                                                          ,stratify = Data_set["label"]
                                                                          , random_state = 42)

In [78]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions# load spacy model, can be "en_core_web_sm" as well
nlp = spacy.load('en_core_web_md')

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [79]:
# nltk.download('stopwords')
stop_words = stopwords.words('english')
for w in stop_words:
    nlp.vocab[w].is_stop = False

In [80]:
import re

In [81]:
def process_tweet(tweet):
    return " ".join(re.sub("([\@|\#][A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

def text_preprocessing(text):
    text = process_tweet(text)
    
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text(separator=" ")

    text = text.strip()
    text =  " ".join(text.split())

    text = unidecode.unidecode(text)

    text = contractions.fix(text)

    text = text.lower()

    doc = nlp(text) 
    clean_text = []
    
    for token in doc:
        flag = True
        edit = token.text
        
        if token.is_stop and token.pos_ != 'NUM': 
            flag = False
        if token.pos_ == 'PUNCT' and flag == True: 
            flag = False
        if token.pos_ == 'SYM' and flag == True: 
            flag = False
        if (token.pos_ == 'NUM' or token.text.isnumeric()) \
        and flag == True:
            flag = False
        if token.pos_ == 'NUM' and flag == True:
            edit = w2n.word_to_num(token.text)
        elif token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        if edit != "" and flag == True:
            clean_text.append(edit)        
    return clean_text

In [82]:
Train_tweets = pd.DataFrame(Train_tweets)
Train_target = pd.DataFrame(Train_target)

In [83]:
print(Train_tweets.shape , Train_target.shape)
Train_data = pd.concat([Train_tweets , Train_target] , axis=1)

(25569, 1) (25569, 1)


In [84]:
for i in range(Train_tweets.shape[0]):
    Train_data.iloc[i , 0] = ' '.join(text_preprocessing(Train_data.iloc[i , 0]))

In [85]:
# Train_tweets = Train_tweets.apply(lambd a x:' '.join(text_preprocessing(x)))
# Train_tweets.head()

**If it takes 60 Mins till here, you are doing Great** <br>
**If not! You also are doing Great**

### Modelling

In [86]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import eli5

In [87]:
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))
clf = LinearSVC()
pipe_tfidf = make_pipeline(vec, clf)
pipe_tfidf.fit(Train_data["tweet"], Train_data["label"])

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))),
                ('linearsvc', LinearSVC())])

#### Evaluation

In [88]:
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

In [89]:
print_report(pipe_tfidf, Test_tweets, Test_target)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5945
           1       0.88      0.44      0.59       448

    accuracy                           0.96      6393
   macro avg       0.92      0.72      0.78      6393
weighted avg       0.95      0.96      0.95      6393

accuracy: 0.957


In [90]:
eli5.show_weights(clf, vec=vec, top=20)



Weight?,Feature
+1.298,ism
+1.246,jew
+1.241,jew
+1.116,sexis
+1.082,sexi
+1.064,sexi
+1.060,rac
+1.060,onse
+1.056,uk
+1.048,aci


### Enhancement

- Using different N-grams
- Using different text representation technique

In [91]:
# from sklearn.preprocessing import LabelEncoder
# # from tensorflow.keras.utils import to_categorical

# encoder = LabelEncoder()
# y_encoded = encoder.fit_transform(Data_set['tweet'])
# y_encoded = to_categorical(y_encoded)

In [92]:
def get_longest_text(texts):
    longest_input = 0
    for text in texts:
        text_len= len(text.split())
        longest_input = max(longest_input, text_len)
    return longest_input

In [93]:
longest_input = get_longest_text(Data_set['tweet'])

In [94]:
import tensorflow as tf

# define the network
inputs = tf.keras.layers.Input((longest_input, 300))
reshaped = tf.keras.layers.Reshape((longest_input, 300, 1))(inputs)


filters = [2, 3, 4]

# define the conv net
conv_1 = tf.keras.layers.Conv2D(100, (filters[0], 300), activation='relu')(reshaped)
conv_2 = tf.keras.layers.Conv2D(100, (filters[1], 300), activation='relu')(reshaped)
conv_3 = tf.keras.layers.Conv2D(100, (filters[2], 300), activation='relu')(reshaped)

# define max-pooling
pool_1 = tf.keras.layers.MaxPooling2D((longest_input - filters[0] + 1, 1), strides=(1,1))(conv_1)
pool_2 = tf.keras.layers.MaxPooling2D((longest_input - filters[1] + 1, 1), strides=(1,1))(conv_2)
pool_3 = tf.keras.layers.MaxPooling2D((longest_input - filters[2] + 1, 1), strides=(1,1))(conv_3)

# concatenate the convs
merged_tensor = tf.keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)

# now flatten them and add a dense layer
flatten = tf.keras.layers.Flatten()(merged_tensor)

# add a dense layer
clf = tf.keras.layers.Dense(100, activation='relu')(flatten)

# add final output
clf = tf.keras.layers.Dense(len(encoder.classes_), activation='softmax')(clf)

In [95]:
model = tf.keras.models.Model(inputs, clf)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 34, 300)]    0           []                               
                                                                                                  
 reshape_4 (Reshape)            (None, 34, 300, 1)   0           ['input_5[0][0]']                
                                                                                                  
 conv2d_12 (Conv2D)             (None, 33, 1, 100)   60100       ['reshape_4[0][0]']              
                                                                                                  
 conv2d_13 (Conv2D)             (None, 32, 1, 100)   90100       ['reshape_4[0][0]']              
                                                                                            

In [96]:
tf.keras.utils.plot_model(
    model, show_shapes=False, show_layer_names=True,
    rankdir='TB', expand_nested=False, dpi=96
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [97]:
Train_data

Unnamed: 0,tweet,label
26247,do my order at black amp sexy from s collection,0
13681,it s there it s I m tell that andrew jackson be and,1
25676,video on have have of this crap,1
14544,after monaco a podium this time guy maybe,0
25411,wow open amateur hour on fox just the golf and be people walk in of commentator on air,0
...,...,...
15438,scratch my tweet jo sorry have borrow box tell I we have the complement,0
29797,that special moment where you have you ness,0
15613,kudo for be compliant,0
1955,attack bull game 3d do you think that his head be the city each si,0


In [98]:
print(Train_tweets["tweet"] , Train_target["label"])

26247                             did my first order at @user ð ... something black &amp; sexy from @user 's collection!   #cantwait  
13681                       @user @user ð it's there, it's @user ðalthough i'm told that andrew jackson was  and #cantankerous also
25676                                            @user video: @user on #hateful  #liberals "#america has had enough of this crap!"  @user
14544                                             @user @user after monaco a podium this time guys maybe? #mclarenhonda #canadiangp   #f1
25411    @user @user wow. us open amateur hour on fox. just show the golf and be quite! people walking in front of commentators on air?  
                                                                       ...                                                               
15438         @user scratch my last tweet, jo, sorry. someone had "borrowed" two boxes without telling me. we have the full complement!  
29797                             

In [118]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(Train_data["tweet"] , Train_data["label"] , test_size=.2)

In [119]:
# train our model
model.fit(x_train, y_train)



ValueError: in user code:

    File "C:\Users\Abdelrahman\anaconda3\lib\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Abdelrahman\anaconda3\lib\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Abdelrahman\anaconda3\lib\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Abdelrahman\anaconda3\lib\site-packages\keras\engine\training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Abdelrahman\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Abdelrahman\anaconda3\lib\site-packages\keras\layers\reshaping\reshape.py", line 111, in _fix_unknown_dimension
        raise ValueError(msg)

    ValueError: Exception encountered when calling layer "reshape_4" (type Reshape).
    
    total size of new array must be unchanged, input_shape = [1], output_shape = [34, 300, 1]
    
    Call arguments received by layer "reshape_4" (type Reshape):
      • inputs=tf.Tensor(shape=(None, 1), dtype=float32)


In [None]:
model.evaluate(x_test, y_test)

#### Done!