In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import string 
import re # for regular expressions

import nltk #for text manipulation 
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords
stop=stopwords.words('english')
stop.remove('not')

from wordcloud import WordCloud
import seaborn as sns 
import matplotlib.pyplot as plt 

pd.set_option("display.max_colwidth", 200) 
%matplotlib inline

from tqdm import tqdm
import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, auc, roc_curve
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

### 1 - Load Data

In [2]:
def load_csv(csv_file_name):
    
    data = pd.read_csv(csv_file_name)
    
    return data

In [3]:
train_path = '../../../../input/twitters/train_E6oV3lV.csv'
train= load_csv(csv_file_name = train_path)
test_path = '../../../../input/twitters/test_tweets_anuFYb8.csv'
test= load_csv(csv_file_name = test_path)

### 2 - Feature engineering

In [4]:
def avg_word(sentence):
    
    words=sentence.split()
    
    return (sum(len(word) for word in words)/len(words))

In [7]:
def raw_data_feature_engineering(data):
    
    data['word_count']=data['tweet'].apply(lambda x:len(str(x).split(" ")))
    data['char_count']=data['tweet'].str.len()
    data['avg_word']=data['tweet'].apply(lambda x:avg_word(x))
    data['stopwords']=data['tweet'].apply(lambda sen:len([x for x in sen.split() if x in stop]))
    data['hashtags']=data['tweet'].apply(lambda sen:len([x for x in sen.split() if x.startswith("#")]))
    data['numerics']=data['tweet'].apply(lambda sen:len([x for x in sen.split() if x.isdigit()]))
    data['upper']=data['tweet'].apply(lambda sen:len([x for x in sen.split() if x.isupper()]))
    
    return data

In [8]:
train = raw_data_feature_engineering(train)
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,stopwords,hashtags,numerics,upper,avg_word
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,21,102,10,1,0,0,4.555556
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,22,122,5,3,0,0,5.315789
2,3,0,bihday your majesty,5,21,1,0,0,0,5.666667
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,17,86,5,1,0,0,4.928571
4,5,0,factsguide: society now #motivation,8,39,1,1,0,0,8.0


### 3 - Data Cleaning

In [9]:
def process_tweet(tweet):
    
    processed_tweet = []
    
    # Convert to lower case
    tweet = tweet.lower()
    
    # Remove @handle 
    tweet = re.sub(r'@[\S]+', '', tweet)
    
    # Remove URLs
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', '', tweet)
    
    # Remove Punctuations, Numbers, and Special Characters
    tweet = re.sub(r'[^A-Za-z#]',' ',tweet)
    
    # Remove Stop words
    tweet = " ".join(word for word in tweet.split() if word not in stop)
    
    # Stemming and lemme the word
    st=PorterStemmer()
    wordnet=WordNetLemmatizer()
    tweet=" ".join(st.stem(i) for i in tweet.split())
    tweet=" ".join(wordnet.lemmatize(i) for i in tweet.split())  
    
    return tweet

In [10]:
%%time
train['tidy_tweet'] = train['tweet'].apply(lambda x: process_tweet(x))

CPU times: user 9.69 s, sys: 111 ms, total: 9.8 s
Wall time: 9.85 s


In [11]:
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,stopwords,hashtags,numerics,upper,avg_word,tidy_tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,21,102,10,1,0,0,4.555556,father dysfunct selfish drag kid dysfunct #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,22,122,5,3,0,0,5.315789,thank #lyft credit use caus offer wheelchair van pdx #disapoint #getthank
2,3,0,bihday your majesty,5,21,1,0,0,0,5.666667,bihday majesti
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,17,86,5,1,0,0,4.928571,#model love u take u time ur
4,5,0,factsguide: society now #motivation,8,39,1,1,0,0,8.0,factsguid societi #motiv


### 4 - Data Visulisation 

In [12]:
def word_cloud(data, sample):
    
    '''
    
    Visulisation - word cloud
    
    '''
    
    all_words = ' '.join([text for text in data['tidy_tweet']]) 
    wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, background_color='white').generate(all_words) 
    plt.figure(figsize=(10, 7)) 
    plt.imshow(wordcloud, interpolation="bilinear") 
    plt.axis('off')
    # plt.show()
    plt.savefig("../plots/word_cloud_{}.png".format(sample))
    plt.close()

In [13]:
word_cloud(train, 'all')
word_cloud(train[train.label==1], 'racist')
word_cloud(train[train.label==0], 'normal')

In [14]:
def hashtag_extract(x):
    
    '''  
    function to collect hashtags
    '''
    
    hashtags = []    # Loop over the words in the tweet
    for i in x:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    return hashtags



In [15]:
def hashtag(data, sample):
    
    '''
    Visulisation - Hash Tag
    '''
    
    ht = hashtag_extract(data['tidy_tweet'])
    ht = sum(ht,[]) 
    
    a = nltk.FreqDist(ht)
    d = pd.DataFrame(
        {
        'Hashtag': list(a.keys()),
        'Count': list(a.values())
        }
    ) 
    # selecting top 20 most frequent hashtags
    d = d.nlargest(columns="Count", n = 20)
    plt.figure(figsize=(20,5))
    ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
    ax.set(ylabel = 'Count')
    # plt.show()
    plt.savefig("../plots/hashtag_{}.png".format(sample))
    plt.close()
    

In [16]:
hashtag(train, 'all')
hashtag(train[train.label==1], 'racist')
hashtag(train[train.label==0], 'regular')

### 5 - Feature Extraction 

In [17]:
def feature_extraction(data, method):
    
    if method == 'bow':
        print('Feture Extraction Method', method)
        bow = CountVectorizer(min_df=2, max_features=1000,lowercase=True, ngram_range=(1,1),analyzer = "word")
        bow.fit(data['tidy_tweet'])
        final = bow.transform(data['tidy_tweet']).toarray()
        print('Feature name == ',bow.get_feature_names()[:5])
        print('Number of uniqe words', final.shape[1])
        print('Shape', final.shape)
        final = pd.DataFrame(final)
        
    elif method == 'tf-idf':
        print('Feture Extraction Method', method)
        tfidf = TfidfVectorizer(ngram_range=(1, 2),min_df=2,max_features=1000, lowercase=True, analyzer='word',stop_words= 'english')
        tfidf.fit(data['tidy_tweet'])
        final=tfidf.transform(data['tidy_tweet']).toarray()
        print('Number of uniqe words',final.shape[1])
        print('Shape',final.shape)
        final=pd.DataFrame(final)
        
    elif method == 'word2vc':
        print('Feture Extraction Method', method)
        tokenize = data['tidy_tweet'].apply(lambda x: x.split())
        w2vec_model = gensim.models.Word2Vec(
            tokenize,
            min_count = 1, 
            size = 100, 
            window = 5, 
            sg = 1,
            hs = 0,
            seed = 34)
        w2vec_model.train(tokenize,total_examples= len(data['tidy_tweet']),epochs=20)
        w2v_words = list(w2vec_model.wv.vocab)
        print("Number of words that occured minimum 5 times ",len(w2v_words))
        print("Sample words ", w2v_words[0:10])
        
        # create a vector for each tweet by taking the average of the vectors of the words present in the tweet.
        vector=[]
        for sent in tqdm(tokenize):
            sent_vec=np.zeros(100)
            count =0
            for word in sent: 
                if word in w2v_words:
                    vec = w2vec_model.wv[word]
                    sent_vec += vec 
                    count += 1
            if count != 0:
                sent_vec /= count #normalize
            vector.append(sent_vec)
        print(len(vector))
        print(len(vector[0]))  
        print('Number of uniqe words',len(vector[1]))
        final=pd.DataFrame(vector)
    
    cols = [x for x in data.columns if x not in ['id', 'tweet', 'tidy_tweet', 'label']] + ['label']
    final[cols] = data[cols]
    
    return final

    
    

In [18]:
bow_train = feature_extraction(train, method = 'bow')
bow_train.head()

Feture Extraction Method bow
Feature name ==  ['abl', 'absolut', 'accept', 'account', 'act']
Number of uniqe words 1000
Shape (31962, 1000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,998,999,word_count,char_count,stopwords,hashtags,numerics,upper,avg_word,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,21,102,10,1,0,0,4.555556,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,22,122,5,3,0,0,5.315789,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,5,21,1,0,0,0,5.666667,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,17,86,5,1,0,0,4.928571,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,8,39,1,1,0,0,8.0,0


In [19]:
tfidf_train = feature_extraction(train, method = 'tf-idf')
tfidf_train.head()

Feture Extraction Method tf-idf
Number of uniqe words 1000
Shape (31962, 1000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,998,999,word_count,char_count,stopwords,hashtags,numerics,upper,avg_word,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,21,102,10,1,0,0,4.555556,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,22,122,5,3,0,0,5.315789,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5,21,1,0,0,0,5.666667,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,17,86,5,1,0,0,4.928571,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,8,39,1,1,0,0,8.0,0


In [20]:
w2v_train = feature_extraction(train, method = 'word2vc')
w2v_train.head()

Feture Extraction Method word2vc


  3%|▎         | 1067/31962 [00:00<00:05, 5565.31it/s]

Number of words that occured minimum 5 times  36836
Sample words  ['father', 'dysfunct', 'selfish', 'drag', 'kid', '#run', 'thank', '#lyft', 'credit', 'use']


100%|██████████| 31962/31962 [00:37<00:00, 855.74it/s] 


31962
100
Number of uniqe words 100


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,word_count,char_count,stopwords,hashtags,numerics,upper,avg_word,label
0,-0.139549,-0.44841,-0.120978,-0.138524,-0.139514,0.132049,0.339232,-0.226315,-0.149572,-0.244663,...,0.232095,-0.037663,21,102,10,1,0,0,4.555556,0
1,-0.019768,-0.139032,-0.146901,-0.140401,-0.033111,0.015446,0.267217,0.186892,0.035876,0.050137,...,0.182768,0.090415,22,122,5,3,0,0,5.315789,0
2,0.135679,0.130832,-0.60935,0.060189,0.146132,-0.404244,-0.260429,0.026808,0.40526,0.046722,...,-0.673974,0.597948,5,21,1,0,0,0,5.666667,0
3,-0.283187,-0.464696,0.070417,0.398729,-0.153747,0.28986,0.484309,0.12335,-0.748885,0.234946,...,-0.569697,0.672804,17,86,5,1,0,0,4.928571,0
4,-0.170669,-0.233206,-0.072722,0.078938,-0.456979,0.290107,0.283605,-0.264674,0.113215,0.049324,...,0.234046,-0.2411,8,39,1,1,0,0,8.0,0


### 6 - Split data

In [21]:
x = bow_train.iloc[:,0:-1]
y = bow_train['label']
x_train_bow, x_val_bow, y_train_bow, y_val_bow = train_test_split(x,y,test_size=0.2)

In [22]:
x=tfidf_train.iloc[:,0:-1]
y=tfidf_train['label']
x_train_tfidf, x_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(x,y,test_size=0.2)

In [23]:
x = w2v_train.iloc[:,0:-1] 
y = w2v_train['label']
x_train_w2v,x_val_w2v,y_train_w2v,y_val_w2v = train_test_split(x,y,test_size=0.2)

### 7 - Model Selection

In [24]:
def f1_score_(y_proba,y_test):
    
    proba = y_proba[:,1] >= 0.3
    proba = proba.astype(np.int) 
    
    return f1_score( proba,y_test)  

#### KNN + BOW

In [26]:
k=[3,5,7]
accuracy=[]
for i in k:
    model=KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train_bow,y_train_bow)
    y_pred=model.predict(x_val_bow)
    acc=accuracy_score(y_pred,y_val_bow)
    print('for k=',i,'Accuracy Score',acc)
    accuracy.append(acc)
    y_proba=model.predict_proba(x_val_bow)
    f1_scor=f1_score_(y_proba,y_val_bow)
    print('for k=',i,'f1 score ',f1_scor)

for k= 3 Accuracy Score 0.9200688252776474
for k= 3 f1 score  0.2486126526082131
for k= 5 Accuracy Score 0.9341467229782575
for k= 5 f1 score  0.2876847290640394
for k= 7 Accuracy Score 0.9364930392616925
for k= 7 f1 score  0.30939226519337015


#### KNN + TFIDF

In [27]:
#use tfidf
k=[3,5,11]
accuracy_tfidf=[]
for i in k:
    model=KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train_tfidf,y_train_tfidf)
    y_pred=model.predict(x_val_tfidf)
    acc=accuracy_score(y_pred,y_val_tfidf)
    print('for k=',i,'Accuracy Score',acc)
    accuracy_tfidf.append(acc)
    y_proba=model.predict_proba(x_val_tfidf)
    f1_scor=f1_score_(y_proba,y_val_tfidf)
    print('for k=',i,'f1 score ',f1_scor)

for k= 3 Accuracy Score 0.9292976693258251
for k= 3 f1 score  0.25444596443228457
for k= 5 Accuracy Score 0.9339903018926952
for k= 5 f1 score  0.2627737226277372
for k= 11 Accuracy Score 0.9368058814328172
for k= 11 f1 score  0.23923444976076552


#### KNN + Word2Vec

In [28]:
k=[3,5,11]
accuracy_w2v=[]
for i in k:
    model=KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train_w2v,y_train_w2v)
    y_pred=model.predict(x_val_w2v)
    acc=accuracy_score(y_pred,y_val_w2v)
    print('for k=',i,'Accuracy Score',acc)
    accuracy_w2v.append(acc)
    y_proba=model.predict_proba(x_val_w2v)
    f1_scor=f1_score_(y_proba,y_val_w2v)
    print('for k=',i,'f1 score ',f1_scor)

for k= 3 Accuracy Score 0.9347724073205068
for k= 3 f1 score  0.35319767441860467
for k= 5 Accuracy Score 0.9407164085718754
for k= 5 f1 score  0.40644361833952913
for k= 11 Accuracy Score 0.9416549350852494
for k= 11 f1 score  0.40566037735849053


#### XGB + word2vec

In [30]:
%%time
xgb = XGBClassifier(max_depth=6, n_estimators=1000, nthread= 3).fit(x_train_w2v, y_train_w2v) 
prediction = xgb.predict(x_val_w2v)
print('F1 score: ', f1_score(y_val_w2v, prediction))
print('Accuracy score: ', accuracy_score(y_val_w2v, prediction))
fpr, tpr, _ = roc_curve(y_val_w2v, prediction)
print('AUC: ', auc(fpr, tpr))

F1 score:  0.6823855755894591
Accuracy score:  0.9641795714062256
AUC:  0.7811154448358777
CPU times: user 6min 35s, sys: 1.92 s, total: 6min 37s
Wall time: 3min 43s
