## Load the libraries which will be used in this project

In [None]:
import re  # for the regular expressions
import nltk  # for text manipulation
import string  
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Silence DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
# Load the labelled training tweets and the unlabelled test tweets
# from CSV files in the working directory.
train = pd.read_csv("train_E6oV3lV.csv")
test = pd.read_csv("test_tweets_anuFYb8.csv")

## Data Ingestion phase of the cleaning process of text data

In [None]:
# Preview the first five rows of the training data.
train.head(5)

In [None]:
# First ten training rows whose label is 0.
train[train['label'] == 0].head(10)

In [None]:
# First ten training rows whose label is 1.
train[train['label'] == 1].head(10)

In [None]:
# (rows, columns) of the training and test frames, as a pair.
train.shape, test.shape

### Let's have a glimpse of the data

In [None]:
# Number of training rows per label value.
train["label"].value_counts()

In [None]:
# Compare the distribution of tweet lengths (in characters) between the
# training and the test set on a single histogram.
length_train = train['tweet'].str.len()
length_test = test['tweet'].str.len()
for lengths, series_name in ((length_train, "train_tweets"),
                             (length_test, "test_tweets")):
    plt.hist(lengths, bins=20, label=series_name)
plt.legend()
plt.show()

### We need to clean, but first, let's combine both train and test set.

In [None]:
# Stack train on top of test so that cleaning is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same row order with a fresh 0..n-1 index.
combine = pd.concat([train, test], ignore_index=True)
combine.shape

In [None]:
def remove_pattern(input_txt, pattern):
    """Return *input_txt* with every match of the regex *pattern* removed.

    Args:
        input_txt: The text to clean.
        pattern: A regular expression (string or compiled) describing the
            spans to delete.

    Returns:
        The text with all non-overlapping matches replaced by ''.
    """
    # Substitute with the pattern itself. The previous implementation fed
    # each *matched string* back into re.sub as a new regex, so matches
    # containing metacharacters (e.g. '+', '(') were mis-interpreted, and a
    # short match could also delete the prefix of a longer one.
    return re.sub(pattern, '', input_txt)

In [None]:
# Strip Twitter handles (@user) from every tweet. The pattern is a raw
# string so that \w reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
combine['tidy_tweet'] = np.vectorize(remove_pattern)(combine['tweet'], r"@[\w]*")
combine.head()

### Punctuation, Number, and Special Character remover

Here we replace everthin except characters and hashtags with spaces. The regular expression "[^a-zA-Z#]" means anything except alphabeths and '#

In [None]:
# Replace every character that is not a letter or '#' with a space.
# The square brackets make this a negated character class (without them the
# pattern only matched the literal text "a-zA-Z#" at the start of a string),
# and regex=True is required because pandas >= 2.0 treats the pattern as a
# literal string by default.
combine['tidy_tweet'] = combine['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
combine.head(10)

In [None]:
# Keep only the words longer than three characters in each tweet and
# re-join them with single spaces.
combine['tidy_tweet'] = combine['tidy_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

combine.head()

In [None]:
# Tokenizing: split every cleaned tweet on whitespace into a list of words.
tokenized_tweet = combine['tidy_tweet'].apply(str.split)
tokenized_tweet.head()

### Now we can normalize the tokenized tweets

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# Stemming: reduce every token to its stem. The result must be assigned
# back to tokenized_tweet (it previously went to a misspelled name, so the
# stemmed tokens were never used by the following cells).
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

### Let's stitch these tokens back together.

In [None]:
# Turn each token list back into a single space-separated string and store
# the result as the cleaned tweet text.
tokenized_tweet = tokenized_tweet.apply(' '.join)
combine['tidy_tweet'] = tokenized_tweet

In [None]:
from wordcloud import WordCloud

# Word cloud built from the text of every cleaned tweet.
all_words = ' '.join(combine['tidy_tweet'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the tweets whose label is 0.
normal_words = ' '.join(combine.loc[combine['label'] == 0, 'tidy_tweet'])

wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the tweets whose label is 1.
negative_words = ' '.join(combine.loc[combine['label'] == 1, 'tidy_tweet'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# function to collect hashtags
def hashtag_extract(x):
    """Collect the hashtags of each tweet in *x*.

    Args:
        x: An iterable of tweet strings.

    Returns:
        A list with one entry per tweet; each entry is the list of hashtag
        names (without the leading '#') found in that tweet, possibly empty.
    """
    return [re.findall(r"#(\w+)", tweet) for tweet in x]

In [None]:
# Extract hashtags from non racist/sexist tweets

HT_regular = hashtag_extract(combine['tidy_tweet'][combine['label'] == 0])

# Extract hashtags from racist/sexist tweets
HT_negative = hashtag_extract(combine['tidy_tweet'][combine['label'] == 1])

# Unnest the per-tweet lists into one flat list each. A comprehension is
# linear in the number of hashtags, whereas sum(lists, []) copies the
# accumulated list on every addition (quadratic).
HT_regular = [tag for tags in HT_regular for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

In [None]:
# Frequency of every hashtag in the label-0 tweets, as a two-column frame.
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})
# Keep the 10 most frequent hashtags and draw them as a bar chart.
d = d.nlargest(n=10, columns="Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=d)
ax.set(ylabel='Count')
plt.show()

In [None]:
# Frequency of every hashtag in the label-1 tweets, as a two-column frame.
b = nltk.FreqDist(HT_negative)
e = pd.DataFrame({
    'Hashtag': list(b.keys()),
    'Count': list(b.values()),
})
# Keep the 10 most frequent hashtags and draw them as a bar chart.
e = e.nlargest(n=10, columns='Count')
plt.figure(figsize=(16,5))
ax = sns.barplot(x="Hashtag", y="Count", data=e)
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer settings: max_df=0.90, min_df=2, at most 1000 features,
# English stop words.
bow_vectorizer = CountVectorizer(
    max_df=0.90,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
# Bag-of-words feature matrix over the cleaned tweets (train + test).
bow = bow_vectorizer.fit_transform(combine['tidy_tweet'])
bow.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# from smart_open import smart_open
import gensim

# Same vocabulary limits as the bag-of-words vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.90,
    min_df=2,
    max_features=1000,
    stop_words='english',
)

# TF-IDF feature matrix over the cleaned tweets (train + test).
tfidf = tfidf_vectorizer.fit_transform(combine['tidy_tweet'])
tfidf.shape

In [None]:
!pip install --upgrade smart_open

In [None]:
import os
import gensim
# Re-tokenize the cleaned tweets: one list of whitespace-separated tokens per tweet.
tokenized_tweet = combine['tidy_tweet'].apply(lambda x: x.split()) #tokenizing
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0 renamed it to
# `vector_size` — confirm against the installed gensim version.
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            size=200, #desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg =1, # 1 for skip-gram model
            negative = 10, # for negative sampling
            workers = 2, #no.of cores
            seed = 34)

# NOTE(review): the corpus was already passed to the constructor above, which
# presumably trained the model once; this call trains for 20 further epochs —
# confirm that the extra training is intended.
model_w2v.train(tokenized_tweet, total_examples=len(combine['tidy_tweet']), epochs=20)

In [None]:
model_w2v.wv.most_similar(positive='dinner')

In [None]:
model_w2v.wv.most_similar(positive='trump')

In [None]:
model_w2v['food'] # The length of the vector is 200

In [None]:
def word_vector(tokens, size):
    """Average the word2vec vectors of *tokens* into one tweet vector.

    Args:
        tokens: Iterable of word strings.
        size: Dimensionality of the word vectors held by ``model_w2v``.

    Returns:
        A numpy array of shape (1, size): the mean of the vectors of the
        tokens found in the vocabulary, or all zeros when none are found.
    """
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            # Index through .wv: subscripting the model object itself was
            # deprecated in gensim 3.x and removed in 4.0.
            vec += model_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # the token is not in the vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
# preparing word2vec feature set: one averaged 200-dimensional vector per tweet
wordvec_arrays = np.zeros((len(tokenized_tweet), 200))
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 200)

# Build the DataFrame once, after the array is filled. It used to be rebuilt
# inside the loop, copying the whole array on every iteration.
wordvec_df = pd.DataFrame(wordvec_arrays)

wordvec_df.shape

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
# LabeledSentence was deprecated in gensim 3.x and removed in 4.0;
# TaggedDocument is its replacement with the same (words, tags) arguments.
# The alias keeps the name the rest of the notebook uses.
from gensim.models.doc2vec import TaggedDocument as LabeledSentence

## Let's labelise and or tag each tkenised tweer with unique IDs, by using Gensim's LabeledSentence() function to implement doc2vec.

In [None]:
def add_label(twt):
    """Tag every tokenised tweet with a unique ID for doc2vec training.

    Args:
        twt: A pandas Series whose values are token lists; its index
            supplies the number used in each tag.

    Returns:
        A list of TaggedDocument objects, one per tweet, each tagged with
        the single tag "tweet_<index>".
    """
    # Imported locally so this cell does not depend on LabeledSentence,
    # which was deprecated in gensim 3.x and removed in gensim 4.0.
    from gensim.models.doc2vec import TaggedDocument

    return [
        TaggedDocument(s, ["tweet_" + str(i)])
        for i, s in zip(twt.index, twt)
    ]

labeled_tweets = add_label(tokenized_tweet) # label all the tweets

labeled_tweets[:6]

## Train doc2vec model

In [None]:
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4.0) — confirm against the installed gensim version.
model_d2v = gensim.models.Doc2Vec(dm=1, # dm = 1 for the 'distributed memory' model
                                dm_mean=1, # 1 = use the mean of the context word vectors (the flag is 0 or 1; it was 21)
                                size=200, # no. of desired features
                                window=5, # width of the context window
                                negative=7, # if > 0 then negative sampling will be used
                                min_count=5, # ignores all words with total frequency lower than 5
                                workers=3, # no. of cores
                                alpha=0.1, # learning rate
                                seed=23)

# tqdm only adds a progress bar while the tagged tweets are consumed.
model_d2v.build_vocab([i for i in tqdm(labeled_tweets)])

model_d2v.train(labeled_tweets, total_examples=len(combine['tidy_tweet']), epochs=15)

In [None]:
# Prepparing doc2vec Feature Set
# One 200-dimensional row per tweet, filled from the trained doc2vec model.
docvec_arrays = np.zeros((len(tokenized_tweet), 200))
for i in range(len(combine)):
    # NOTE(review): looks up the document vector by integer position even
    # though the tags are strings ("tweet_<i>"); presumably position i matches
    # tweet i — verify. `docvecs` is named `dv` in gensim 4.
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1, 200))
    
docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

## Training the model as building models on the datasets with different features such as bags-of-words, word2vec, tf-idf & doc2vec

In [None]:
# Using Logistic Regression to train our model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# bag-of-words features
# NOTE(review): 31962 is presumably the number of training rows
# (len(train)) — verify against train.shape.
train_bow = bow[:31962,:]
test_bow = bow[31962:,:]

# splitting data into training and validation set (70% / 30%)
x_train_bow, x_valid_bow, y_train, y_valid = train_test_split(train_bow,
train['label'], random_state=42, test_size=0.3)

lreg = LogisticRegression()
lreg.fit(x_train_bow, y_train) # training the model

prediction = lreg.predict_proba(x_valid_bow) # prediction on the validation set
prediction_int = prediction[:,1] >= 0.3 # 1 if the class-1 probability is >= 0.3, else 0

# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_int = prediction_int.astype(int)


f1_score(y_valid, prediction_int) # calculating f1_score

## Make prediction on test dataset and make submission file

In [None]:
# Predict the test set with the bag-of-words logistic regression, apply the
# same 0.3 probability cut-off and write the submission file.
test_pred = lreg.predict_proba(test_bow)
test_pred_int = test_pred[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
test_pred_int = test_pred_int.astype(int)
test['label'] = test_pred_int
submission = test[['id', 'label']]
submission.to_csv('sub_lreg_bow.csv', index=False)
# submission

In [None]:
# tfidf features
# Split the TF-IDF matrix into its train and test rows, then reuse the row
# indices of the earlier train/validation split so every feature set is
# evaluated on the same tweets.
train_tfidf, test_tfidf = tfidf[:31962,:], tfidf[31962:,:]
x_train_tfidf, x_valid_tfidf = train_tfidf[y_train.index], train_tfidf[y_valid.index]

In [None]:
# Logistic regression on the TF-IDF features, scored on the validation set
# with the 0.3 probability cut-off.
lreg.fit(x_train_tfidf, y_train)
prediction_tfidf = lreg.predict_proba(x_valid_tfidf)
prediction_int_tfidf = prediction_tfidf[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_int_tfidf = prediction_int_tfidf.astype(int)
f1_score(y_valid, prediction_int_tfidf) # calculating f1 score for the validation set

In [None]:
# Word2Vec Features
# Split the word2vec features into train/test rows and reuse the row
# positions of the earlier train/validation split.
train_w2v = wordvec_df.iloc[:31962,:]
test_w2v = wordvec_df.iloc[31962:,:]
x_train_w2v = train_w2v.iloc[y_train.index,:]
x_valid_w2v = train_w2v.iloc[y_valid.index,:]


lreg.fit(x_train_w2v, y_train)
prediction_w2v = lreg.predict_proba(x_valid_w2v)
prediction_int_w2v = prediction_w2v[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_int_w2v = prediction_int_w2v.astype(int)
f1_score(y_valid, prediction_int_w2v)

In [None]:
# doc2vec features
# Split the doc2vec features into train/test rows and reuse the row
# positions of the earlier train/validation split.
train_d2v = docvec_df.iloc[:31962,:]
test_d2v = docvec_df.iloc[31962:,:]
x_train_d2v = train_d2v.iloc[y_train.index,:]
x_valid_d2v = train_d2v.iloc[y_valid.index,:]

lreg.fit(x_train_d2v, y_train)
prediction_d2v = lreg.predict_proba(x_valid_d2v)
prediction_int_d2v = prediction_d2v[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_int_d2v = prediction_int_d2v.astype(int)
f1_score(y_valid, prediction_int_d2v)

## Support Vector Machine algorithm

In [None]:
from sklearn import svm

# bag-of-words features: linear SVM with probability estimates enabled so
# the same 0.3 probability cut-off can be applied.
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(x_train_bow, y_train)
prediction_svm = svc.predict_proba(x_valid_bow)
prediction_int_svm = prediction_svm[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_int_svm = prediction_int_svm.astype(int)
f1_score(y_valid, prediction_int_svm)

In [None]:
# Predict the test set with the bag-of-words SVM and write the submission file.
test_pred_svm = svc.predict_proba(test_bow)
test_pred_int_bow = test_pred_svm[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
test_pred_int_bow = test_pred_int_bow.astype(int)
test['label'] = test_pred_int_bow
submission = test[['id','label']]
submission.to_csv('sub_svm_bow.csv', index=False)

In [None]:
#tfidf features

# Linear SVM (with probability estimates) fitted on the TF-IDF features.
svc_tfidf = svm.SVC(kernel='linear', C=1, probability=True)
svc_tfidf.fit(x_train_tfidf, y_train)

In [None]:
# Score the TF-IDF SVM on the validation set. This must use the TF-IDF
# model (svc_tfidf), the validation rows (x_valid_tfidf) and that model's own
# probabilities. Previously it used the bag-of-words model on the training
# rows and then thresholded the logistic-regression output instead.
prediction_svm_tfidf = svc_tfidf.predict_proba(x_valid_tfidf)
prediction_int_svm_tfidf = prediction_svm_tfidf[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_int_svm_tfidf = prediction_int_svm_tfidf.astype(int)
f1_score(y_valid, prediction_int_svm_tfidf)

In [None]:
# word2vec features
# Linear SVM on the word2vec features, scored on the validation set.
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(x_train_w2v, y_train)
prediction_svm_w2v = svc.predict_proba(x_valid_w2v)
prediction_svm_w2v_int = prediction_svm_w2v[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_svm_w2v_int = prediction_svm_w2v_int.astype(int)
f1_score(y_valid, prediction_svm_w2v_int)

In [None]:
# Linear SVM on the doc2vec features, scored on the validation set.
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(x_train_d2v, y_train)
prediction_svm_d2v = svc.predict_proba(x_valid_d2v)
prediction_svm_d2v_int = prediction_svm_d2v[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the same dtype.
prediction_svm_d2v_int = prediction_svm_d2v_int.astype(int)
# validation score
f1_score(y_valid, prediction_svm_d2v_int)

## The use RandomForest for the data fitting


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (400 trees, fixed seed) on the bag-of-words features.
rf = RandomForestClassifier(n_estimators=400, random_state=11)
rf.fit(x_train_bow, y_train)
prediction_rf = rf.predict(x_valid_bow)
# F1 score on the validation set
f1_score(y_valid, prediction_rf)

In [None]:
# bag-of-words: predict the test set with the random forest fitted on the
# bag-of-words features and write the submission file.
test_pred_bow = rf.predict(test_bow)
# Store the random-forest predictions. `test_pred` (used here before) holds
# the logistic-regression class probabilities, not these labels.
test['label'] = test_pred_bow
submission = test[['id', 'label']]
submission.to_csv('sub_rf_bow.csv', index=False)

In [None]:
# tf-idf
rf = RandomForestClassifier(n_estimators =400, random_state=11).fit(x_train_tfidf, y_train)
prediction_rf_tfidf = rf.predict(x_valid_tfidf)
f1_score(y_valid, prediction_rf_tfidf)

In [None]:
# Word2Vec features

rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(x_train_w2v, y_train)
prediction_w2v = rf.predict(x_valid_w2v)
f1_score(y_valid, prediction_w2v)

In [None]:
# Doc2Vec features

rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(x_train_d2v, y_train)
prediction_d2v = rf.predict(x_valid_d2v)
f1_score(y_valid, prediction_d2v)

## Extreme Gradient Boosting (xgBoost) model fitting with available features

In [None]:
from xgboost import XGBClassifier

# Bag-of-Words Features
# The keyword is `max_depth`; the misspelled `maxdepth` is not a recognised
# XGBoost parameter, so the intended depth setting was not applied.
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(x_train_bow, y_train)
prediction_xgb = xgb_model.predict(x_valid_bow)
f1_score(y_valid, prediction_xgb)

In [None]:
# tfidf features: gradient-boosted trees scored on the validation set.
# NOTE(review): the name `xgb` is bound to a classifier here and later
# rebound by `import xgboost as xgb` — these cells depend on execution order.
xgb = XGBClassifier(max_depth=6, n_estimators=1000).fit(x_train_tfidf, y_train)
prediction_xgb_tfidf = xgb.predict(x_valid_tfidf)
f1_score(y_valid, prediction_xgb_tfidf)

In [None]:
# Word2vec Features
xgb = XGBClassifier(max_depth=6, n_estimators=1000, nthread=3).fit(x_train_w2v, y_train)
prediction_xgb_w2v = xgb.predict(x_valid_w2v)
f1_score(y_valid, prediction_xgb_w2v)

In [None]:
# Doc2Vec Features
xgb = XGBClassifier(max_depth=6, n_estimators=1000).fit(x_train_d2v, y_train)
prediction_xgb_d2v = xgb.predict(x_valid_d2v)
f1_score(y_valid, prediction_xgb_d2v)

## fine tuning the xgboost parameter especially w2v as the model that gave us highest score.

In [None]:
import xgboost as xgb

In [None]:
# Wrap the word2vec features in DMatrix objects for the native xgboost API:
# train and validation carry their labels, the test matrix has features only.
d_train = xgb.DMatrix(x_train_w2v, label=y_train)
d_valid = xgb.DMatrix(x_valid_w2v, label=y_valid)
d_test = xgb.DMatrix(test_w2v)

In [None]:
# Parameters that we are going to tune
params = {
    'objective':'binary:logistic',
    # was misspelled 'max_deoth', which left a stray unknown key in the dict
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample':1,
    'colsample_bytree': 1
}

In [None]:
# Prepare the custom evaluation function to calculate f1_score
def custom_eval(preds, d_train):
    """Custom xgboost metric: F1 score at a 0.3 probability cut-off.

    Args:
        preds: Array of predicted scores; values >= 0.3 count as class 1.
        d_train: Object exposing ``get_label()`` with the true labels of the
            data being evaluated.

    Returns:
        A one-element list ``[('f1_score', value)]``.
    """
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    labels = d_train.get_label().astype(int)
    preds = (preds >= 0.3).astype(int)
    return [('f1_score', f1_score(labels, preds))]

### General approach for tuning

In [None]:
# Tuning max_depth and min_child_weight
griedsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6, 10)
    for min_child_weight in range(5, 8)
]

max_f1 = 0. # initializing with 0
best_params = None
for max_depth, min_child_weight in griedsearch_params:
    print("CV with max_depth={}, min__child_weight={}".format(
                    max_depth,
                    min_child_weight))
    # update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Cross-validation
    cv_results = xgb.cv(dtrain=d_train,
                        params=params,
                        nfold=3,
                        num_boost_round=50,
                        early_stopping_rounds=10,
                        feval=custom_eval,
                        #metrics="rmse",
                        maximize=True,
                        seed=123
                       )

    # Finding the best F1 Score. This bookkeeping has to run inside the loop
    # for every combination; it used to sit after the loop, so only the last
    # combination was ever considered.
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds.".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (max_depth, min_child_weight)


print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

# CV with max_depth=6, min_child_weight=5


In [None]:
# Updating the max_depth and min_child_weight parameters
params['max_depth'] = 8
# The dictionary is named `params`; `param` raised a NameError.
params['min_child_weight'] = 6

## Tuning subsample and colsample

In [None]:
# Candidate (subsample, colsample_bytree) pairs, each from 0.5 to 0.9.
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(5, 10)]
    for colsample in [i/10. for i in range(5,10)]
]

max_f1 = 0.
best_params = None
for subsample, colsample in gridsearch_params:
    print("CV with subsample={}, colsample={}".format(
                                subsample,
                                colsample))

    # Update our parameters. The XGBoost key is 'colsample_bytree'; the
    # previously used 'colsample' is not a recognised parameter, so the
    # column-sampling value was never actually varied.
    params['colsample_bytree'] = colsample
    params['subsample'] = subsample
    cv_results = xgb.cv(
            params=params,
            dtrain=d_train,
            feval = custom_eval,
            num_boost_round=200,
            maximize=True,
            seed=16,
            nfold=5,
            early_stopping_rounds=10)

    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample)

# Three placeholders need three arguments (only two were supplied before,
# which raised an IndexError).
print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

In [None]:
# Fix subsample and colsample_bytree at the values chosen for the rest of
# the tuning.
params.update(subsample=.9, colsample_bytree=.5)

In [None]:
# Now let's tune the learning rate

max_f1 = 0.
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # Update ETA
    params['eta'] = eta

    # Run CV
    cv_results = xgb.cv(
            params=params,
            dtrain=d_train,
            feval=custom_eval,
            num_boost_round=1000,
            maximize=True,
            seed=16,
            nfold=5,
            early_stopping_rounds=20
    )

    # Finding best f1 score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        # Record the new best score. The old self-assignment
        # `mean_f1 = mean_f1` left max_f1 at 0, so the last eta always "won".
        max_f1 = mean_f1
        best_params = eta

print("Best params: {}, F1 Score: {}".format(best_params, max_f1))


In [None]:
# Let's look at the final list of tuned parameters

params

# NOTE(review): the literal below looks like pasted output of `params` from an
# earlier run; it is evaluated and discarded. Also, after the eta loop
# params['eta'] still holds the last value tried unless it is set explicitly
# — confirm the intended learning rate before training.
{
    'colsample': 0.9,
    'colsample_bytree': 0.5, 'eta': 0.1,
    'max_depth': 8, 'min_child_weight': 6,
    'objective': 'binary:logistic',
    'subsample': 0.9
}

# Train the final model on the training DMatrix with early stopping on the
# validation F1 score. The training matrix is named `d_train`; `dtrain` was
# undefined and raised a NameError.
xgb_model = xgb.train(
    params,
    d_train,
    feval=custom_eval,
    num_boost_round=1000,
    maximize=True,
    evals=[(d_valid, 'Validation')],
    early_stopping_rounds=10)

In [None]:
#