In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings
import seaborn as sns 
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# import the modules we'll need
from IPython.display import HTML
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
    """Build a clickable link that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv()``, base64-encoded and embedded
    directly in a ``data:`` URI, so nothing is written to disk.

    Parameters
    ----------
    df : object exposing ``to_csv()`` (e.g. a DataFrame)
        Data to offer for download.
    title : str
        Visible text of the link.
    filename : str
        File name placed in the anchor's ``download`` attribute.

    Returns
    -------
    The ``HTML`` display object wrapping the anchor tag.
    """
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    anchor = anchor_template.format(payload=encoded_csv, title=title, filename=filename)
    return HTML(anchor)

In [None]:
# Read the training and test tweet CSVs into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/twitter-tweets/train_2kmZucJ (1).csv',
        '/kaggle/input/twitter-tweets/test_oJQbWVk.csv',
    )
)

**1. Data Inspection**

In [None]:
# Preview the first training rows whose label is 0.
train.loc[train['label'].eq(0)].head()

In [None]:
# Preview the first training rows whose label is 1.
train.loc[train['label'].eq(1)].head()

In [None]:
# Class balance: number of training tweets per label value.
train.loc[:, 'label'].value_counts()

About 74.5% of the training tweets are non-racist/non-sexist (label 0).

In [None]:
# Compare the character-length distribution of the train and test tweets.
train_length = train['tweet'].str.len()
test_length = test['tweet'].str.len()

for lengths, legend_label in (
    (train_length, 'train_length'),
    (test_length, 'test_length'),
):
    plt.hist(lengths, bins=20, label=legend_label)
plt.legend()
plt.show()


**2.  Data Cleaning**

Before cleaning, we combine the train and test data, because the same cleaning steps must be applied to both datasets.

In [None]:
# Stack train and test into a single frame so each cleaning step runs once on both.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent (train rows first, then test rows, fresh 0..n-1 index).
combi = pd.concat([train, test], ignore_index=True)
combi.shape

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches replaced by ''.

    Notes
    -----
    The substitution is done with ``pattern`` itself. Feeding the matched
    text back into ``re.sub`` would treat it as a new regex, which breaks on
    metacharacters such as ``?`` or ``+`` in URLs and lets a short match
    (``@al``) also delete the prefix of a longer one (``@alice``). It would
    also only see the group contents when ``pattern`` has a capturing group.
    """
    return re.sub(pattern, '', input_txt)

2.1 Removing the words starting with @

In [None]:
# Strip @user handles from the raw tweets into a new column; `tweet` itself is kept.
# The pattern is a raw string: "\w" inside a normal literal is an invalid escape
# sequence that newer Python versions warn about.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")
combi.head()

In [None]:
# Strip http:// and https:// links.
# The scheme is written without a capturing group so that a regex match is the
# whole URL rather than just "http"/"https", and the literal is a raw string so
# "\S" is not an invalid escape sequence.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tidy_tweet'], r"https?://\S+")

2.2 Removing punctuation, numbers and special characters

In [None]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would turn this step into a no-op.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)
combi.head(10)

2.3 Removing short words (length of 3 or less)

In [None]:
# Keep only the whitespace-separated words that are longer than three characters.
combi['tidy_tweet'] = combi['tidy_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)
combi.head(10)

2.4 Removing the stop words

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
stop = stopwords.words('english')
# Set copy of the stop-word list for fast membership tests in the filter below.
stop_words = set(stop)

tokenized_tweet = combi['tidy_tweet'].apply(lambda x : x.split())
# Series.apply returns a new Series, so the filtered tokens have to be assigned
# back; otherwise the stop words are never actually removed.
tokenized_tweet = tokenized_tweet.apply(lambda x: [item for item in x if item not in stop_words])
# Re-join each token list into one space-separated string per tweet.
tokenized_tweet = tokenized_tweet.apply(' '.join)
combi['tidy_tweet'] = tokenized_tweet

2.5 Text Normalization

In [None]:
# Tokenization: split every cleaned tweet on whitespace into a list of words.
tokenized_tweet = combi['tidy_tweet'].apply(str.split)
tokenized_tweet.head()

In [None]:
# Normalizing the tokenized tweets.
# Import only the name that is used instead of `from ... import *`, which hides
# where names come from and can shadow existing ones.
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 

stemmer = PorterStemmer() 
lemmatizer = WordNetLemmatizer()
# Alternative normalization, currently disabled: Porter stemming.
#tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
# Active normalization: lemmatize every token of every tweet.
tokenized_tweet = tokenized_tweet.apply(lambda x: [lemmatizer.lemmatize(i) for i in x]) # lemmatization

In [None]:
# Turn each token list back into one space-separated string and store the
# normalized text in the working column.
tokenized_tweet = tokenized_tweet.apply(' '.join)
combi['tidy_tweet'] = tokenized_tweet

**3. Story Generation and Visualization from Tweets**

3.1 Understanding common words used in tweets(using word cloud)

In [None]:
from wordcloud import WordCloud

# Word cloud built from every cleaned tweet in the combined frame.
all_words = ' '.join(combi['tidy_tweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

3.2 Common words in non-racist/non-sexist tweets

In [None]:
# Word cloud restricted to the cleaned tweets whose label is 0.
nonracist_words = ' '.join(combi.loc[combi['label'] == 0, 'tidy_tweet'])
nonracist_word_cloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(nonracist_words)

plt.figure(figsize=(10, 7))
plt.imshow(nonracist_word_cloud, interpolation="bilinear")
plt.axis('off')
plt.show()

3.3 Common words in racist/sexist comments

In [None]:
# Word cloud restricted to the cleaned tweets whose label is 1.
all_racist_words = ' '.join(combi.loc[combi['label'] == 1, 'tidy_tweet'])
racist_word_cloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_racist_words)

plt.figure(figsize=(10, 7))
plt.imshow(racist_word_cloud, interpolation="bilinear")
plt.axis('off')
plt.show()

3.4 Understanding the impact of hashtags on tweet sentiment

In [None]:
# function to collect hashtags
def hashtagExtract(x):
    """Collect the hashtags found in each string of ``x``.

    Parameters
    ----------
    x : iterable of str
        Texts to scan.

    Returns
    -------
    list of list of str
        One inner list per input text, holding the word characters that
        follow each '#' (the '#' itself is not included). Texts without
        hashtags contribute an empty list.
    """
    return [re.findall(r"#(\w+)", text) for text in x]

In [None]:
#extracting hashtags from non racist comments
HT_nonracist = hashtagExtract(combi['tidy_tweet'][combi['label'] == 0]) 
#extracting hashtags from racist  comments
HT_negative = hashtagExtract(combi['tidy_tweet'][combi['label'] == 1]) 
# Unnesting: flatten the per-tweet lists into one flat list of hashtags.
# A comprehension is linear in the number of hashtags, whereas sum(lists, [])
# copies the growing accumulator on every addition (quadratic).
HT_nonracist = [tag for tags in HT_nonracist for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

In [None]:
# Count every hashtag seen in label-0 tweets and plot the 20 most frequent.
a = nltk.FreqDist(HT_nonracist)
hashtag_names = list(a.keys())
hashtag_counts = list(a.values())
d = pd.DataFrame({'Hashtag': hashtag_names, 'Count': hashtag_counts})
d = d.nlargest(n=20, columns="Count")

plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=d)
ax.set(ylabel='Count')
plt.show()

In [None]:
# Count every hashtag seen in label-1 tweets and plot the 20 most frequent.
b = nltk.FreqDist(HT_negative)
negative_names = list(b.keys())
negative_counts = list(b.values())
e = pd.DataFrame({'Hashtag': negative_names, 'Count': negative_counts})
e = e.nlargest(n=20, columns="Count")

plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=e)
ax.set(ylabel='Count')
plt.show()

**4. Extracting Features from cleaned tweets**

4.1 Bag of Words used to build sparse matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim
from nltk.tokenize import TreebankWordTokenizer

# Bag-of-words vectorizer, configured in one place.
tokenizer = TreebankWordTokenizer()
bow_vectorizer = CountVectorizer(
    tokenizer=tokenizer.tokenize,
    stop_words='english',
    ngram_range=(1, 3),   # include 1-grams up to 3-grams
    max_df=0.5,           # ignore terms that appear in more than 50% of the documents
    min_df=2,             # only keep terms that appear in at least 2 documents
    max_features=1500,
)



In [None]:
# Fit the bag-of-words vectorizer on every cleaned tweet of the combined frame
# and transform those tweets into the feature matrix; then show its dimensions.
bow = bow_vectorizer.fit_transform(combi['tidy_tweet']) 
bow.shape

4.2  TF-IDF Features

In [None]:
# TF-IDF vectorizer, configured in one place.
tokenizer = TreebankWordTokenizer()
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    stop_words='english',
    ngram_range=(1, 2),   # include 1-grams and 2-grams
    max_df=0.5,           # ignore terms that appear in more than 50% of the documents
    min_df=2,
    max_features=1000,
)

In [None]:
# Fit the TF-IDF vectorizer on every cleaned tweet of the combined frame and
# transform those tweets into the feature matrix; then show its dimensions.
tfidf = tfidf_vectorizer.fit_transform(combi['tidy_tweet']) 
tfidf.shape

4.3 Word2Vec Features

In [None]:
# Tokenize the cleaned tweets and train a Word2Vec model on them.
tokenized_tweet = combi['tidy_tweet'].apply(str.split)
model_w2v = gensim.models.Word2Vec(
    tokenized_tweet,
    size=200,       # matches the 200-wide feature arrays built from these vectors
    window=5,
    min_count=2,
    sg=1,
    hs=0,
    negative=10,
    workers=2,
    seed=34,
)

# NOTE(review): the constructor above already received the corpus; this call
# trains for 20 further epochs - confirm that is intended.
model_w2v.train(tokenized_tweet, total_examples=len(combi['tidy_tweet']), epochs=20)

In [None]:
# Sanity check of the embeddings: words closest to "food".
model_w2v.wv.most_similar(positive=["food"])

In [None]:
def word_vector(tokens, size):
    """Average the Word2Vec vectors of ``tokens`` into a single row vector.

    Parameters
    ----------
    tokens : iterable of str
        Words of one tweet.
    size : int
        Dimensionality of the word vectors held by ``model_w2v``.

    Returns
    -------
    numpy.ndarray of shape (1, size)
        Mean of the vectors of all tokens present in the model vocabulary;
        all zeros when no token is in the vocabulary (or ``tokens`` is empty).
    """
    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            # Look the word up on the KeyedVectors (`.wv`): indexing the model
            # object directly is deprecated in gensim 3.x and removed in 4.0.
            vec += model_w2v.wv[word].reshape((1, size))
        except KeyError: # handling the case where the token is not in vocabulary
            continue
        count += 1
    if count:
        vec /= count
    return vec

In [None]:
# One averaged 200-dimensional Word2Vec feature row per tweet.
wordvec_arrays = np.zeros((len(tokenized_tweet), 200)) 
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 200)

# Build the DataFrame once, after the array is complete. Creating it inside the
# loop re-copied the whole array on every iteration and left `wordvec_df`
# undefined when there were no tweets.
wordvec_df = pd.DataFrame(wordvec_arrays) 
wordvec_df.shape

In [None]:
# Show the dimensions (rows, columns) of the Word2Vec feature frame.
wordvec_df.shape

4.4 Doc2Vec Embedding

In [None]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import LabeledSentence

In [None]:
def add_label(twt):
    """Wrap every tokenized tweet in a tagged document for Doc2Vec.

    Parameters
    ----------
    twt : pandas.Series of list of str
        Tokenized tweets; the Series index supplies each document's id.

    Returns
    -------
    list
        One ``TaggedDocument(words, ["tweet_<index>"])`` per tweet, in the
        order of ``twt``.
    """
    # TaggedDocument is the supported class; LabeledSentence is only a
    # deprecated alias of it in gensim 3.x and no longer exists in gensim 4.
    from gensim.models.doc2vec import TaggedDocument

    return [TaggedDocument(s, ["tweet_" + str(i)]) for i, s in zip(twt.index, twt)]
labeled_tweets = add_label(tokenized_tweet) # label all the tweets

In [None]:
# Inspect the first six tagged tweets.
labeled_tweets[:6]

In [None]:
# Configure the Doc2Vec model, build its vocabulary from the tagged tweets
# (with a progress bar), then train it.
model_d2v = gensim.models.Doc2Vec(
    dm=1,
    dm_mean=1,
    size=200,
    window=5,
    negative=7,
    min_count=5,
    workers=3,
    alpha=0.1,
    seed=23,
)
model_d2v.build_vocab(list(tqdm(labeled_tweets)))
model_d2v.train(labeled_tweets, total_examples=len(combi['tidy_tweet']), epochs=15)

In [None]:
# Copy the learned document vector of every tweet into one 200-column array,
# row by row, and wrap it in a DataFrame.
docvec_arrays = np.zeros((len(tokenized_tweet), 200))
for row_idx in range(len(combi)):
    docvec_arrays[row_idx, :] = model_d2v.docvecs[row_idx].reshape((1, 200))

docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

**5. Model Development**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:

def ModelDevelopment(feat,test_feat,model,threshold=0.3):
    """Fit ``model``, report its validation F1 score and predict the test set.

    Parameters
    ----------
    feat : array-like or sparse matrix
        Feature rows for the training tweets, in the same order as ``train``.
    test_feat : array-like or sparse matrix
        Feature rows for the test tweets, in the same order as ``test``.
    model : estimator with ``fit`` and ``predict_proba``
        Classifier to train; it is fitted in place.
    threshold : float, default 0.3
        A tweet is predicted as class 1 when its class-1 probability is
        greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        The ``id`` and predicted ``label`` columns of ``test``.

    Side effects
    ------------
    Prints the validation F1 score and overwrites ``test['label']`` on the
    module-level ``test`` frame with the predictions.
    """
    # Hold out 30% of the training rows for validation (fixed seed).
    x_train, x_valid, y_train, y_valid = train_test_split(
        feat, train['label'], random_state=42, test_size=0.3
    )
    model.fit(x_train, y_train)

    # Validation: threshold the class-1 probability instead of using predict().
    # Built-in int replaces np.int, an alias that was removed in NumPy 1.24.
    valid_proba = model.predict_proba(x_valid)[:, 1]
    prediction_int = (valid_proba >= threshold).astype(int)

    print(f1_score(y_valid, prediction_int))

    # Test prediction with the same threshold.
    test_proba = model.predict_proba(test_feat)[:, 1]
    test_pred_int = (test_proba >= threshold).astype(int)
    test['label'] = test_pred_int
    submission = test[['id','label']]

    return  submission

**5.1   Bag-of-Words Features**

In [None]:
# Extracting train and test BoW features.
# combi is train followed by test, so the first len(train) rows are the
# training features and the remaining rows are the test features. Deriving the
# split from the data avoids hard-coded row counts that silently go stale.
train_bow = bow[:len(train)]
test_bow = bow[len(train):]

In [None]:
# Logistic regression on the bag-of-words features.
lreg = LogisticRegression()
sub = ModelDevelopment(feat=train_bow, test_feat=test_bow, model=lreg)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# Linear-kernel support vector machine (with probability estimates) on the
# bag-of-words features.
svc = svm.SVC(
    kernel='linear',
    C=1,
    probability=True,
)
sub = ModelDevelopment(feat=train_bow, test_feat=test_bow, model=svc)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# Random forest on the bag-of-words features.
rf = RandomForestClassifier(
    n_estimators=400,
    random_state=11,
)
sub = ModelDevelopment(feat=train_bow, test_feat=test_bow, model=rf)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# XGBoost on the bag-of-words features.
xgb_model = XGBClassifier(
    max_depth=6,
    n_estimators=1000,
)
sub = ModelDevelopment(feat=train_bow, test_feat=test_bow, model=xgb_model)

**5.2   TF- IDF**

In [None]:
# Split the TF-IDF matrix back into its train and test rows. combi is train
# followed by test, so the boundary is len(train) rather than a hard-coded count.
train_tf = tfidf[:len(train)]
test_tf = tfidf[len(train):]
# Logistic regression on the TF-IDF features.
sub = ModelDevelopment(train_tf,test_tf,lreg)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# Support vector machine on the TF-IDF features.
sub = ModelDevelopment(feat=train_tf, test_feat=test_tf, model=svc)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# Random forest on the TF-IDF features.
sub = ModelDevelopment(feat=train_tf, test_feat=test_tf, model=rf)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# XGBoost on the TF-IDF features.
sub = ModelDevelopment(feat=train_tf, test_feat=test_tf, model=xgb_model)

**5.3 Word2Vec Features**

In [None]:
# Split the Word2Vec features back into their train and test rows. combi is
# train followed by test, so the boundary is len(train) rather than a
# hard-coded count.
train_wod = wordvec_df.iloc[:len(train)]
test_wod = wordvec_df.iloc[len(train):]
# Logistic regression on the Word2Vec features.
sub = ModelDevelopment(train_wod,test_wod,lreg)
create_download_link(sub,"Download csv link",'logistic.csv')

In [None]:
# Support vector machine on the Word2Vec features, with a download link for
# the resulting submission.
sub = ModelDevelopment(feat=train_wod, test_feat=test_wod, model=svc)
create_download_link(sub, title="Download csv link", filename='svc_model_w2v.csv')

In [None]:
# Random forest on the Word2Vec features.
sub = ModelDevelopment(feat=train_wod, test_feat=test_wod, model=rf)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# Best model so far, with a leaderboard F1 score of 0.9000:
# XGBoost on the Word2Vec features.
sub = ModelDevelopment(feat=train_wod, test_feat=test_wod, model=xgb_model)
create_download_link(sub, title="Download csv link", filename='xgb_model_w2v.csv')

**5.4 Doc2Vec Features**

In [None]:
# Split the Doc2Vec features back into their train and test rows. combi is
# train followed by test, so the boundary is len(train) rather than a
# hard-coded count.
train_doc = docvec_df.iloc[:len(train)]
test_doc = docvec_df.iloc[len(train):]
# Logistic regression on the Doc2Vec features.
sub = ModelDevelopment(train_doc,test_doc,lreg)
#create_download_link(sub,"Download csv link",'test.csv')

In [None]:
# Support vector machine on the Doc2Vec features.
sub = ModelDevelopment(feat=train_doc, test_feat=test_doc, model=svc)

In [None]:
# Random forest on the Doc2Vec features.
sub = ModelDevelopment(feat=train_doc, test_feat=test_doc, model=rf)

In [None]:
# XGBoost on the Doc2Vec features.
sub = ModelDevelopment(feat=train_doc, test_feat=test_doc, model=xgb_model)

**So far, XGBoost with Word2Vec features has the best performance.**