# Imports
All the imports which are necessary for the proprocessing steps, model training etc. We import:
1. numpy for linear algebra
2. pandas for preprocessing
3. sklearn for all the models which are implememted in the code
4. nltk for all the NLP related preprocessing

In [73]:
import io
import re
import numpy as np
import pandas as pd  
import matplotlib.pyplot as plt

import sklearn.metrics as metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score

from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk import ne_chunk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

# Data Reading
The .csv file is read from the Google Drive.

In [74]:
# from google.colab import drive
# drive.mount('/content/gdrive/', force_remount=True)

# train_df = pd.read_csv('gdrive/My Drive/train_df.csv')
# test_df = pd.read_csv('gdrive/My Drive/test_df.csv')

train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv("test_df.csv")

In [75]:
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,What are interesting facts about Microsoft his...,0
1,dc708b74a108d0fc0ad9,What are those things which are not gonna happ...,0
2,06a27ec5d82dacd8bfe0,"What should I know to avoid being ""upsold"" whe...",0
3,00cbb6b17e3ceb7c5358,How I add any account with payment bank?,0
4,7c304888973a701585a0,Which Multi level marketing products are actua...,0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,How is CSE at VIT Chennai?,0
999996,e80edbfc086f7125940f,"How can we prevent a holocaust by robots, AI, ...",0
999997,1506dfad6bd340782a1f,How can I help a student remember key steps an...,0
999998,b56c60fd407f2f85553c,What is the difference between lace closure & ...,0


In [76]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


# Preprocessing and EDA
The preprocessing starts here and the operations aree performed in order:
1. All the questions are first converted into lower case. This is because, the words 'english' and 'English' have the same meaning and having both the words in different ways will just add as noise to the model.
2. All the HTML tags are removed as the do not depict any importance in defining the meaning of the question.
3. We the remove all the URLs from the questions as they also do not contribute towards classifying the question as a spam or troll.
4. We then remove all the accented words from the questions. For example, the word 'résumé' will be converted to 'resume'.
5. Then punctuation is removed as they do not contribute towards identifying a question as spam or troll.
6. Finally, we remove the extra whitespaces or tabs in the sentences formed.

In [138]:
df = train_df

In [77]:
train_df = train_df[train_df['question_text'].map(lambda x: x.isascii())]
#test_df = test_df[test_df['question_text'].map(lambda x: x.isascii())]


In [144]:
train_df.shape

(978860, 5)

In [152]:
temp = train_df.loc[~train_df['question_text'].isin(df['question_text'])]
temp.shape

(0, 5)

In [147]:
a_set=set(df.question_text.unique())
b_set=set(train_df.question_text.unique())

list(b_set-a_set)

[]

In [78]:
xSecureLower_train = train_df['question_text'].to_string(na_rep='').lower() #ye hai number removal wala
xSecureLower_test = test_df['question_text'].to_string(na_rep='').lower() #ye hai number removal wala

xLower_train = train_df['question_text'].str.lower()
xLower_test = test_df['question_text'].str.lower()

xLower_train, xLower_test

(0         what are interesting facts about microsoft his...
 1         what are those things which are not gonna happ...
 2         what should i know to avoid being "upsold" whe...
 3                  how i add any account with payment bank?
 4         which multi level marketing products are actua...
                                 ...                        
 999995                           how is cse at vit chennai?
 999996    how can we prevent a holocaust by robots, ai, ...
 999997    how can i help a student remember key steps an...
 999998    what is the difference between lace closure & ...
 999999     what happens when you look into a broken mirror?
 Name: question_text, Length: 978860, dtype: object,
 0         my period is due on my wedding day. how can i ...
 1         how many numbers higher than a million can be ...
 2         how come i feel nothing for my family, but sti...
 3         in case of collapse of the democratic party, w...
 4                              

Replace missing values and any other datatype with empty string, and lowercase all the strings

In [79]:
train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower() if isinstance(x, str) else "")
test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower() if isinstance(x, str) else "")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower() if isinstance(x, str) else "")


In [80]:
train_df['question_text'] = train_df['question_text'].astype(str)
test_df['question_text'] = test_df['question_text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['question_text'] = train_df['question_text'].astype(str)


To remove all the HTML tags which might be present in the questions asked

In [81]:
def remove_html_tags(text):
  new_text = re.sub('<.*?>','',text)
  return new_text

To remove any URLs present as they wont have much importance towards the meaning of the question.

In [82]:
def remove_url(text):
  new_text = re.sub(r'https?://\S+|www\.\S+', '', text)
  return new_text

This will remove any accented words and convert them to simple words. For example, résumé will be converted to resume.


In [83]:
def convert_accent_words(text):
  return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

This will remove all the punctuation marks and numbers from the texts as they have no contribution towards making the question spam or troll.


In [84]:
def remove_punctuation_marks_and_numbers(text):
  return re.sub(r'[^a-zA-Z]', ' ', text)

To remove the extra whitespaces form the text

In [85]:
def remove_whitespace(text):
  return re.sub(r'^\s*|\s\s*', ' ', text).strip()

# Lemmatization

In [86]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(text)
    text = ' '.join([WordNetLemmatizer().lemmatize(word, pos='v') for word in words])
    return text

[nltk_data] Downloading package wordnet to /home/bhavjyot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/bhavjyot/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/bhavjyot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Calling all the functions for preprocessing on the train dataframe and the test dataframe.

In [87]:
train_df['clean_questions']=train_df['question_text'].apply(lambda cw : remove_html_tags(cw))
train_df['clean_questions']=train_df['question_text'].apply(lambda cw : remove_url(cw))
train_df['clean_questions']=train_df['question_text'].apply(lambda cw : convert_accent_words(cw))
train_df['clean_questions']=train_df['question_text'].apply(lambda cw : remove_punctuation_marks_and_numbers(cw))
train_df['clean_questions']=train_df['question_text'].apply(lambda cw : lemmatize_text(cw))

test_df['clean_questions']=test_df['question_text'].apply(lambda cw : remove_html_tags(cw))
test_df['clean_questions']=test_df['question_text'].apply(lambda cw : remove_url(cw))
test_df['clean_questions']=test_df['question_text'].apply(lambda cw : convert_accent_words(cw))
test_df['clean_questions']=test_df['question_text'].apply(lambda cw : remove_punctuation_marks_and_numbers(cw))
test_df['clean_questions']=test_df['question_text'].apply(lambda cw : lemmatize_text(cw))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['clean_questions']=train_df['question_text'].apply(lambda cw : remove_html_tags(cw))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['clean_questions']=train_df['question_text'].apply(lambda cw : remove_url(cw))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['clean_questions'

In [88]:
train_df

Unnamed: 0,qid,question_text,target,clean_questions
0,dda0b0efc8ba86e81ec4,what are interesting facts about microsoft his...,0,what be interest facts about microsoft history ?
1,dc708b74a108d0fc0ad9,what are those things which are not gonna happ...,0,what be those things which be not gon na happe...
2,06a27ec5d82dacd8bfe0,"what should i know to avoid being ""upsold"" whe...",0,what should i know to avoid be `` upsold '' wh...
3,00cbb6b17e3ceb7c5358,how i add any account with payment bank?,0,how i add any account with payment bank ?
4,7c304888973a701585a0,which multi level marketing products are actua...,0,which multi level market products be actually ...
...,...,...,...,...
999995,4bd96088d0b5f0f2c4f4,how is cse at vit chennai?,0,how be cse at vit chennai ?
999996,e80edbfc086f7125940f,"how can we prevent a holocaust by robots, ai, ...",0,"how can we prevent a holocaust by robots , ai ..."
999997,1506dfad6bd340782a1f,how can i help a student remember key steps an...,0,how can i help a student remember key step and...
999998,b56c60fd407f2f85553c,what is the difference between lace closure & ...,0,what be the difference between lace closure & ...


In [89]:
test_df

Unnamed: 0,qid,question_text,clean_questions
0,a4f3da3a3df9dd881edd,my period is due on my wedding day. how can i ...,my period be due on my wed day . how can i sto...
1,9914c62ed3f69684d549,how many numbers higher than a million can be ...,how many number higher than a million can be f...
2,8138ae48649e37091a91,"how come i feel nothing for my family, but sti...","how come i feel nothing for my family , but st..."
3,981b4753d17ef14d09f7,"in case of collapse of the democratic party, w...","in case of collapse of the democratic party , ..."
4,452e2c705276ba16b7b7,who is émile naoumoff?,who be émile naoumoff ?
...,...,...,...
306117,a352dff4fcc2571815ce,did anyone get an update on maruti suzuki all ...,do anyone get an update on maruti suzuki all i...
306118,ad4a8498d97c536c67b9,what 5 people in history do you find the most ...,what 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,how can i remove the tan on my forehead?,how can i remove the tan on my forehead ?
306120,370191dba26465997879,"if you are a well known hacker, will you be mo...","if you be a well know hacker , will you be mor..."


# Remove Stopwords
All the stopwords such as 'a', 'an', 'the', etc are removed as they do not contribute towards predicting the target. The library nltk is used. The library has a vocabulary of stopwords according to which the stopwords are removed.

In [90]:
import nltk
nltk.download("stopwords")
import nltk.corpus
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

train_df['without_stopwords'] = train_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test_df['without_stopwords'] = test_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bhavjyot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['without_stopwords'] = train_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [91]:
import spacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words

train_df['without_stopwords'] = train_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw_spacy)]))
test_df['without_stopwords'] = test_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw_spacy)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['without_stopwords'] = train_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw_spacy)]))


In [92]:
# import gensim
# from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

# train_df['without_stopwords'] = train_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (STOPWORDS)]))
# test_df['without_stopwords'] = test_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (STOPWORDS)]))

In [93]:
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# train_df['without_stopwords'] = train_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (ENGLISH_STOP_WORDS)]))
# test_df['without_stopwords'] = test_df['clean_questions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (ENGLISH_STOP_WORDS)]))

In [94]:
train_df

Unnamed: 0,qid,question_text,target,clean_questions,without_stopwords
0,dda0b0efc8ba86e81ec4,what are interesting facts about microsoft his...,0,what be interest facts about microsoft history ?,interest facts microsoft history ?
1,dc708b74a108d0fc0ad9,what are those things which are not gonna happ...,0,what be those things which be not gon na happe...,things gon na happen ?
2,06a27ec5d82dacd8bfe0,"what should i know to avoid being ""upsold"" whe...",0,what should i know to avoid be `` upsold '' wh...,know avoid `` upsold '' car brake change ?
3,00cbb6b17e3ceb7c5358,how i add any account with payment bank?,0,how i add any account with payment bank ?,add account payment bank ?
4,7c304888973a701585a0,which multi level marketing products are actua...,0,which multi level market products be actually ...,multi level market products actually worth pur...
...,...,...,...,...,...
999995,4bd96088d0b5f0f2c4f4,how is cse at vit chennai?,0,how be cse at vit chennai ?,cse vit chennai ?
999996,e80edbfc086f7125940f,"how can we prevent a holocaust by robots, ai, ...",0,"how can we prevent a holocaust by robots , ai ...","prevent holocaust robots , ai , alien ?"
999997,1506dfad6bd340782a1f,how can i help a student remember key steps an...,0,how can i help a student remember key step and...,help student remember key step information wri...
999998,b56c60fd407f2f85553c,what is the difference between lace closure & ...,0,what be the difference between lace closure & ...,difference lace closure & lace frontals ?


In [95]:
test_df

Unnamed: 0,qid,question_text,clean_questions,without_stopwords
0,a4f3da3a3df9dd881edd,my period is due on my wedding day. how can i ...,my period be due on my wed day . how can i sto...,period wed day . stop ? pill option .
1,9914c62ed3f69684d549,how many numbers higher than a million can be ...,how many number higher than a million can be f...,"number higher million form digits 0,4,4,5,5,5,3 ?"
2,8138ae48649e37091a91,"how come i feel nothing for my family, but sti...","how come i feel nothing for my family , but st...","come feel family , love pet friends ? like hat..."
3,981b4753d17ef14d09f7,"in case of collapse of the democratic party, w...","in case of collapse of the democratic party , ...","case collapse democratic party , republican pa..."
4,452e2c705276ba16b7b7,who is émile naoumoff?,who be émile naoumoff ?,émile naoumoff ?
...,...,...,...,...
306117,a352dff4fcc2571815ce,did anyone get an update on maruti suzuki all ...,do anyone get an update on maruti suzuki all i...,update maruti suzuki india engineer hire exam ...
306118,ad4a8498d97c536c67b9,what 5 people in history do you find the most ...,what 5 people in history do you find the most ...,5 people history find interest ?
306119,19784a27b55d4b453fda,how can i remove the tan on my forehead?,how can i remove the tan on my forehead ?,remove tan forehead ?
306120,370191dba26465997879,"if you are a well known hacker, will you be mo...","if you be a well know hacker , will you be mor...","know hacker , prone hack ?"


In [96]:
list_of_questions_train = train_df['without_stopwords'].to_list()
og_list_of_questions_train = train_df['clean_questions'].to_list()

list_of_questions_test = test_df['without_stopwords'].to_list()
og_list_of_questions_test = test_df['clean_questions'].to_list()

# Tokenization
Tokenization is performed by using the CountVectorizer.

In [97]:
bow_transformer =  CountVectorizer()  
bow_transformer_train = bow_transformer.fit(list_of_questions_train)
bow_transformer_test = bow_transformer.fit(list_of_questions_test)

In [98]:
messages_bow_train = bow_transformer_train.transform(list_of_questions_train)
messages_bow_test = bow_transformer_train.transform(list_of_questions_test)

# TF-IDF 
TF-IDF is performed on the corpus which we get after performing tokenization.

In [99]:
tfidf_transformer_train = TfidfTransformer().fit(messages_bow_train)  #Applying TF-ID to our questions
tfidf_transformer_test = TfidfTransformer().fit(messages_bow_test)  #Applying TF-ID to our questions

train_set = tfidf_transformer_train.transform(messages_bow_train) 
test_set = tfidf_transformer_train.transform(messages_bow_test) 

In [100]:
train_set

<978860x84732 sparse matrix of type '<class 'numpy.float64'>'
	with 5318357 stored elements in Compressed Sparse Row format>

# Data Splitting
Data is split in the ratio of 80-20 into train data and test data respectively.

In [101]:
train_target = train_df.iloc[:,2]
train_target = train_target.to_numpy().astype(np.float64)
train_target

array([0., 0., 0., ..., 0., 0., 0.])

In [102]:
train_target.shape

(978860,)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(train_set, train_target, test_size=0.2, random_state=42)

In [104]:
X_train.shape, test_set.shape

((783088, 84732), (306122, 84732))

# Multinomial Naive Bayes Classifier

In [105]:
classifier = MultinomialNB()
classifier.fit(X_train,y_train)      #training the model

y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)

In [106]:
print(classification_report(y_train,y_pred_train))   #Results
print(f1_score(y_train, y_pred_train))
print(accuracy_score(y_train, y_pred_train))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97    736911
         1.0       0.78      0.12      0.21     46177

    accuracy                           0.95    783088
   macro avg       0.87      0.56      0.59    783088
weighted avg       0.94      0.95      0.93    783088

0.2148689299527288
0.9463393641583066


In [107]:
print(classification_report(y_test,y_pred_test))   #Results
print(f1_score(y_test, y_pred_test))
print(accuracy_score(y_test, y_pred_test))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97    184128
         1.0       0.73      0.11      0.19     11644

    accuracy                           0.94    195772
   macro avg       0.84      0.55      0.58    195772
weighted avg       0.93      0.94      0.92    195772

0.1891669779604034
0.9445630631550987


In [108]:
y_pred = classifier.predict(test_set)
y_pred = y_pred.astype(int)

Test_DF_TARGET = pd.DataFrame(y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission.csv",index=False)

# Removing Biasness from Data
As the data is highly biased, the biasness in the data is removed by under-sampling. In this, the class which is in majority is reduced in size which is much smaller than the actua datapoints under that class.
In the train_df.csv file, we have roughly 930,000 questions labelled as spam and roughly 70,000 questions labelled as troll out of the totall 10,00,000 questions in the dataset. So the balanced dataset after the sampling contains approximately 100,000 questions which are marked as spam and 70,000 questions labelled as troll. We then shuffle the complete dataset and train our Multinomial Naive Bayes on this dataframe.

In [109]:
train_df

Unnamed: 0,qid,question_text,target,clean_questions,without_stopwords
0,dda0b0efc8ba86e81ec4,what are interesting facts about microsoft his...,0,what be interest facts about microsoft history ?,interest facts microsoft history ?
1,dc708b74a108d0fc0ad9,what are those things which are not gonna happ...,0,what be those things which be not gon na happe...,things gon na happen ?
2,06a27ec5d82dacd8bfe0,"what should i know to avoid being ""upsold"" whe...",0,what should i know to avoid be `` upsold '' wh...,know avoid `` upsold '' car brake change ?
3,00cbb6b17e3ceb7c5358,how i add any account with payment bank?,0,how i add any account with payment bank ?,add account payment bank ?
4,7c304888973a701585a0,which multi level marketing products are actua...,0,which multi level market products be actually ...,multi level market products actually worth pur...
...,...,...,...,...,...
999995,4bd96088d0b5f0f2c4f4,how is cse at vit chennai?,0,how be cse at vit chennai ?,cse vit chennai ?
999996,e80edbfc086f7125940f,"how can we prevent a holocaust by robots, ai, ...",0,"how can we prevent a holocaust by robots , ai ...","prevent holocaust robots , ai , alien ?"
999997,1506dfad6bd340782a1f,how can i help a student remember key steps an...,0,how can i help a student remember key step and...,help student remember key step information wri...
999998,b56c60fd407f2f85553c,what is the difference between lace closure & ...,0,what be the difference between lace closure & ...,difference lace closure & lace frontals ?


In [110]:
spam_df = train_df.loc[train_df.iloc[:, 2] == 0]
troll_df = train_df.loc[train_df.iloc[:, 2] == 1]
spam_df.shape, troll_df.shape

((921039, 5), (57821, 5))

In [111]:
transferdata_df = spam_df.sample(n = 100000, random_state = 42)
frames = [transferdata_df, troll_df]
balanced_df = pd.concat(frames)
balanced_df = balanced_df.sample(frac=1)

In [112]:
balanced_df

Unnamed: 0,qid,question_text,target,clean_questions,without_stopwords
704699,dc00684ae996ddf043eb,why is akhilesh yadav sending his goons behind...,1,why be akhilesh yadav send his goons behind wo...,akhilesh yadav send goons women rape defame yo...
129645,53d1c159e133c1886e33,how can i know if someone blocked me?,0,how can i know if someone block me ?,know block ?
454203,c29fe962a4b1e989f825,"why are japanese people, in any kind of non-ma...",1,"why be japanese people , in any kind of non-ma...","japanese people , kind non-mass transport traf..."
918584,fb1b817322f36573a4ff,what are some signs teachers only care about t...,0,what be some sign teachers only care about the...,sign teachers care money ?
819570,1dc05743320243136e2e,what do japanese people think of kinoshita yuka?,0,what do japanese people think of kinoshita yuka ?,japanese people think kinoshita yuka ?
...,...,...,...,...,...
285211,fd97b988137b41a2bb9a,why did socrates believe that self-knowledge w...,0,why do socrates believe that self-knowledge be...,socrates believe self-knowledge sign education ?
868135,d62a700003d811955029,how did christianity come to senegal?,0,how do christianity come to senegal ?,christianity come senegal ?
513297,1b67e0f79885dbe6ae27,i have a weeping 2 inch poison ivy wound on ne...,0,i have a weep 2 inch poison ivy wind on neck ?...,weep 2 inch poison ivy wind neck ? heal ?
956724,826949ec414241fa836f,how can i tell jennifer lawrence that i want t...,0,how can i tell jennifer lawrence that i want t...,tell jennifer lawrence want friend impossible ...


In [113]:
list_of_questions_bdf = balanced_df['without_stopwords'].to_list()
og_list_of_questions_bdf = balanced_df['clean_questions'].to_list()

list_of_questions_test = test_df['without_stopwords'].to_list()
og_list_of_questions_test = test_df['clean_questions'].to_list()

bow_transformer_bdf =  CountVectorizer()  

bow_transformer_bdf = bow_transformer_bdf.fit(list_of_questions_bdf)
bow_transformer_test = bow_transformer_bdf.fit(list_of_questions_test)

messages_bow_bdf = bow_transformer_bdf.transform(list_of_questions_bdf)
messages_bow_test = bow_transformer_test.transform(list_of_questions_test)

tfidf_transformer_bdf = TfidfTransformer().fit(messages_bow_bdf)  #Applying TF-ID to our questions
tfidf_transformer_test = TfidfTransformer().fit(messages_bow_test) 

In [114]:
train_set_bdf = tfidf_transformer_bdf.transform(messages_bow_bdf)
test_set_bdf = tfidf_transformer_test.transform(messages_bow_test) 

train_target_bdf = balanced_df.iloc[:,2]
train_target_bdf = train_target_bdf.to_numpy().astype(np.float64)

X_train_bdf, X_test_bdf, y_train_bdf, y_test_bdf = train_test_split(train_set_bdf, train_target_bdf, test_size=0.2, random_state=42)

classifier_unbiased = MultinomialNB()
classifier_unbiased.fit(X_train_bdf,y_train_bdf)      #training the model


y_pred_train = classifier_unbiased.predict(X_train_bdf)
y_pred_test = classifier_unbiased.predict(X_test_bdf)

In [115]:
print(classification_report(y_test_bdf,y_pred_test))   #Results
print(f1_score(y_test_bdf, y_pred_test))
print(accuracy_score(y_test_bdf, y_pred_test))

              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89     20007
         1.0       0.82      0.78      0.80     11558

    accuracy                           0.86     31565
   macro avg       0.85      0.84      0.85     31565
weighted avg       0.86      0.86      0.86     31565

0.8010657193605684
0.8580706478694757


In [116]:
print(classification_report(y_train_bdf,y_pred_train))   #Results
print(f1_score(y_train_bdf, y_pred_train))
print(accuracy_score(y_train_bdf, y_pred_train))

              precision    recall  f1-score   support

         0.0       0.89      0.92      0.90     79993
         1.0       0.85      0.81      0.83     46263

    accuracy                           0.88    126256
   macro avg       0.87      0.86      0.87    126256
weighted avg       0.88      0.88      0.88    126256

0.8280732827807605
0.8769880243315169


In [117]:
X_train_bdf.shape, X_test_bdf.shape

((126256, 84732), (31565, 84732))

In [118]:
test_set

<306122x84732 sparse matrix of type '<class 'numpy.float64'>'
	with 1723409 stored elements in Compressed Sparse Row format>

In [119]:
y_pred = classifier_unbiased.predict(test_set_bdf)
y_pred = y_pred.astype(int)

Test_DF_TARGET = pd.DataFrame(y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission_unbiased_mnb.csv",index=False)

# Logistic Regression

In [120]:
from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    model = LogisticRegression(C=_C).fit(X_tr, y_tr)
    score = model.score(X_test, y_test)
    y_pred = model.predict(test_set_bdf)
    print('Test Score with', description, 'features', score)
    return model

In [121]:
model_tfidf = simple_logistic_classify(X_train_bdf, y_train_bdf, X_test_bdf, y_test_bdf, 'tf-idf')

Test Score with tf-idf features 0.8753999683193411


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [122]:
y_pred = model_tfidf.predict(test_set_bdf)
y_pred = y_pred.astype(int)

Test_DF_TARGET = pd.DataFrame(y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission_lr.csv",index=False)

# Logistic Regression with Hyperparameter Tuning

In [123]:
import sklearn.model_selection
param_grid_ = {'C': [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2]}
bow_search = sklearn.model_selection.GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid_)
tfidf_search = sklearn.model_selection.GridSearchCV(LogisticRegression(), cv=10, param_grid=param_grid_)

In [124]:
tfidf_search.fit(X_train_bdf, y_train_bdf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [125]:
tfidf_search.best_score_

0.8750475633338718

In [126]:
tfidf_search.best_params_

{'C': 10.0}

In [127]:
y_pred = tfidf_search.predict(test_set_bdf)
y_pred = y_pred.astype(int)

Test_DF_TARGET = pd.DataFrame(y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission_lr_hyper.csv",index=False)

# Support Vector Machine

In [128]:
from sklearn.svm import SVC

model = SVC()
model.fit(X_train_bdf,y_train_bdf)

In [129]:
predictions = model.predict(X_train_bdf)

In [130]:
print(classification_report(y_train_bdf,predictions))   #Results
print(f1_score(y_train_bdf, predictions))
print(accuracy_score(y_train_bdf, predictions))

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97     79993
         1.0       0.95      0.95      0.95     46263

    accuracy                           0.96    126256
   macro avg       0.96      0.96      0.96    126256
weighted avg       0.96      0.96      0.96    126256

0.9487077019451976
0.9623859460144468


In [131]:
predictions_test = model.predict(X_test_bdf)

In [132]:
print(classification_report(y_test_bdf,predictions_test))   #Results
print(f1_score(y_test_bdf, predictions_test))
print(accuracy_score(y_test_bdf, predictions_test))

              precision    recall  f1-score   support

         0.0       0.90      0.93      0.91     20007
         1.0       0.86      0.81      0.84     11558

    accuracy                           0.88     31565
   macro avg       0.88      0.87      0.87     31565
weighted avg       0.88      0.88      0.88     31565

0.8380680551219729
0.8849675273245684


In [133]:
y_pred = model.predict(test_set_bdf)
y_pred = y_pred.astype(int)

Test_DF_TARGET = pd.DataFrame(y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission_svm.csv",index=False)

# SVM with hyperparameter tuning

In [134]:
""" from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train_bdf, y_train_bdf) """

" from sklearn.model_selection import GridSearchCV\n  \n# defining parameter range\nparam_grid = {'C': [0.1, 1, 10, 100, 1000], \n              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n              'kernel': ['rbf']} \n  \ngrid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)\n  \n# fitting the model for grid search\ngrid.fit(X_train_bdf, y_train_bdf) "

In [135]:
""" grid_predictions_test = grid.predict(X_test_bdf)
print(classification_report(y_test_bdf, grid_predictions_test))
print(f1_score(y_test_bdf, grid_predictions_test))
print(accuracy_score(y_test_bdf, grid_predictions_test)) """

' grid_predictions_test = grid.predict(X_test_bdf)\nprint(classification_report(y_test_bdf, grid_predictions_test))\nprint(f1_score(y_test_bdf, grid_predictions_test))\nprint(accuracy_score(y_test_bdf, grid_predictions_test)) '

In [136]:
""" grid_predictions_train = grid.predict(X_train_bdf)
print(classification_report(y_train_bdf, grid_predictions_train))
print(f1_score(y_train_bdf, grid_predictions_train))
print(accuracy_score(y_train_bdf, grid_predictions_train)) """

' grid_predictions_train = grid.predict(X_train_bdf)\nprint(classification_report(y_train_bdf, grid_predictions_train))\nprint(f1_score(y_train_bdf, grid_predictions_train))\nprint(accuracy_score(y_train_bdf, grid_predictions_train)) '

In [137]:
""" y_pred = grid.predict(test_set_bdf)
y_pred = y_pred.astype(int)

Test_DF_TARGET = pd.DataFrame(y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission_svm_hyper.csv",index=False) """

' y_pred = grid.predict(test_set_bdf)\ny_pred = y_pred.astype(int)\n\nTest_DF_TARGET = pd.DataFrame(y_pred,columns=[\'target\'])\nTEST_DF_QID = pd.DataFrame(test_df ,columns=[\'qid\'])\nTEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join=\'inner\')\nTEST_DF.to_csv("sample_submission_svm_hyper.csv",index=False) '