# Library

In [1]:
import numpy as np
import pandas as pd


# Data Analysis

In [2]:
df=pd.read_csv('train.csv')
df.sample(10)

Unnamed: 0,qid,question_text,target
527811,675664f7a0504d77cb04,Is there any way to find which domain names ar...,0
1290492,fced0a665943e4b61cf3,What is the method to find the best way of eat...,0
1056114,cef1c1d664438bfd2f7e,Why can't people who are habitually late to sc...,0
1045492,ccdfc73740c5fcf57405,Whom should I chose my one or my bestfriend wh...,0
174722,222a4bb021af99f2937f,Where can I study project management in Bangla...,0
357863,4624bd3ab45349ece8b9,What are the best things for St. Xavier's Coll...,0
823782,a16d60a4a966db6d841b,Can a mechanical diploma guy get admission in ...,0
836010,a3d428818d6cb5408025,What is the best tasting food that comes from ...,0
936036,b7702443cf49f0502469,What is the taxonomy of the Glaucus atlanticus?,0
908877,b217232adf420144843f,Does emotional affair mean that you no longer ...,0


In [3]:
sincere_df=df[df['target']==0]
insincere_df=df[df['target']==1]

In [8]:
print(sincere_df.sample(10).question_text[:5].values)

['A normal distribution has a standard devuation of 6 and 80 percent of the scores lie to the right of 89. What is the mean?'
 'What is recombinant DNA?'
 'What are the Toefl and GMAT scores for Concours International SAI application?'
 'Why is it that companies hire people for surveys to get "statistics", yet the majority of the participants are liars?'
 'What would you tell people who are afraid of working in Japan due to radiation?']


In [7]:
print(insincere_df.sample(10).question_text[:5].values)

["We try to exercize our right to freedom, so we go and edit a post on Quora, Quora goes and undoes it. Isn't Quora one of the most annoying website ever (and FaceB the most)?"
 'What data does Kimmel rely upon? We know what data the president relied upon.'
 "Has anyone been in the 'so called hospital' in England where the floors are covered in blood through all the stabbings? Donald Trump seems to know."
 'Would you rather have sex with a sheep or never have sex again?'
 "Why didn't Lynyrd Skynyrd give Pat Buchanan more saxophone solos?"]


In [9]:
df['target'].value_counts(normalize=True)*100

target
0    93.812982
1     6.187018
Name: proportion, dtype: float64

In [10]:
test_df=pd.read_csv('test.csv')

In [14]:
test_df.sample(10).question_text[0:5].values

array(['How can functional programming influence hardware design?',
       'Why is Bitcoin the top coin when the transfers take longer and cost more and more?',
       'Which colleges can I get with a rank of 7096 in JEE main paper 2?',
       'How can I challenge the medicine systems of the world?',
       'Have you seen a miserable couple in real life but happy online?'],
      dtype=object)

# Sampling

In [15]:
sample_size= 100_000

In [16]:
sample_df=df.sample(sample_size, random_state=43)

In [17]:
sample_df

Unnamed: 0,qid,question_text,target
958520,bbc9dcf912efd1b70f0a,What does dying from an OD of carfentanil feel...,0
588030,7330209592ec397abb9f,Abigail ratchford Had porn movies?,0
825596,a1cc2949730d82666dbe,Why is it that degree certificates do not expi...,0
893635,af174f06fbc5fbd8f48c,"Who would win in a fight, Goku or Darkseid?",0
15102,02f6d198ae39469f52f9,What does grandpa Jim means when he tells his ...,0
...,...,...,...
755081,93ef9dcdab9de4c26db4,What are the symptoms of lovd?,0
264480,33c1b743a07b9337b7e3,If someone is traveling lets say via Mumbai an...,0
701593,8960100a691687be4e28,Is it gay if you just touch your tips together?,0
695687,883f8316bdbef0bc247a,Im going to be the MC for my school show this ...,0


# Preprocessing

## Tokenization

In [18]:
q0= sincere_df.question_text.values[1]
q0

'Do you have an adopted dog, how would you encourage people to adopt and not shop?'

In [19]:
q1= insincere_df.question_text.values[1]
q1

'Which babies are more sweeter to their parents? Dark skin babies or light skin babies?'

In [20]:
import nltk
from nltk.tokenize import word_tokenize

In [21]:
word_tokenize(q0)

['Do',
 'you',
 'have',
 'an',
 'adopted',
 'dog',
 ',',
 'how',
 'would',
 'you',
 'encourage',
 'people',
 'to',
 'adopt',
 'and',
 'not',
 'shop',
 '?']

In [22]:
word_tokenize(q1)

['Which',
 'babies',
 'are',
 'more',
 'sweeter',
 'to',
 'their',
 'parents',
 '?',
 'Dark',
 'skin',
 'babies',
 'or',
 'light',
 'skin',
 'babies',
 '?']

In [23]:
q0_tok = word_tokenize(q0)
q1_tok=word_tokenize(q1)

## Stop Word Removal

In [24]:
from nltk.corpus import stopwords

In [26]:
english_stopwords= stopwords.words('english')

In [27]:
print(", ".join(english_stopwords))

i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, must

In [28]:
def removd_stopwords(tokens):
    return [word for word in tokens if word.lower() not in english_stopwords]

In [29]:
q0, q0_tok

('Do you have an adopted dog, how would you encourage people to adopt and not shop?',
 ['Do',
  'you',
  'have',
  'an',
  'adopted',
  'dog',
  ',',
  'how',
  'would',
  'you',
  'encourage',
  'people',
  'to',
  'adopt',
  'and',
  'not',
  'shop',
  '?'])

In [30]:
q0_stp= removd_stopwords(q0_tok)
q0_stp

['adopted', 'dog', ',', 'would', 'encourage', 'people', 'adopt', 'shop', '?']

In [31]:
q1_stp= removd_stopwords(q1_tok)
q1_stp

['babies',
 'sweeter',
 'parents',
 '?',
 'Dark',
 'skin',
 'babies',
 'light',
 'skin',
 'babies',
 '?']

## Stemming

In [32]:
from nltk.stem.snowball import SnowballStemmer

In [33]:
stemmer= SnowballStemmer(language='english')

In [34]:
stemmer.stem('going'), stemmer.stem('supposedly'), stemmer.stem('done'), stemmer.stem('doing'), stemmer.stem('do')

('go', 'suppos', 'done', 'do', 'do')

In [35]:
q0_stem= [stemmer.stem(word) for word in q0_stp]
print(q0_stp)
q0_stem

['adopted', 'dog', ',', 'would', 'encourage', 'people', 'adopt', 'shop', '?']


['adopt', 'dog', ',', 'would', 'encourag', 'peopl', 'adopt', 'shop', '?']

In [36]:
q1_stem= [stemmer.stem(word) for word in q1_stp]
print(q1_stp)
q1_stem

['babies', 'sweeter', 'parents', '?', 'Dark', 'skin', 'babies', 'light', 'skin', 'babies', '?']


['babi',
 'sweeter',
 'parent',
 '?',
 'dark',
 'skin',
 'babi',
 'light',
 'skin',
 'babi',
 '?']

## Lemmitization

In [37]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [38]:
lemmatizer=WordNetLemmatizer()

In [39]:
q0_lem= [lemmatizer.lemmatize(word) for word in q0_stp]
print("Ques: ", q0, "\n", "Stopword Removed: ", q0_stp, "\n" , "Ques Stemmed: ", q0_stem, "\n" , "Ques Lemmatized: ", q0_lem)

Ques:  Do you have an adopted dog, how would you encourage people to adopt and not shop? 
 Stopword Removed:  ['adopted', 'dog', ',', 'would', 'encourage', 'people', 'adopt', 'shop', '?'] 
 Ques Stemmed:  ['adopt', 'dog', ',', 'would', 'encourag', 'peopl', 'adopt', 'shop', '?'] 
 Ques Lemmatized:  ['adopted', 'dog', ',', 'would', 'encourage', 'people', 'adopt', 'shop', '?']


In [40]:
q1_lem= [lemmatizer.lemmatize(word) for word in q1_stp]
print("Ques: ", q1, "\n", "Stopword Removed: ", q1_stp, "\n" , "Ques Stemmed: ", q1_stem, "\n" , "Ques Lemmatized: ", q1_lem)

Ques:  Which babies are more sweeter to their parents? Dark skin babies or light skin babies? 
 Stopword Removed:  ['babies', 'sweeter', 'parents', '?', 'Dark', 'skin', 'babies', 'light', 'skin', 'babies', '?'] 
 Ques Stemmed:  ['babi', 'sweeter', 'parent', '?', 'dark', 'skin', 'babi', 'light', 'skin', 'babi', '?'] 
 Ques Lemmatized:  ['baby', 'sweeter', 'parent', '?', 'Dark', 'skin', 'baby', 'light', 'skin', 'baby', '?']


# Bag of Words

## Create a Vocabolary

In [41]:
small_df= sample_df[:5]
small_df.question_text.values

array(['What does dying from an OD of carfentanil feel like?',
       'Abigail ratchford Had porn movies?',
       'Why is it that degree certificates do not expire but IELTS/ TOEFL test results expires after few years?',
       'Who would win in a fight, Goku or Darkseid?',
       'What does grandpa Jim means when he tells his granddaughter that" only fools rush in where angels fear to tread? Explain what lesson scary stories can help teach'],
      dtype=object)

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
small_vec=CountVectorizer()

In [44]:
small_vec.fit(small_df.question_text)

In [45]:
small_vec.vocabulary_

{'what': 56,
 'does': 11,
 'dying': 12,
 'from': 21,
 'an': 2,
 'od': 39,
 'of': 40,
 'carfentanil': 6,
 'feel': 17,
 'like': 35,
 'abigail': 0,
 'ratchford': 44,
 'had': 25,
 'porn': 43,
 'movies': 37,
 'why': 60,
 'is': 31,
 'it': 32,
 'that': 52,
 'degree': 9,
 'certificates': 7,
 'do': 10,
 'not': 38,
 'expire': 13,
 'but': 4,
 'ielts': 29,
 'toefl': 54,
 'test': 51,
 'results': 45,
 'expires': 14,
 'after': 1,
 'few': 18,
 'years': 63,
 'who': 59,
 'would': 62,
 'win': 61,
 'in': 30,
 'fight': 19,
 'goku': 22,
 'or': 42,
 'darkseid': 8,
 'grandpa': 24,
 'jim': 33,
 'means': 36,
 'when': 57,
 'he': 26,
 'tells': 50,
 'his': 28,
 'granddaughter': 23,
 'only': 41,
 'fools': 20,
 'rush': 46,
 'where': 58,
 'angels': 3,
 'fear': 16,
 'to': 53,
 'tread': 55,
 'explain': 15,
 'lesson': 34,
 'scary': 47,
 'stories': 48,
 'can': 5,
 'help': 27,
 'teach': 49}

In [46]:
small_vec.get_feature_names_out()

array(['abigail', 'after', 'an', 'angels', 'but', 'can', 'carfentanil',
       'certificates', 'darkseid', 'degree', 'do', 'does', 'dying',
       'expire', 'expires', 'explain', 'fear', 'feel', 'few', 'fight',
       'fools', 'from', 'goku', 'granddaughter', 'grandpa', 'had', 'he',
       'help', 'his', 'ielts', 'in', 'is', 'it', 'jim', 'lesson', 'like',
       'means', 'movies', 'not', 'od', 'of', 'only', 'or', 'porn',
       'ratchford', 'results', 'rush', 'scary', 'stories', 'teach',
       'tells', 'test', 'that', 'to', 'toefl', 'tread', 'what', 'when',
       'where', 'who', 'why', 'win', 'would', 'years'], dtype=object)

## Transformation

In [47]:
vectors= small_vec.transform(small_df.question_text)
vectors.toarray(), vectors.shape

(array([[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
         0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 

## Configuring Count Vectorizer Parameters

In [48]:
stemmer = SnowballStemmer(language='english')

In [49]:
def tokenize(text):
    return [stemmer.stem(word) for word in word_tokenize(text) if word.lower() not in english_stopwords]

In [50]:
tokenize('this just, a texting lot feeling doing of ; punc and often.. stop words')

[',', 'text', 'lot', 'feel', ';', 'punc', 'often', '..', 'stop', 'word']

In [51]:
vectorizer = CountVectorizer(lowercase=True, tokenizer=tokenize, max_features=1000)

In [52]:
%%time
vectorizer.fit(sample_df.question_text)



CPU times: total: 30.6 s
Wall time: 46.4 s


In [53]:
len(vectorizer.vocabulary_)

1000

In [54]:
vectorizer.get_feature_names_out()[0:100]

array(['!', '$', '%', '&', "'", "''", "'m", "'s", '(', ')', ',', '-', '.',
       '1', '10', '100', '11', '12', '12th', '15', '2', '20', '2017',
       '2018', '3', '30', '4', '5', '50', '6', '7', '8', ':', '?', '[',
       ']', '``', 'abl', 'abus', 'accept', 'access', 'accomplish',
       'accord', 'account', 'achiev', 'act', 'action', 'activ', 'actor',
       'actual', 'ad', 'add', 'admiss', 'adult', 'advanc', 'advantag',
       'advic', 'affect', 'africa', 'african', 'age', 'ago', 'air',
       'allow', 'alon', 'alreadi', 'also', 'altern', 'alway', 'amazon',
       'america', 'american', 'among', 'amount', 'analysi', 'android',
       'anim', 'anoth', 'answer', 'anyon', 'anyth', 'apart', 'app',
       'appear', 'appl', 'appli', 'applic', 'approach', 'arab', 'area',
       'armi', 'around', 'art', 'asian', 'ask', 'atheist', 'attack',
       'attend', 'attract', 'australia'], dtype=object)

In [55]:
%%time
inputs=vectorizer.transform(sample_df.question_text)

CPU times: total: 30.9 s
Wall time: 45.4 s


In [56]:
inputs.shape

(100000, 1000)

In [57]:
sample_df.question_text.values[0]

'What does dying from an OD of carfentanil feel like?'

In [58]:
%%time
test_inputs=vectorizer.transform(test_df.question_text)

CPU times: total: 1min 39s
Wall time: 2min 42s


## Spliting

In [59]:
from sklearn.model_selection import train_test_split

In [60]:
train_df, val_df, train_target, val_target= train_test_split(inputs, sample_df['target'], test_size=0.2, random_state=42)

In [61]:
train_df.shape, train_target.shape, val_df.shape, val_target.shape

((80000, 1000), (80000,), (20000, 1000), (20000,))

# Training Model

## Model

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
max_itterations=1000

In [64]:
model=LogisticRegression(max_iter=max_itterations, solver='sag')

In [65]:
%%time
model.fit(train_df, train_target)

CPU times: total: 7.66 s
Wall time: 14.5 s


## Making Predictions Using Model

In [66]:
train_pred= model.predict(train_df)

In [67]:
pd.Series(train_pred).value_counts()

0    77921
1     2079
Name: count, dtype: int64

In [68]:
pd.Series(train_target).value_counts()

target
0    75121
1     4879
Name: count, dtype: int64

In [69]:
from sklearn.metrics import accuracy_score, f1_score

In [70]:
accuracy_score(train_target,train_pred)

0.9481

In [73]:
random_pred= np.random.choice((0,1), len(train_target))
f1_score(train_target,train_pred), f1_score(train_target,random_pred)

(0.40327680367921814, 0.11011391866789015)

In [74]:
val_pred= model.predict(val_df)

In [75]:
accuracy_score(val_target,val_pred)

0.94425

In [76]:
f1_score(val_target,val_pred)

0.3725379853685988

In [77]:
model.predict(vectorizer.transform(sincere_df.question_text.values[:10]))

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [78]:
model.predict(vectorizer.transform(insincere_df.question_text.values[:10]))

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

# Predicting on Kaggle Test Set

In [80]:
test_pred = model.predict(test_inputs)

In [81]:
sub_df= pd.read_csv('sample_submission.csv')

In [82]:
sub_df

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,0
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
...,...,...
375801,ffff7fa746bd6d6197a9,0
375802,ffffa1be31c43046ab6b,0
375803,ffffae173b6ca6bfa563,0
375804,ffffb1f7f1a008620287,0


In [83]:
sub_df.prediction=test_pred

In [85]:
sub_df.prediction.value_counts()

prediction
0    366091
1      9715
Name: count, dtype: int64

In [87]:
sub_df.to_csv('submission.csv', index=None)