In [88]:
import numpy as np
import pandas as pd

In [107]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sub.csv')
sub

Unnamed: 0,Title
0,Sudden anxiety about work
1,Need advice
2,I don’t feel like myself.
3,Panic attacks
4,Stress
...,...
294,I’ve pushed my closest friend away
295,Thinking about this event makes me overthink a...
296,Symptoms yet I feel no anxiety?
297,Does anyone else struggle with “celebrating” t...


In [90]:
import re
import string
import nltk
from nltk.corpus import stopwords

In [91]:
def rmv_spl_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Applying the cleaning function to both test and training datasets
train['Title'] = train['Title'].apply(lambda x: rmv_spl_text(x))
test['Title'] = test['Title'].apply(lambda x: rmv_spl_text(x))

# Let's take a look at the updated text
train['Title'].head()

0                                       clonazepam  mg
1    i am just now trying to get a handle on my anx...
2                  panic attacks are back after years 
3               terrified for tooth extractionsedation
4           question regarding first therapist meeting
Name: Title, dtype: object

In [92]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

In [93]:
toknzr = nltk.tokenize.RegexpTokenizer(r'\w+')
train['Title'] = train['Title'].apply(lambda x: toknzr.tokenize(x))
test['Title'] = test['Title'].apply(lambda x: toknzr.tokenize(x))
train['Title'].head()

0                                     [clonazepam, mg]
1    [i, am, just, now, trying, to, get, a, handle,...
2            [panic, attacks, are, back, after, years]
3          [terrified, for, tooth, extractionsedation]
4     [question, regarding, first, therapist, meeting]
Name: Title, dtype: object

In [94]:
def rmv_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words


train['Title'] = train['Title'].apply(lambda x : rmv_stopwords(x))
test['Title'] = test['Title'].apply(lambda x : rmv_stopwords(x))
train.head()

Unnamed: 0,Title,Target
0,"[clonazepam, mg]",0
1,"[trying, get, handle, anxiety, im, looking, ad...",1
2,"[panic, attacks, back, years]",0
3,"[terrified, tooth, extractionsedation]",0
4,"[question, regarding, first, therapist, meeting]",0


In [95]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

train['Title'] = train['Title'].apply(lambda x : combine_text(x))
test['Title'] = test['Title'].apply(lambda x : combine_text(x))
train['Title']
train.head()

Unnamed: 0,Title,Target
0,clonazepam mg,0
1,trying get handle anxiety im looking advice,1
2,panic attacks back years,0
3,terrified tooth extractionsedation,0
4,question regarding first therapist meeting,0


In [66]:
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train['Title'])
test_vectors = count_vectorizer.transform(test["Title"])

## Keeping only non-zero elements to preserve space 
print(train_vectors[0].todense())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [67]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train['Title'])
test_tfidf = tfidf.transform(test["Title"])

In [102]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, train["Target"], cv=5, scoring="f1")
scores

array([0.92307692, 1.        , 0.94117647, 1.        , 0.75      ])

In [69]:
clf.fit(train_vectors, train["Target"])

LogisticRegression()

In [70]:
clf_tfidf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf_tfidf, train_tfidf, train["Target"], cv=5, scoring="f1")
scores

array([0.25      , 0.44444444, 0.54545455, 0.4       , 0.61538462])

In [71]:
# Fitting a simple Naive Bayes on Counts
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB, train_vectors, train["Target"], cv=5, scoring="f1")
scores

array([0.92307692, 1.        , 0.94117647, 0.94117647, 0.60869565])

In [72]:
clf_NB.fit(train_vectors, train["Target"])

MultinomialNB()

In [73]:
# Fitting a simple Naive Bayes on TFIDF
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB_TFIDF, train_tfidf, train["Target"], cv=10, scoring="f1")
scores

array([0.8       , 1.        , 1.        , 1.        , 0.88888889,
       1.        , 1.        , 0.85714286, 0.88888889, 0.66666667])

In [96]:
final = sub

In [97]:
def rmv_spl_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Applying the cleaning function to both test and training datasets
final['Title'] = final['Title'].apply(lambda x: rmv_spl_text(x))

# Let's take a look at the updated text
final['Title'].head()

0    sudden anxiety about work
1                  need advice
2     i don’t feel like myself
3                panic attacks
4                       stress
Name: Title, dtype: object

In [98]:
toknzr = nltk.tokenize.RegexpTokenizer(r'\w+')
final['Title'] = final['Title'].apply(lambda x: toknzr.tokenize(x))
final['Title'].head()

0     [sudden, anxiety, about, work]
1                     [need, advice]
2    [i, don, t, feel, like, myself]
3                   [panic, attacks]
4                           [stress]
Name: Title, dtype: object

In [99]:
def rmv_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words


final['Title'] = final['Title'].apply(lambda x : rmv_stopwords(x))
final.head()

Unnamed: 0,Title
0,"[sudden, anxiety, work]"
1,"[need, advice]"
2,"[feel, like]"
3,"[panic, attacks]"
4,[stress]


In [100]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

final['Title'] = final['Title'].apply(lambda x : combine_text(x))
final.head()

Unnamed: 0,Title
0,sudden anxiety work
1,need advice
2,feel like
3,panic attacks
4,stress


In [79]:
count_vectorizer = CountVectorizer()
final_vectors = count_vectorizer.fit_transform(final['Title'])

## Keeping only non-zero elements to preserve space 
print(final_vectors[0].todense())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [101]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train['Title'])
final = tfidf.transform(final["Title"])

In [108]:
clf.fit(train_tfidf, train["Target"])

LogisticRegression()

In [109]:
sub["target"] = clf.predict(final)
sub.to_csv('C:/Users/Admin/submission.csv',index=True)

In [110]:
sub

Unnamed: 0,Title,target
0,Sudden anxiety about work,1
1,Need advice,0
2,I don’t feel like myself.,0
3,Panic attacks,0
4,Stress,0
...,...,...
294,I’ve pushed my closest friend away,0
295,Thinking about this event makes me overthink a...,0
296,Symptoms yet I feel no anxiety?,0
297,Does anyone else struggle with “celebrating” t...,0
