In [22]:
import pandas as pd
import numpy as np

In [23]:
import nltk
import string

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import cross_validate
from nltk.corpus import wordnet

In [25]:
df = pd.read_csv('Train.csv',index_col='ID')#train
df2 = pd.read_csv('Test.csv',index_col='ID')#test
df3 = pd.read_csv('SampleSubmission.csv')#submission

In [26]:
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')


def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)



In [27]:
# Lemmatizing test
df['text'] = df['text'].apply(lambda x: lemmatize_sentence(x))
df.head()

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
SUAVK39Z,I feel that it be well I dieAm happy,Depression
9JDAGUV3,Why do I get hallucination ?,Drugs
419WR1LQ,I be stresseed due to lack of financial suppor...,Depression
6UY7DX6Q,Why be life important ?,Suicide
FYC0FTFB,How could I be help to go through the depressi...,Depression


In [28]:
# Lemmatizing test
df2['text'] = df2['text'].apply(lambda x: lemmatize_sentence(x))
df2.head()

Unnamed: 0_level_0,text
ID,Unnamed: 1_level_1
02V56KMO,How to overcome bad feeling and emotion
03BMGTOK,I feel like give up in life
03LZVFM6,I be so depressed feel like get no strength to...
0EPULUM5,I feel so low especially since I have no one t...
0GM4C5GD,can i be successful when I be a drug addict ?


In [29]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [30]:
df.dtypes

text     object
label    object
dtype: object

In [31]:
df.describe()

Unnamed: 0,text,label
count,616,616
unique,600,4
top,How can I stop use alcohol ?,Depression
freq,4,352


In [32]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [33]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process,)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB(fit_prior=False)),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [34]:
pipeline.fit(df['text'],df['label'])

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7f4df2da34d0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))],
         verbose=False)

In [35]:
predictions = pipeline.predict_proba(df2['text'])

In [36]:
df2['predictions']  = predictions

ValueError: Wrong number of items passed 4, placement implies 1

In [37]:
df2.iloc[0:90]

Unnamed: 0_level_0,text
ID,Unnamed: 1_level_1
02V56KMO,How to overcome bad feeling and emotion
03BMGTOK,I feel like give up in life
03LZVFM6,I be so depressed feel like get no strength to...
0EPULUM5,I feel so low especially since I have no one t...
0GM4C5GD,can i be successful when I be a drug addict ?
...,...
A1QFJ71D,I be depress when I get pregnant
A54FELHT,Is bhang harmful to my health ? If I can stop ...
A8F3BDON,When be the right age to consume alcohol
AFY1KE95,"Hopelessnesss , feel like give up"


In [38]:
df4 = df2.drop(['text'],axis=1)

In [39]:
df4 = pd.get_dummies(df4['predictions'])

KeyError: 'predictions'

In [17]:
df4 = df4.reindex(['Depression', 'Alcohol', 'Suicide', 'Drugs'],axis=1)

NameError: name 'df4' is not defined

In [18]:
df4.head()

NameError: name 'df4' is not defined

In [19]:
df4.to_csv('NaivesBayes.csv')

NameError: name 'df4' is not defined