In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk
import string

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import cross_validate
from nltk.corpus import wordnet

In [4]:
# Training and test sets are both indexed by their 'ID' column.
df, df2 = (pd.read_csv(csv_name, index_col='ID') for csv_name in ('Train.csv', 'Test.csv'))

# Submission template, read with the default integer index.
df3 = pd.read_csv('SampleSubmission.csv')

In [5]:
# Preview the first rows of the training frame (columns: text, label).
df.head()

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
SUAVK39Z,I feel that it was better I dieAm happy,Depression
9JDAGUV3,Why do I get hallucinations?,Drugs
419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
6UY7DX6Q,Why is life important?,Suicide
FYC0FTFB,How could I be helped to go through the depres...,Depression


In [6]:
# Lemmatizer instance used by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()

# Second lemmatizer instance and the English stop-word list; neither name is
# referenced by the cells visible in this part of the notebook.
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
stop = stopwords.words('english')


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns None when the tag starts with none of those letters, which
    callers can use as a signal to leave the token untouched.
    """
    # Guard clauses, checked in the same order as the prefixes listed above.
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # No WordNet equivalent for this tag.
    return None

def lemmatize_sentence(sentence):
    """Return `sentence` with each token replaced by its lemma.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet part of speech are lemmatized with that part of speech; all
    other tokens are kept as they are. The resulting tokens are joined with a
    single space, so punctuation ends up separated from the preceding word
    (e.g. "hallucination ?").
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, pos_tag in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # Without a WordNet POS the token passes through unchanged.
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))

    return " ".join(lemmas)



In [7]:
# Lemmatize the training text.
# The 'text' column is overwritten in place, so re-running this cell feeds
# already-lemmatized text through the lemmatizer again.
df['text'] = df['text'].apply(lemmatize_sentence)
df.head()

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
SUAVK39Z,I feel that it be well I dieAm happy,Depression
9JDAGUV3,Why do I get hallucination ?,Drugs
419WR1LQ,I be stresseed due to lack of financial suppor...,Depression
6UY7DX6Q,Why be life important ?,Suicide
FYC0FTFB,How could I be help to go through the depressi...,Depression


In [8]:
# Lemmatize the test text with the same function applied to the training set.
# The 'text' column is overwritten in place.
df2['text'] = df2['text'].apply(lemmatize_sentence)
df2.head()

Unnamed: 0_level_0,text
ID,Unnamed: 1_level_1
02V56KMO,How to overcome bad feeling and emotion
03BMGTOK,I feel like give up in life
03LZVFM6,I be so depressed feel like get no strength to...
0EPULUM5,I feel so low especially since I have no one t...
0GM4C5GD,can i be successful when I be a drug addict ?


In [9]:
# Count missing values per column of the training frame.
df.isnull().sum()

text     0
label    0
dtype: int64

In [10]:
# Show the dtype of each training column.
df.dtypes

text     object
label    object
dtype: object

In [11]:
# Summary statistics (count / unique / top / freq) for the text and label columns.
df.describe()

Unnamed: 0,text,label
count,616,616
unique,600,4
top,How can I stop use alcohol ?,Depression
freq,4,352


In [12]:
def text_process(mess):
    """
    Strip punctuation and English stop words from a piece of text.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character found in string.punctuation)
    2. Split on whitespace and remove all stopwords (compared in lower case
       against NLTK's English stop-word list)
    3. Returns a list of the remaining words, with their original casing
    """
    # Build the stop-word lookup once per call. Previously the corpus list was
    # re-requested for every single word and then searched linearly; a set
    # gives the same membership answers with constant-time lookups.
    stop_words = set(stopwords.words('english'))

    # Drop punctuation characters and rebuild the string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Keep only the words that are not stop words.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [13]:
from sklearn.preprocessing import LabelBinarizer

#binarizer = LabelBinarizer()
#n=pd.DataFrame(binarizer.fit_transform(train['label']), columns=binarizer.classes_)

# Learn the set of labels and encode the target as an indicator matrix
# (one column per class) in a single step.
binarizer = LabelBinarizer()
y = binarizer.fit_transform(df['label'])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,          # ignore terms present in more than 80% of the documents
    max_features=10000,  # cap the vocabulary at the 10000 most frequent terms
)

In [16]:

# Hold out 20% of the (lemmatized) training text for validation.
# The fixed random_state makes the split reproducible across runs.
xtrain, xval, ytrain, yval = train_test_split(
    df['text'],
    y,
    test_size=0.2,
    random_state=9,
)

In [17]:
# create TF-IDF features
# The vectorizer is fitted on the training split only; the validation split is
# merely transformed with that fitted vocabulary so it does not influence it.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [18]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import log_loss

In [19]:
# Logistic regression with default hyperparameters, wrapped in a one-vs-rest
# meta-classifier that is trained against the binarized label matrix.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [20]:
# fit model on train data
# (TF-IDF features of the training split against the binarized labels)
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [21]:
# make predictions for validation set
# Each prediction is a row of 0/1 indicators, one entry per label column.
y_pred = clf.predict(xval_tfidf)

In [22]:
# Inspect the predicted indicator row for the fourth validation sample.
y_pred[3]

array([0, 1, 0, 0])

In [26]:
# predict probabilities
# One column per label. With one-vs-rest on an indicator target each column is
# estimated independently, so a row does not necessarily sum to 1.
y_pred_prob = clf.predict_proba(xval_tfidf)

In [27]:
# Log loss of the predicted probabilities on the validation split (lower is better).
log_loss(yval,y_pred_prob)

0.6222489472051234

In [29]:
# Display the predicted probability matrix for the validation split.
# NOTE(review): this prints every validation row; slicing (e.g. y_pred_prob[:5])
# would keep the cell output short.
y_pred_prob

array([[0.09322946, 0.83646601, 0.04806058, 0.07067131],
       [0.48411136, 0.07651113, 0.48121074, 0.03876905],
       [0.14194427, 0.6041095 , 0.0532301 , 0.18630373],
       [0.06850868, 0.85584826, 0.05169354, 0.08590489],
       [0.13596486, 0.60872593, 0.09766865, 0.12068262],
       [0.09086787, 0.76742856, 0.0776317 , 0.08687139],
       [0.20449209, 0.61327301, 0.06547885, 0.0844596 ],
       [0.17924914, 0.19705163, 0.44903528, 0.08695028],
       [0.12499228, 0.67627207, 0.07990107, 0.10564281],
       [0.0446266 , 0.93891269, 0.04055523, 0.04037506],
       [0.43585728, 0.1581758 , 0.30109016, 0.04929691],
       [0.61755695, 0.2486565 , 0.06621578, 0.0529369 ],
       [0.07021517, 0.77184202, 0.05636727, 0.13246687],
       [0.13973793, 0.39339464, 0.24196387, 0.11269012],
       [0.12105533, 0.61906659, 0.11212967, 0.09757891],
       [0.35572433, 0.52259512, 0.04473584, 0.08880948],
       [0.09953467, 0.36854922, 0.05978939, 0.44106525],
       [0.10452056, 0.80057962,