In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score

In [132]:
train = pd.read_csv('data/train.csv')
train.head(5)

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [133]:
x_value = train['text']
y_value = train['label'].replace({'Depression': 0, 'Drugs': 1, 'Suicide': 2, 'Alcohol': 3})
x_train, x_test, y_train, y_test = train_test_split(x_value, y_value, test_size=0.25)

In [134]:
x_train

419                          What to do in order to heal
182                   What is the reason for depression?
439                       What is the benefit of alcohol
334            How can i get out of depression in future
335        Had a problem with my personal looks,feel sad
                             ...                        
536                      I am dealing with family issues
467        How do I seek for assistance with depression?
130                  whom should I disclose to my issues
549    How do I sought for help in wanting to commit ...
229            What is the solution for feeling hopeless
Name: text, Length: 462, dtype: object

In [135]:
y_train

419    2
182    0
439    3
334    0
335    0
      ..
536    0
467    0
130    0
549    2
229    0
Name: label, Length: 462, dtype: int64

In [136]:
vect = CountVectorizer()
x_train_counts = vect.fit_transform(x_train)
x_test_counts = vect.transform(x_test)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts)

In [168]:
clf = LinearSVC(C = 0.55)

In [169]:
a = clf.fit(x_train_tfidf, y_train)

In [172]:
clf.decision_function(x_train_tfidf) 

array([[-0.29914823, -0.8504015 , -0.0511748 , -0.81384153],
       [ 0.79720669, -0.97135328, -1.0357702 , -0.84861366],
       [-0.98783375, -1.01075891, -1.04830252,  0.93184293],
       ...,
       [ 0.63152467, -0.98261569, -0.91724998, -0.76640878],
       [-0.5946963 , -1.17379673,  0.79890579, -1.20621214],
       [ 0.47690729, -0.90297235, -0.83825779, -0.72929474]])

In [165]:
a.predict(x_test_counts)

array([0, 2, 0, 0, 2, 1, 1, 0, 1, 1, 0, 3, 1, 0, 1, 0, 1, 0, 3, 0, 2, 0,
       0, 3, 3, 0, 0, 2, 3, 0, 3, 3, 0, 3, 2, 3, 0, 2, 0, 3, 0, 1, 2, 0,
       0, 1, 3, 0, 3, 2, 0, 0, 3, 3, 2, 3, 0, 3, 0, 0, 1, 0, 3, 0, 2, 0,
       0, 2, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 3, 0, 0, 1,
       0, 2, 0, 0, 0, 3, 0, 2, 0, 0, 3, 0, 3, 0, 1, 2, 0, 0, 0, 0, 0, 0,
       3, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 3, 2, 3, 3, 2, 2, 3, 0, 0,
       0, 2, 0, 1, 3, 0, 3, 3, 0, 2, 0, 0, 3, 0, 1, 0, 3, 3, 0, 0, 0, 3],
      dtype=int64)

In [158]:
Cs = []
for c in [0.001, 0.005, 0.008, 0.03, 0.5, 0.55, 0.6, 0.7, 1, 1.2, 1.5]:
    clf = LinearSVC(C = c)
    clf.fit(x_train_tfidf, y_train)
    pred = clf.predict(x_test_tfidf)
    test_acc = accuracy_score(y_test, pred)
    train_acc = accuracy_score(y_train, clf.predict(x_train_tfidf))
    Cs.append([c, test_acc, train_acc])

In [159]:
pd.DataFrame(Cs)

Unnamed: 0,0,1,2
0,0.001,0.577922,0.569264
1,0.005,0.577922,0.569264
2,0.008,0.616883,0.590909
3,0.03,0.668831,0.694805
4,0.5,0.850649,0.982684
5,0.55,0.850649,0.984848
6,0.6,0.850649,0.987013
7,0.7,0.837662,0.987013
8,1.0,0.818182,0.991342
9,1.2,0.811688,0.991342
