In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score

In [8]:
train = pd.read_csv('data/train.csv')
train.head(5)

x_train = train['text']
y_train = train['label'].replace({'Depression': 0, 'Drugs': 1, 'Suicide': 2, 'Alcohol': 3})

In [9]:
#x_value = train['text']
#y_value = train['label'].replace({'Depression': 0, 'Drugs': 1, 'Suicide': 2, 'Alcohol': 3})
#x_train, x_test, y_train, y_test = train_test_split(x_value, y_value, test_size=0.25)

In [10]:
#x_train

In [11]:
#y_train

In [13]:
vect = CountVectorizer()
x_train_counts = vect.fit_transform(x_train)
#x_test_counts = vect.transform(x_test)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
#x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts)

In [14]:
clf = LinearSVC(C = 0.55)

In [15]:
a = clf.fit(x_train_tfidf, y_train)

In [16]:
clf.decision_function(x_train_tfidf) 

array([[ 0.86252138, -0.87820222, -1.11908964, -0.92752158],
       [-0.55596803,  0.16695539, -0.6769077 , -0.89922648],
       [ 0.81642278, -0.94148194, -0.9736579 , -0.93842431],
       ...,
       [-1.10371036, -0.77700064, -0.75403454,  0.46384059],
       [ 1.18797859, -0.98983239, -1.15751183, -1.12297443],
       [ 0.89652239, -1.12579148, -0.84324104, -1.14531141]])

In [165]:
a.predict(x_test_counts)

array([0, 2, 0, 0, 2, 1, 1, 0, 1, 1, 0, 3, 1, 0, 1, 0, 1, 0, 3, 0, 2, 0,
       0, 3, 3, 0, 0, 2, 3, 0, 3, 3, 0, 3, 2, 3, 0, 2, 0, 3, 0, 1, 2, 0,
       0, 1, 3, 0, 3, 2, 0, 0, 3, 3, 2, 3, 0, 3, 0, 0, 1, 0, 3, 0, 2, 0,
       0, 2, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 3, 0, 0, 1,
       0, 2, 0, 0, 0, 3, 0, 2, 0, 0, 3, 0, 3, 0, 1, 2, 0, 0, 0, 0, 0, 0,
       3, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 3, 2, 3, 3, 2, 2, 3, 0, 0,
       0, 2, 0, 1, 3, 0, 3, 3, 0, 2, 0, 0, 3, 0, 1, 0, 3, 3, 0, 0, 0, 3],
      dtype=int64)

In [17]:
test = pd.read_csv('data/Test.csv')

In [38]:
test

Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life
2,03LZVFM6,I was so depressed feel like got no strength t...
3,0EPULUM5,I feel so low especially since I had no one to...
4,0GM4C5GD,can i be successful when I am a drug addict?
...,...,...
304,Z9A6ACLK,Yes
305,ZDUOIGKN,My girlfriend dumped me
306,ZHQ60CCH,How can I go back to being my old self?
307,ZVIJMA4O,Is it true bhang is medicinal?


In [20]:
x_test = test['text']
x_test_counts = vect.transform(x_test)
x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts)

In [25]:
results = pd.DataFrame(clf.decision_function(x_test_tfidf))

In [44]:
test_result = pd.merge(test, results, left_index = True, right_index = True)[['ID', 0, 1, 2, 3]].rename(columns = {0: 'Depression', 
                                                                                                     1: 'Alcohol', 
                                                                                                     2: 'Suicide', 
                                                                                                     3: 'Drugs'}).set_index('ID')

In [45]:
test_result

Unnamed: 0_level_0,Depression,Alcohol,Suicide,Drugs
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
02V56KMO,0.133716,-1.061447,-0.665098,-0.456034
03BMGTOK,1.193052,-1.224791,-1.003438,-1.222990
03LZVFM6,1.756563,-1.279707,-1.130094,-1.546911
0EPULUM5,0.847033,-0.935309,-0.579285,-1.466742
0GM4C5GD,-0.142975,-0.098209,-1.113902,-0.666878
...,...,...,...,...
Z9A6ACLK,0.240426,-0.787936,-0.756916,-0.682932
ZDUOIGKN,0.159713,-0.960699,-0.376304,-0.798961
ZHQ60CCH,-0.463077,-0.994030,-0.258658,-0.394703
ZVIJMA4O,-0.933250,0.580426,-1.036550,-0.652593


In [47]:
test_result.to_csv('output.csv')

In [158]:
Cs = []
for c in [0.001, 0.005, 0.008, 0.03, 0.5, 0.55, 0.6, 0.7, 1, 1.2, 1.5]:
    clf = LinearSVC(C = c)
    clf.fit(x_train_tfidf, y_train)
    pred = clf.predict(x_test_tfidf)
    test_acc = accuracy_score(y_test, pred)
    train_acc = accuracy_score(y_train, clf.predict(x_train_tfidf))
    Cs.append([c, test_acc, train_acc])

In [159]:
pd.DataFrame(Cs)

Unnamed: 0,0,1,2
0,0.001,0.577922,0.569264
1,0.005,0.577922,0.569264
2,0.008,0.616883,0.590909
3,0.03,0.668831,0.694805
4,0.5,0.850649,0.982684
5,0.55,0.850649,0.984848
6,0.6,0.850649,0.987013
7,0.7,0.837662,0.987013
8,1.0,0.818182,0.991342
9,1.2,0.811688,0.991342
