In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score, classification_report
import pickle

In [2]:
X_train = pd.read_pickle('../X_train_new.pickle')
X_test = pd.read_pickle('../X_test_new.pickle')

In [3]:
X_train.dropna(inplace= True)
X_test.dropna(inplace= True)
X_test.reset_index(inplace=True)

In [4]:
X_train['tokenized_text'] = X_train['tokenized_text'].apply(lambda x: x.split())
X_test['tokenized_text'] = X_test['tokenized_text'].apply(lambda x: x.split())

In [5]:
X_train = X_train.sample(frac=0.03)

In [6]:
def identity_tokenizer(text):
    return text

tfidf_no_token = TfidfVectorizer(tokenizer=identity_tokenizer, preprocessor=identity_tokenizer,token_pattern=None, max_features = 200, min_df = 10)

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=identity_tokenizer,
    preprocessor=identity_tokenizer,
    token_pattern=None) 

In [7]:
x_train = X_train['tokenized_text']
x_test = X_test['tokenized_text']
y_train = X_train['mental_state']
y_test = X_test['mental_state']

In [8]:
tfidf_no_token.fit(x_train)
fit_model = tfidf_no_token.transform(x_train)
test_model = tfidf_no_token.transform(x_test)

In [9]:
%%time
clf = SVC(probability=True, kernel='rbf', verbose = True)
clf.fit(fit_model, y_train)

[LibSVM]CPU times: user 53min 21s, sys: 227 ms, total: 53min 21s
Wall time: 53min 21s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=True)

In [10]:
def performance(y_true, y_hat):
    
    y_true = list(map(lambda x: np.argmax(x), y_true))
    y_hat = list(map(lambda x: np.argmax(x), y_hat))
    
    y_true_roc = pd.DataFrame(y_true, columns = ['mental_state'])
    y_hat_roc = pd.DataFrame(y_hat, columns = ['mental_state'])
    
    y_true_roc = pd.get_dummies(y_true_roc['mental_state']).values
    y_hat_roc = pd.get_dummies(y_hat_roc['mental_state']).values
    
    # result template
    print('-'*40+' Result Report '+'-'*40)
    
    # Accuracy
    print('Accuracy: ', accuracy_score(y_true,y_hat))
    
    # Confusion matrix
    print('\n')
    print('Confusion Matrix: \n', confusion_matrix(y_true,y_hat))
    print('\n')

    # Classification Report
    print('Classification Report: \n')
    print(classification_report(y_true, y_hat))
    print('\n')
    
    # Aggregated area under the ROC curve
    print('Area under ROC curve: ', roc_auc_score(y_true_roc, y_hat_roc, multi_class='ovo'))
    
    print('-'*95)

In [11]:
predictions = clf.predict_proba(test_model)
y_target = pd.get_dummies(y_test).values

In [12]:
performance(y_target, predictions)

---------------------------------------- Result Report ----------------------------------------
Accuracy:  0.7113909887290888


Confusion Matrix: 
 [[ 89233  15566   3163]
 [ 18121 129143  20861]
 [ 10620  37424  42299]]


Classification Report: 

              precision    recall  f1-score   support

           0       0.76      0.83      0.79    107962
           1       0.71      0.77      0.74    168125
           2       0.64      0.47      0.54     90343

    accuracy                           0.71    366430
   macro avg       0.70      0.69      0.69    366430
weighted avg       0.71      0.71      0.70    366430



Area under ROC curve:  0.7662392327095512
-----------------------------------------------------------------------------------------------


In [13]:
def check_mental_state(model, post, tokenizer):
    
    dic_state = {0:'Normal', 1:'Depressed', 2:'Suicidal'}
    
    SET_LIMIT_SENTENCE = 300
    
    print('Post: \n')
    print(post)
    print('\n')
    
    list_post = [post]
    list_post_tokenized = tokenizer.transform(list_post)
    
    # classify mental state
    mental_state = model.predict_proba(list_post_tokenized)
    
    print('Normal: ', str(round(mental_state[0][0], 2)*100)+'%')
    print('Depressed: ', str(round(mental_state[0][1], 2)*100)+'%')
    print('Suicidal: ', str(round(mental_state[0][2], 2)*100)+'%')
    
    classify_state = np.argmax(mental_state[0])
    #list(map(lambda x: np.argmax(x), mental_state))
    
    print('\n')
    print('Model Classify: ', dic_state[classify_state])

In [16]:
post = X_test[X_test['mental_state']==2]['tokenized_text'][19]
check_mental_state(clf, post, tfidf_no_token)

Post: 

['i', 'know', 'i', 'tried', 'everything', 'guys', 'i', 'gone', 'different', 'psychologists', 'types', 'anti', 'depressants', 'i', 'made', 'effort', 'get', 'bed', 'every', 'day', 'walk', 'my', 'dog', 'go', 'gym', 'read', 'distant', 'feeling', 'increases', 'exponentially', 'day', 'i', 'never', 'really', 'loved', 'life', 'i', 'loved', 'people', 'it', 'lately', 'spark', 'disappeared', 'i', 'connect', 'anyone', 'fine', 'i', 'even', 'connect', 'my', 'parents', 'kills', 'me', 'inside', 'i', 'want', 'stop', 'mundane', 'cycle', 'my', 'life', 'become', 'initially', 'i', 'told', 'myself', 'i', 'stick', 'it', 'them', 'i', 'longer', 'hold', 'restrictions', 'i', 'always', 'tried', 'my', 'best', 'persevere', 'keep', 'looking', 'forward', 'i', 'cried', 'day', 'today', 'i', 'realized', 'threshold', 'kept', 'me', 'taking', 'action', 'disappeared', 'i', 'want', 'it', 'feel', 'selfish', 'even', 'thoughts', 'somewhere', 'i', 'smiled', 'loved', 'i', 'feel', 'outcome', 'inevitable', 'please', 'help']

In [17]:
post = X_test[X_test['mental_state']==0]['tokenized_text'][2]
check_mental_state(clf, post, tfidf_no_token)

Post: 

['cc', 'room', 'get', 'enough', 'love', 'time', 'day', 'come', 'play', 'some', 'music', 'hang', 'cc', 'room', 'great', 'way', 'find', 'new', 'music', 'chatting', 'lovely', 'people', 'we', 'got', 'people', 'room', 'we', 'would', 'love', 'come', 'party', 'us', 'follow', 'link', 'https', 'hit', 'headphones', 'sidebar', 'go', 'room', 'you', 'need', 'sign', 'you', 'account', 'already', 'it', 'quick', 'easy']


Normal:  87.0%
Depressed:  11.0%
Suicidal:  2.0%


Model Classify:  Normal


In [18]:
post = X_test[X_test['mental_state']==1]['tokenized_text'][366423]
check_mental_state(clf, post, tfidf_no_token)

Post: 



Normal:  0.0%
Depressed:  3.0%
Suicidal:  97.0%


Model Classify:  Suicidal


In [11]:
X_train

Unnamed: 0,tokenized_text,mental_state
1193017,"[fuck, everything, right, you, think, things, ...",1
646108,"[i, i, dramatic, honestly, i, want, fucking, d...",2
282304,"[my, roommate, girlfriend, stole, my, fucking,...",1
922522,"[are, elderly, likely, watch, children, cartoo...",0
1083598,"[my, friends, understand, much, i, want, leave...",2
...,...,...
784427,"[falling, holes, depression, every, days, feel...",1
1135823,"[marks, start, my, last, hour, alive, my, suic...",2
1427449,"[anyone, else, scared, afterlife, we, punished...",2
668644,"[i, got, vip, tickets, my, boyfriend, i, see, ...",0
