In [1]:
#importing the necessary Python libraries and the dataset

import re 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def read_data(file):
    data = []
    with open(file, 'r')as f:
        for line in f:
            line = line.strip()
            label = ' '.join(line[1:line.find("]")].strip().split())
            text = line[line.find("]")+1:].strip()
            data.append([label, text])
    return data

file = 'text.txt'
data = read_data(file)
print("Number of instances: {}".format(len(data)))
# print(data[:5])



Number of instances: 7480


In [2]:
# create two Python functions for tokenization and generating the features of an input sentence

def ngram(token, n): 
    output = []
    for i in range(n-1, len(token)): 
        ngram = ' '.join(token[i-n+1:i+1])
        output.append(ngram) 
    return output

def create_feature(text, nrange=(1, 1)):
    text_features = [] 
    text = text.lower() 
    text_alphanum = re.sub('[^a-z0-9#]', ' ', text)
    for n in range(nrange[0], nrange[1]+1): 
        text_features += ngram(text_alphanum.split(), n)    
    text_punc = re.sub('[a-z0-9]', ' ', text)
    text_features += ngram(text_punc.split(), 1)
    return Counter(text_features)

In [3]:
#create a Python function to store the labels, our labels will be based on mental health such as "depression", "anxiety", "PTSD", "psychosis", "bipolar disorders", "schizophrenia", "OCD and so on

def convert_label(item, name): 
    items = list(map(float, item.split()))
    label = ""
    for idx in range(len(items)): 
        if items[idx] == 1: 
            label += name[idx] + " "
    
    return label.strip()

issues = ["depression", "anxiety", "PTSD", "psychosis", "bipolar disorders", "schizophrenia", "OCD"]

X_all = []
y_all = []
for label, text in data:
    y_all.append(convert_label(label, issues))
    X_all.append(create_feature(text, nrange=(1, 4)))
    
print(y_all[:5])

['depression', 'anxiety', 'PTSD', 'psychosis', 'bipolar disorders']


In [4]:
#split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)

def train_test(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    return train_acc, test_acc

from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse = True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

X_train

<5984x249535 sparse matrix of type '<class 'numpy.float64'>'
	with 488985 stored elements in Compressed Sparse Row format>

In [5]:
#going to train four machine learning models and then choose the model that works best on the training and testing sets

svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
dtree = DecisionTreeClassifier()

clifs = [svc, lsvc, rforest, dtree]

# train and test them 
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
for clf in clifs: 
    clf_name = clf.__class__.__name__
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |
| SVC                       |         0.9070856 |     0.4518717 |




| LinearSVC                 |         0.9988302 |     0.5768717 |
| RandomForestClassifier    |         0.9988302 |     0.5534759 |
| DecisionTreeClassifier    |         0.9988302 |     0.4618984 |


In [6]:
#assign an emoji to each label that is emotions in this problem, then I’ll write 4 input sentences, 
#then I’ll use our trained machine learning model to take a look at the emotions of our input sentences

l = ["depression", "anxiety", "PTSD", "psychosis", "bipolar disorders", "schizophrenia", "OCD"]
l.sort()
label_freq = {}
for label, _ in data: 
    label_freq[label] = label_freq.get(label, 0) + 1

# print the labels and their counts in sorted order 
for l in sorted(label_freq, key=label_freq.get, reverse=True):
    print("{:10}({})  {}".format(convert_label(l, issues), l, label_freq[l]))

depression(1. 0. 0. 0. 0. 0. 0.)  1084
PTSD      (0. 0. 1. 0. 0. 0. 0.)  1080
psychosis (0. 0. 0. 1. 0. 0. 0.)  1079
anxiety   (0. 1. 0. 0. 0. 0. 0.)  1078
bipolar disorders(0. 0. 0. 0. 1. 0. 0.)  1057
OCD       (0. 0. 0. 0. 0. 0. 1.)  1057
schizophrenia(0. 0. 0. 0. 0. 1. 0.)  1045


In [10]:
issues_dict = {"depression":"depression", "anxiety":"anxiety", "PTSD":"PTSD", "psychosis":"psychosis", "bipolar disorders":"bipolar disorders", "schizophrenia":"schizophrenia", "OCD":"OCD"}
#t1 = "This looks so impressive"
#t2 = "I have a fear of dogs"
#t3 = "My dog died yesterday"
#t4 = "I don't want to talk with you"

t1 = input('How is your health?\n')    

features = create_feature(t1, nrange=(1, 4))
features = vectorizer.transform(features)
prediction = clf.predict(features)[0]
print( text,issues_dict[prediction])

How is your health?
I think people live alone or take any tension ans its effect on body like people not feel fresh
I had shouted at my younger brother and he was always afraid when I called out loudly. depression
