In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

In [3]:
test_df.head(2)

Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life


In [4]:
df.shape

(616, 3)

In [5]:
df[df['label']=='Depression']

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
4,FYC0FTFB,How could I be helped to go through the depres...,Depression
5,V6VSDJ5I,What are the effects of depression to ones health,Depression
6,9736J4UE,Why is everything so hard to deal with in this...,Depression
...,...,...,...
603,S6WK4NBL,"I feel confused, how can I overcome the problem?",Depression
608,LGAPCAYO,I feel very low and at times,Depression
610,69V3L12G,Life is just hard,Depression
614,1DS3P1XO,I feel unworthy,Depression


In [6]:
# drop the unwanted column
# ID column 

In [7]:
df.drop('ID', inplace=True, axis=1)

In [8]:
df.head()

Unnamed: 0,text,label
0,I feel that it was better I dieAm happy,Depression
1,Why do I get hallucinations?,Drugs
2,I am stresseed due to lack of financial suppor...,Depression
3,Why is life important?,Suicide
4,How could I be helped to go through the depres...,Depression


## Preprocessing the text

In [9]:
print(df['text'].apply(lambda x : len(x.split(' '))).sum())

4823


In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [11]:
SCR = re.compile('[/(){}\[\]\|@,;]')
ESR = re.compile('[^0-9a-z #+_]')
stopword = set(stopwords.words('english'))

In [12]:
def clean_text(text):
    text = text.lower()
    text = SCR.sub(' ', text)
    text = ESR.sub(' ', text)
    text = ' '.join(word for word in text.split() if word not in stopword)
    
    return text

In [13]:
df['text'] = df['text'].apply(clean_text)

In [14]:
df.head()

Unnamed: 0,text,label
0,feel better dieam happy,Depression
1,get hallucinations,Drugs
2,stresseed due lack financial support school,Depression
3,life important,Suicide
4,could helped go depression,Depression


In [15]:
print(df['text'].apply(lambda x : len(x.split(' '))).sum())

2243


In [16]:
# Dividing the data into variables and target

In [17]:
y = df.label
X = df.drop('label', axis=1)

In [18]:
X

Unnamed: 0,text
0,feel better dieam happy
1,get hallucinations
2,stresseed due lack financial support school
3,life important
4,could helped go depression
...,...
611,stop alcoholism
612,become oldself
613,someone stop
614,feel unworthy


### importing important models

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline

In [20]:
#x, y = make_classification(random_state=0)

In [21]:
# Creating a model on MultinomialNB naive Bayes
model = CountVectorizer()
#     ('tfidf', TfidfVectorizer()),
#     ('clf', LogisticRegression()),

# training the model with the train data

X = model.fit_transform(X['text'])

In [22]:
print(model)

CountVectorizer()


In [23]:
y

0      Depression
1           Drugs
2      Depression
3         Suicide
4      Depression
          ...    
611       Alcohol
612       Suicide
613       Alcohol
614    Depression
615    Depression
Name: label, Length: 616, dtype: object

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 

In [25]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33)
from sklearn.metrics import classification_report

In [27]:
classi = LogisticRegression()
classi.fit(X_train, y_train)
y_pred = classi.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     Alcohol       0.90      0.86      0.88        43
  Depression       0.81      0.97      0.89       116
       Drugs       1.00      0.52      0.68        25
     Suicide       0.73      0.40      0.52        20

    accuracy                           0.84       204
   macro avg       0.86      0.69      0.74       204
weighted avg       0.85      0.84      0.82       204



In [28]:
d = df.label
S = df.drop('label', axis=1)

In [29]:
model1 = TfidfVectorizer()
S = model1.fit_transform(S['text'])

In [30]:
S_train, S_test, d_train, d_test = train_test_split(S, d, test_size=0.3, random_state=0)

In [31]:
S_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
classi.fit(S_train, d_train)

LogisticRegression()

In [33]:
pred = classi.predict(S_test)

In [35]:
print(classification_report(d_test, pred))

              precision    recall  f1-score   support

     Alcohol       1.00      0.67      0.80        45
  Depression       0.77      0.99      0.87       115
       Drugs       1.00      0.50      0.67        10
     Suicide       0.50      0.07      0.12        15

    accuracy                           0.81       185
   macro avg       0.82      0.56      0.61       185
weighted avg       0.82      0.81      0.78       185



### Processing the test data

In [36]:
# Dropping the unnecessary column

test_df.drop('ID', inplace=True, axis=1)

In [37]:
test_df.head()

Unnamed: 0,text
0,How to overcome bad feelings and emotions
1,I feel like giving up in life
2,I was so depressed feel like got no strength t...
3,I feel so low especially since I had no one to...
4,can i be successful when I am a drug addict?


In [38]:
# Cleaning the data

test_data = test_df['text'].apply(clean_text)

In [43]:
test_data = model.transform(test_data)

AttributeError: lower not found

In [40]:
# Reshaping the size of the train_data to the size test_data

# test_data.resize(309, 734)

In [41]:
# Making prediction on the test data

Test_prediction = classi.predict(test_data)

ValueError: X has 486 features per sample; expecting 734

In [42]:
pd.Series(Test_prediction)

NameError: name 'Test_prediction' is not defined

In [377]:
Submit_df = pd.DataFrame({'ID':test_df.ID, 'Prediction':Test_prediction})

In [382]:
Submission = pd.get_dummies(Submit_df.Prediction)
Submission['ID'] = test_df.ID

In [388]:
Submission.set_axis(test_df.ID, axis=0, inplace=True)

In [390]:
Submission.drop('ID', axis=1, inplace=True)

In [391]:
Submission

Unnamed: 0_level_0,Alcohol,Depression,Drugs,Suicide
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
02V56KMO,1,0,0,0
03BMGTOK,1,0,0,0
03LZVFM6,1,0,0,0
0EPULUM5,1,0,0,0
0GM4C5GD,1,0,0,0
...,...,...,...,...
Z9A6ACLK,0,1,0,0
ZDUOIGKN,0,0,1,0
ZHQ60CCH,0,0,1,0
ZVIJMA4O,0,0,0,1


In [392]:
test_df

Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life
2,03LZVFM6,I was so depressed feel like got no strength t...
3,0EPULUM5,I feel so low especially since I had no one to...
4,0GM4C5GD,can i be successful when I am a drug addict?
...,...,...
304,Z9A6ACLK,Yes
305,ZDUOIGKN,My girlfriend dumped me
306,ZHQ60CCH,How can I go back to being my old self?
307,ZVIJMA4O,Is it true bhang is medicinal?
