## 2. Data Understanding

In [86]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# import files
train_df = pd.read_csv('Train.csv')
validation_df = pd.read_csv('Test.csv')

In [87]:
# shape of the datasets
print(f'The train data shape: {train_df.shape}')
print(f'The test data shape: {validation_df.shape}')

The train data shape: (616, 3)
The test data shape: (309, 2)


In [88]:
# the columns in the datasets
print(f'The train data columns: \n {train_df.columns}')
print(f'The test data columns: \n {validation_df.columns}')

The train data columns: 
 Index(['ID', 'text', 'label'], dtype='object')
The test data columns: 
 Index(['ID', 'text'], dtype='object')


In [89]:
# the info
print(f'The train data info: {train_df.info()} \n \n')
print(f'The test data info: {validation_df.info()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      616 non-null    object
 1   text    616 non-null    object
 2   label   616 non-null    object
dtypes: object(3)
memory usage: 14.6+ KB
The train data info: None 
 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      309 non-null    object
 1   text    309 non-null    object
dtypes: object(2)
memory usage: 5.0+ KB
The test data info: None


In [90]:
# classes proportionality 
train_df['label'].value_counts(normalize=True)

Depression    0.571429
Alcohol       0.227273
Suicide       0.107143
Drugs         0.094156
Name: label, dtype: float64

In [91]:
train_df['length'] =  train_df['text'].apply(len)
train_df['length']

0      39
1      28
2      57
3      22
4      51
       ..
611    36
612    30
613    24
614    16
615    31
Name: length, Length: 616, dtype: int64

In [92]:
train_df.describe()

Unnamed: 0,length
count,616.0
mean,39.813312
std,21.438797
min,8.0
25%,26.0
50%,35.0
75%,48.25
max,196.0


In [93]:
train_df[train_df['length'] == 196]

Unnamed: 0,ID,text,label,length
194,J55053XP,I am financially constrained over school fees ...,Depression,196


In [94]:
print(train_df['text'].iloc[194])

I am financially constrained over school fees and my  family background is not stable with a lot of debts…I have an elderly brother who could easily support me but has no job even after graduating


## 3. Data Preparation

In [95]:
# changing text to lowercase
train_df['text'] = train_df['text'].apply(lambda x: x.lower())
train_df.head()

Unnamed: 0,ID,text,label,length
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,39
1,9JDAGUV3,why do i get hallucinations?,Drugs,28
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,57
3,6UY7DX6Q,why is life important?,Suicide,22
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,51


In [96]:
# removing the punctuation marks
import string

punc_to_rem = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~...'

train_df['text'] = train_df['text'].apply(lambda x: x.translate(str.maketrans('', '', punc_to_rem)))

train_df.head()

Unnamed: 0,ID,text,label,length
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,39
1,9JDAGUV3,why do i get hallucinations,Drugs,28
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,57
3,6UY7DX6Q,why is life important,Suicide,22
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,51


In [97]:
print(train_df['text'].iloc[48])

i am facing a lot of challenges in life financially emotionally psycologically and with no solutions…how can i safely look for solutions about depression on google


In [98]:
from textblob import TextBlob

def correct_sent(x):
    correction = TextBlob(str(x))
    correction = correction.correct()
    
    # print(correction)
    return correction

train_df['corrected_sent'] = train_df['text'].apply(lambda x: str(correct_sent(x)))
train_df.head()

Unnamed: 0,ID,text,label,length,corrected_sent
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,39,i feel that it was better i dream happy
1,9JDAGUV3,why do i get hallucinations,Drugs,28,why do i get hallucinations
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,57,i am stressed due to lack of financial support...
3,6UY7DX6Q,why is life important,Suicide,22,why is life important
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,51,how could i be helped to go through the depres...


In [39]:

# nltk.download('wordnet')
# nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [100]:
# removing stop words (Karen)
stopwords = nltk.corpus.stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()

def remove_stopwords(x):
    sent = [wordnet_lemmatizer.lemmatize(i, 'v') for i in x.split() if i not in stopwords]
    return ' '.join(sent)

train_df['no_stopwords'] = train_df['corrected_sent'].apply(lambda x: remove_stopwords(x))

train_df.head()

Unnamed: 0,ID,text,label,length,corrected_sent,no_stopwords
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,39,i feel that it was better i dream happy,feel better dream happy
1,9JDAGUV3,why do i get hallucinations,Drugs,28,why do i get hallucinations,get hallucinations
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,57,i am stressed due to lack of financial support...,stress due lack financial support school
3,6UY7DX6Q,why is life important,Suicide,22,why is life important,life important
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,51,how could i be helped to go through the depres...,could help go depression


In [37]:
# find a way to handle the extra punctation (e.g '...')
print(train_df['no_stopwords'].iloc[194] + '\n \n')

print(train_df['no_stopwords'].iloc[48])

financially constrain school fee family background stable lot debts…i elderly brother could easily support job even granulate
 

face lot challenge life financially, emotional, psycologically solutions…how safely look solutions depression goose


In [101]:
# tokenizing the sentences
train_df['tokenized_text'] = train_df['no_stopwords'].apply(lambda x: nltk.word_tokenize(x))
train_df.head()

Unnamed: 0,ID,text,label,length,corrected_sent,no_stopwords,tokenized_text
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,39,i feel that it was better i dream happy,feel better dream happy,"[feel, better, dream, happy]"
1,9JDAGUV3,why do i get hallucinations,Drugs,28,why do i get hallucinations,get hallucinations,"[get, hallucinations]"
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,57,i am stressed due to lack of financial support...,stress due lack financial support school,"[stress, due, lack, financial, support, school]"
3,6UY7DX6Q,why is life important,Suicide,22,why is life important,life important,"[life, important]"
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,51,how could i be helped to go through the depres...,could help go depression,"[could, help, go, depression]"


In [None]:
# Convert sentences into vectors
train_df['vector_sent'] = 