## imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session. This keeps cell output clean,
# but it also hides any warning raised by later cells.
warnings.filterwarnings("ignore")



In [3]:
# Read the labelled text samples from the Excel workbook into a DataFrame
dataset_path = 'dataset.xlsx'
df = pd.read_excel(dataset_path)

## Data cleaning

In [4]:
# Display the loaded DataFrame as the cell output
df

Unnamed: 0,text,label
0,oh my gosh,1.0
1,"trouble sleeping, confused mind, restless hear...",1.0
2,"All wrong, back off dear, forward doubt. Stay ...",1.0
3,I've shifted my focus to something else but I'...,1.0
4,"I'm restless and restless, it's been a month n...",1.0
...,...,...
6977,I can't forget you #SpiritHadrian,0.0
6978,€ ®šæœŸâ˜†ã€'..DJ DAIKI! DJ DAIKI! DJ DAIKI!.D...,0.0
6979,Dai5y! <3,0.0
6980,tired of clowns but still hopefully tonight if...,0.0


In [5]:
# Count the missing values in each column
df.isna().sum()

text     10
label     2
dtype: int64

In [6]:
# Drop every row that has a missing value in any column.
# Reassigning the result (instead of `inplace=True`) follows the recommended pandas
# style and keeps the statement chainable.
df = df.dropna()

In [7]:
# Re-count the missing values per column to confirm the drop removed them all
df.isnull().sum()

text     0
label    0
dtype: int64

**Let's make our label data more readable by replacing the binary values (0/1) with `not depressed` and `depressed`**

In [8]:
# Map the numeric label codes to readable class names.
# The replacement is restricted to the `label` column: calling `df.replace` on the
# whole frame would also rewrite any `text` cell whose value happens to equal 0 or 1.
# `assign` returns a new frame, so no column is set on a possibly-derived frame.
df = df.assign(label=df['label'].replace({0: 'not depressed', 1: 'depressed'}))

In [12]:
# Show only the rows whose label is 'not depressed'
is_not_depressed = df['label'] == 'not depressed'
df[is_not_depressed]

Unnamed: 0,text,label
733,"Gr gr dreaming of ex crush to be my game, God",not depressed
734,wkwkwk what a joke,not depressed
735,Leaves are also standby in front of the PC ......,not depressed
736,Thank God even though it's just a ride through,not depressed
737,wedding teaser concept using the song day6 - o...,not depressed
...,...,...
6977,I can't forget you #SpiritHadrian,not depressed
6978,€ ®šæœŸâ˜†ã€'..DJ DAIKI! DJ DAIKI! DJ DAIKI!.D...,not depressed
6979,Dai5y! <3,not depressed
6980,tired of clowns but still hopefully tonight if...,not depressed


In [None]:
# Number of rows for each label value
df['label'].value_counts()

## Data visualization

In [None]:
# Class distribution, drawn as a pie chart (left) and a bar chart (right).
# The counts are read from `df` instead of being typed in by hand, so the figure stays
# correct if the dataset or the cleaning steps change.
class_counts = (
    df['label']
    .value_counts()
    .reindex(['not depressed', 'depressed'], fill_value=0)  # fixed order, 0 if a class is absent
)

labels = ["Not Depression", "Depressed"]  # display names, in the same order as the reindex above
sizes = class_counts.tolist()
custom_colours = ['#ff7675', '#74b9ff']

plt.figure(figsize=(20, 6), dpi=227)

# Left panel: share of each class
plt.subplot(1, 2, 1)
plt.pie(sizes,
        labels = labels,
        textprops={'fontsize': 15},
        startangle=140,
        autopct='%1.0f%%',
        colors=custom_colours,
        explode=[0, 0.05])

# Right panel: absolute number of rows per class
plt.subplot(1, 2, 2)
sns.barplot(x=labels,y = sizes, palette= 'viridis')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.utils import resample

# Separate samples by class
df_class_0 = df[df['label'] == 'not depressed']
df_class_1 = df[df['label'] == 'depressed']

# Take the class sizes from the frames themselves. Unpacking `value_counts()` depends on
# its sort-by-frequency order and fails when the column holds anything but two values.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Downsample class 0 so that it is no larger than class 1
df_class_0_downsampled = resample(df_class_0,
                                  replace=False,  # sample without replacement
                                  n_samples=min(count_class_0, count_class_1),
                                  random_state=42)  # reproducible results

# Combine class 1 with the downsampled class 0
df_downsampled = pd.concat([df_class_1, df_class_0_downsampled])

# Shuffle the rows. NOTE: this overwrites `df`, so every later cell works on the
# downsampled data rather than on the full cleaned dataset.
df= df_downsampled.sample(frac=1, random_state=42)


In [None]:
# Plot the number of rows per label in the current `df`
# (the downsampling cell above reassigns `df` to the rebalanced data)
sns.displot(data=df,x='label')

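From this visualization we can infer that the two classes are now balanced: downsampling reduced the majority class (non-depressed comments, which made up most of the original data) to the size of the depressed class.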


## Data modelling


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Features are the raw texts; targets are the class names.
# `astype(str)` guards against non-string cells (an Excel sheet can yield numeric
# values), which TfidfVectorizer cannot lowercase or tokenize.
X = df["text"].astype(str).values
y = df['label'].values

# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the class ratio
# the same in the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101, stratify=y
)

In [None]:
# TF-IDF text vectorizer configured to drop scikit-learn's built-in English stop words
vect=TfidfVectorizer(stop_words="english")

In [None]:
# Random forest classifier with default hyperparameters.
# A fixed seed makes the fitted forest, and therefore the reported metrics, reproducible.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Fit the vectorizer on the training texts and transform them into a TF-IDF matrix
train_x=vect.fit_transform(X_train)

In [None]:
# Train the classifier on the vectorized training texts and their labels
rfc.fit(train_x,y_train)

In [None]:
# Transform the held-out texts with the already-fitted vectorizer (transform only, no refit)
test_x=vect.transform(X_test)

In [None]:
# Predict a label for every held-out text
pred=rfc.predict(test_x)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print the classification report comparing the held-out labels with the predictions
print(classification_report(y_test,pred))

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Bundle vectorizer and classifier so that raw text can be passed straight to fit/predict.
# The vectorizer uses the same English stop-word setting as the standalone `vect` above
# (it was missing here), and the fixed seed makes the fitted forest reproducible.
pipe = Pipeline([
    ('vect', TfidfVectorizer(stop_words="english")),
    ('rfc', RandomForestClassifier(random_state=42)),
])

**The classification report above shows that our model is very accurate**

## Model Persistence and Full Training

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the raw training texts
pipe.fit(X_train,y_train)

In [None]:
# Predict labels for the raw held-out texts with the fitted pipeline
pred_2=pipe.predict(X_test)

In [None]:
# Print the classification report for the pipeline's held-out predictions
print(classification_report(y_test,pred_2))

## saving the model for deployment

In [None]:
# Refit the pipeline on all of X and y (training and held-out rows together) before saving it
pipe.fit(X,y)

In [None]:
import pickle

In [None]:
# Serialize the fitted pipeline to disk with pickle
model_path = 'depression_analyzer.pkl'
with open(model_path, 'wb') as out_file:
    pickle.dump(pipe, out_file)

In [None]:
# Show the name of the first column of `df`
df.columns[0]

In [None]:
import pickle

# Persist the name of the first column of `df` with pickle
first_column = df.columns[0]
with open('chat_input.pkl', 'wb') as out_file:
    pickle.dump(first_column, out_file)
