In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
import joblib

In [3]:
# Load the raw dataset
df = pd.read_csv("data_to_be_cleansed.csv")
# Discard the first column (presumably a saved index column — TODO confirm)
df = df.iloc[:, 1:]
# Drop rows with missing 'text' values
df = df.dropna(subset=['text'])
# Remove rows that duplicate an earlier ('text', 'title') pair
df = df.drop_duplicates(subset=['text', 'title'])
# Combine 'text' and 'title' into a single model input.
# Only 'text' was null-filtered above, so a missing title is treated as an
# empty string; otherwise the concatenation would yield NaN for that row and
# the later text preprocessing would fail on a non-string value.
df['input'] = df['text'] + " " + df['title'].fillna('')

In [4]:
# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _get_stopwords_and_lemmatizer():
    """Return the English stopword set and a lemmatizer, building them only once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize a piece of raw text for vectorization.

    Steps: lowercase, strip every character that is not a-z or whitespace,
    tokenize, drop English stopwords, lemmatize each remaining token, and
    join the tokens back with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' marker for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned, space-separated token string ('' when nothing remains).
    """
    # Missing values would otherwise crash on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # The stopword set and lemmatizer are invariant across calls, so they are
    # fetched from the cache instead of being rebuilt for every row.
    stop_words, lemmatizer = _get_stopwords_and_lemmatizer()
    # Tokenize, remove stopwords, and lemmatize the surviving tokens
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Clean the combined 'input' column into 'input_text', then discard the raw
# 'text', 'title' and 'input' columns, which are no longer needed.
df = (
    df.assign(input_text=df['input'].apply(preprocess_text))
      .drop(columns=['text', 'title', 'input'])
)
# Persist the cleaned dataset (without the index) for the modelling cells
df.to_csv('clean.csv', index=False)

In [19]:
# Load the cleaned dataset
df1 = pd.read_csv('clean.csv')
# A row whose text became empty during cleaning is written to CSV as an empty
# field and read back as NaN, which TfidfVectorizer rejects as a document;
# restore such rows to empty strings so they stay aligned with the target.
df1['input_text'] = df1['input_text'].fillna('')
# Initialize TF-IDF vectorizer
vz = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
# Fit the vocabulary AND build the feature matrix: the train/test split below
# reads `X`, which a bare `vz.fit(...)` never defined.
# NOTE(review): the vectorizer is fitted on all rows before the split, so IDF
# statistics include the held-out rows — consider fitting on the training
# portion only.
X = vz.fit_transform(df1['input_text'])
# Target column
y = df1['target']

In [20]:
# Hold out 20% of the samples for evaluation (fixed seed keeps the split stable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training portion
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions for both splits, so train and test accuracy can be compared
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

# First line printed: accuracy on the held-out test split
print("Accuracy:", accuracy_score(y_test, y_pred))
# Second line printed: accuracy on the training split
print("Accuracy:", accuracy_score(y_train, y_train_pred))
# Per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

Accuracy: 0.7392776523702032
Accuracy: 0.90770533446232
              precision    recall  f1-score   support

           0       0.87      0.73      0.79       182
           1       0.67      0.77      0.72       199
           2       0.84      0.68      0.75       164
           3       0.64      0.76      0.70       160
           4       0.74      0.75      0.74       181

    accuracy                           0.74       886
   macro avg       0.75      0.74      0.74       886
weighted avg       0.75      0.74      0.74       886



In [21]:
# Read one piece of free text from the user and clean it exactly as the
# training data was cleaned (lowercasing, stopword removal, lemmatization),
# so the features line up with what the model was fitted on.
new_text = [preprocess_text(input())]

# Vectorize with the same fitted TF-IDF vectorizer used for training.
# The fitted object is named `vz`; `vectorizer` is not defined in this notebook.
new_text_vectorized = vz.transform(new_text)

# Predict the class
predicted_class = model.predict(new_text_vectorized)

# Get the probability of each class
predicted_probabilities = model.predict_proba(new_text_vectorized)

# Display the results
print("Predicted Class:", predicted_class[0])
print("Class Probabilities:", predicted_probabilities[0])

# Map the numeric class id to its human-readable label
dic = {0: 'Stress', 1: 'Depression', 2: 'Bipolar disorder', 3: 'Personality disorder', 4: 'Anxiety'}
result = dic[predicted_class[0]]
# `new_text` is rebound to a summary sentence built from the predicted label
# (it is not used by any later cell visible here).
new_text = f'I am suffering from {result}'

Predicted Class: 0
Class Probabilities: [9.99954039e-01 1.24511281e-05 1.16013735e-05 1.54524368e-05
 6.45654152e-06]


In [22]:
# Persist the trained classifier and the fitted TF-IDF vectorizer side by side;
# each joblib.dump call's return value is kept in `dump` / `dump2`.
dump, dump2 = (
    joblib.dump(model, 'model.pkl'),
    joblib.dump(vz, 'vz.pkl'),
)