In [1]:
!pip install scikit-learn==1.3.2
import sklearn
print(sklearn.__version__)


Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.3.2
1.3.2


In [2]:
# modules

import numpy as np
import pandas as pd
import re
import nltk
import random
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score, recall_score
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# Register tqdm's pandas integration; it provides the `progress_apply` used on the text column later.
tqdm.pandas()


In [3]:
# Load the stress dataset into a DataFrame and show its (rows, columns) shape.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# '/stress.csv' — confirm where the CSV actually lives (a later cell reads from '/content/').
stress_data = pd.read_csv(r'/stress.csv')
stress_data.shape

FileNotFoundError: [Errno 2] No such file or directory: '/stress.csv'

In [None]:
# Preview the first 10 rows.
stress_data.head(10)

In [None]:
# Count missing values per column.
stress_data.isnull().sum()

0 -> No Stress

1 -> Stressed

In [None]:
# Number of rows per label value (class balance).
stress_data['label'].value_counts()

In [None]:
# Bar chart of how many rows carry each label.
# plt.figure (lowercase) creates the current pyplot figure that seaborn draws on;
# plt.Figure only built a detached Figure object, so figsize had no effect.
plt.figure(figsize=(3,2))
sns.countplot(data=stress_data, x='label', palette=['green', 'red'])
plt.title('Stress Distribution')
plt.ylabel('count')
plt.show()

In [None]:
# analyzing random text
# Draw five row positions (with replacement) and print the corresponding texts.
random_positions = [random.randint(0, stress_data.shape[0]-1) for _ in range(5)]
# The draws are row positions in 0..n-1, so index with .iloc; .loc would
# interpret them as index labels and only works for a default RangeIndex.
for sample_text in stress_data['text'].iloc[random_positions]:
  print(sample_text,"\n")

In [None]:
# downloading NLTK resources
# Fetch the NLTK data packages for the tokenizer, stop-word list and
# lemmatizer that the cleaning step below relies on.

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# lemmatizer and stopwords
# Module-level objects shared by clean_text: one lemmatizer instance and the
# English stop-word list converted to a set for membership tests.

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# function for cleaning text

def clean_text(text):
  """Normalize one raw text into a space-joined string of lemmatized tokens.

  Steps, in order: strip URLs, strip @mentions and every character that is
  not an ASCII letter or whitespace, lowercase, tokenize, drop English stop
  words, lemmatize.

  Args:
    text: Raw text. Non-string values (e.g. NaN from an empty CSV cell)
      are treated as empty text.

  Returns:
    The cleaned tokens joined by single spaces; '' for non-string input.
  """
  # Missing cells arrive as float NaN, which re.sub cannot process.
  if not isinstance(text, str):
    return ''

  # removing URLs
  text = re.sub(r'http\S+|www\S+|https\S+', '', text)

  # removing mentions, then every remaining character that is not a letter or whitespace
  # (digits and punctuation, including apostrophes, are dropped here)
  text = re.sub(r'@\w+|[^a-zA-Z\s]', '', text)

  # Convert to lowercase
  text = text.lower()

  # Tokenization
  tokens = word_tokenize(text)

  # removing stopwords and lemmatize
  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return ' '.join(tokens)

In [None]:
# using clean_text function in text column of dataset
# progress_apply is the tqdm-backed counterpart of apply (enabled by tqdm.pandas() above).
stress_data['cleaned_text'] = stress_data['text'].progress_apply(clean_text)

In [None]:
# Preview the frame again, now including the cleaned_text column.
stress_data.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Combine all cleaned text into a single string
text = " ".join(stress_data['cleaned_text'])

# Keep the word-cloud stop words under their own name: binding them to
# `stopwords` would replace the nltk.corpus `stopwords` module imported at the
# top of the notebook and break any re-run of stopwords.words('english').
wordcloud_stopwords = set(STOPWORDS)

# Generate the WordCloud
wordcloud = WordCloud(stopwords=wordcloud_stopwords,
                      background_color="white").generate(text)

# Plot the WordCloud
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
# Keep only the model input (cleaned_text) and the target (label), then display the result.
stress_data = stress_data[['cleaned_text', 'label']]
stress_data

In [None]:
# Feature texts (X) and target labels (y) extracted as arrays.
X = stress_data['cleaned_text'].values
y = stress_data['label'].values

In [None]:
# splitting data
# Hold out 30% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [None]:
# converting textual data into numerical data

vectorizer = TfidfVectorizer()

# Fit on the training texts only, then apply the fitted vectorizer to the test texts.
# NOTE(review): X_train / X_test are rebound from raw texts to TF-IDF matrices here,
# so this cell must not be re-run without first re-running the split cell.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay

# Initialize models, keyed by the display name used in the report and in the saved file name
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB()
}

# Metrics per model name, plus trackers for the best model seen so far.
# best_model is reset together with the other trackers so that a re-run can
# never leave it undefined or pointing at a winner from a previous execution.
model_performance = {}
best_model_name = None
best_model = None
best_model_score = 0

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on both splits: the train/test accuracy gap hints at overfitting
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Recall, precision and F1 are computed on the test split, weighted by class support
    training_accuracy = accuracy_score(y_train, y_train_pred)
    testing_accuracy = accuracy_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred, average='weighted')
    precision = precision_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')

    model_performance[model_name] = {
        "Training Accuracy": training_accuracy,
        "Testing Accuracy": testing_accuracy,
        "Recall": recall,
        "Precision": precision,
        "F1 Score": f1
    }

    print(f"Model: {model_name}")
    print(f"Training Accuracy: {training_accuracy:.4f}")
    print(f"Testing Accuracy: {testing_accuracy:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("--" * 20)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
    cm_display.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix for {model_name}")
    plt.show()

    # Check if this model is the best so far
    # NOTE(review): the winner is picked on test accuracy, so the reported test
    # score is also the selection criterion — consider a validation split or
    # cross-validation for model selection.
    if testing_accuracy > best_model_score:
        best_model_score = testing_accuracy
        best_model_name = model_name
        best_model = model

print(f"Best Model: {best_model_name} with Testing Accuracy: {best_model_score:.4f}")


In [None]:
# saving the model
import pickle

# Save the best model using pickle
if best_model_name:
    # Build the file name once so the file written and the name reported cannot drift apart
    # (the message previously named '..._best_model.pkl', which is not the file written).
    best_model_path = f"{best_model_name.replace(' ', '_')}_stress_best_model.pkl"
    with open(best_model_path, "wb") as file:
        pickle.dump(best_model, file)
    print(f"Best model '{best_model_name}' saved as '{best_model_path}'")

In [None]:
# loading the saved model

# Rebuild the file name used when saving instead of hard-coding the
# Logistic Regression file, which does not exist when another model wins.
saved_model_path = f"{best_model_name.replace(' ', '_')}_stress_best_model.pkl"
# NOTE: pickle.load can execute code from the file; only load pickles written by this notebook.
# The with-block closes the file handle, which the bare open() call never did.
with open(saved_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Spot-check the reloaded model on ten random test rows (drawn with replacement).
for _ in range(10):
  random_index = random.randint(0, X_test.shape[0]-1)
  X_new = X_test[random_index]
  true_label = y_test[random_index]

  # Predict with the model loaded from disk. `model` is only the loop variable
  # left over from training, i.e. the last model trained, not the saved one.
  prediction = loaded_model.predict(X_new)
  print(f"True Label: {true_label}, Prediction: {prediction}")

In [None]:
from google.colab import files

# Download the pickle written by the save step. The name is derived from the
# winning model instead of assuming Logistic Regression won.
files.download(f"{best_model_name.replace(' ', '_')}_stress_best_model.pkl")


In [None]:
# Persist the fitted TF-IDF vectorizer so new text can be transformed the same way as the training data.
with open("tfidf_vectorizer_stress.pkl", "wb") as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

In [None]:
# Download the pickled TF-IDF vectorizer from the Colab runtime to the local machine.
from google.colab import files
files.download('tfidf_vectorizer_stress.pkl')
