<a href="https://colab.research.google.com/github/Anou26/WellBot/blob/main/MH_ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install nltk



**Data Pre-Processing**

In [23]:
#Import necessary libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [24]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
# Load the CSV file to get a glimpse of its structure and content
file_path = '/content/mental_health.csv'
mental_health_data = pd.read_csv(file_path, encoding="ISO-8859-1")
# Display the first few rows of the dataset
mental_health_data.head()

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


In [45]:
msg=mental_health_data.text
msg

0        dear american teens question dutch person hear...
1        nothing look forward lifei dont many reasons k...
2        music recommendations im looking expand playli...
3        im done trying feel betterthe reason im still ...
4        worried  year old girl subject domestic physic...
                               ...                        
27972    posting everyday people stop caring  religion ...
27973    okay definetly need hear guys opinion ive pret...
27974    cant get dog think ill kill myselfthe last thi...
27975    whats point princess bridei really think like ...
27976    got nudes person might might know snapchat do ...
Name: text, Length: 27977, dtype: object

In [50]:
#Removing Special Characters
msg=msg.str.replace('[^a-zA-Z0-9]+'," ")
msg

0        dear american teens question dutch person hear...
1        nothing look forward lifei dont many reasons k...
2        music recommendations im looking expand playli...
3        im done trying feel betterthe reason im still ...
4        worried  year old girl subject domestic physic...
                               ...                        
27972    posting everyday people stop caring  religion ...
27973    okay definetly need hear guys opinion ive pret...
27974    cant get dog think ill kill myselfthe last thi...
27975    whats point princess bridei really think like ...
27976    got nudes person might might know snapchat do ...
Name: text, Length: 27977, dtype: object

In [51]:
# Manually defined stop words
manual_stop_words = set(["a", "an", "the", "and", "or", "in", "of", "at", "by", "for", "with", "about", "on", "to", "as", "is", "are", "was", "were", "be", "been", "being"])

In [52]:
# Initialize the lemmatizer and stop words list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [53]:
def preprocess_text(text):
    # Here we do, Tokenizing, Removing Stop words, Lowercasing, Lemmatizing
    # Convert text to lowercase and split into words
    tokens = text.lower().split()
    # Remove stop words and lemmatize
    lemmatized = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.isalnum() and word.lower() not in [stop_words, manual_stop_words]]
    return ' '.join(lemmatized)

In [54]:
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
msg=msg.apply(lambda line:[stemmer.stem(token.lower()) for token in word_tokenize(line)]).apply(lambda token:" ".join(token))

In [55]:
msg=msg.apply(lambda line:[token for token in word_tokenize(line) if len(token)>2]).apply(lambda y:" ".join(y))

**Feature Extraction**

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Applying TF-IDF to the preprocessed text
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
processed_texts = msg.apply(preprocess_text)
X = tfidf_vectorizer.fit_transform(processed_texts)

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Assuming 'label' is the column in our dataset that contains mood classifications
y = mental_health_data['label'].values
y

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
sv=SVC()
# nb=GaussianNB()
rf=RandomForestClassifier()
ab= AdaBoostClassifier()
models=[sv,rf,ab]
for model in models:
  print(model)
  model.fit(X_train,y_train)
  y_pred=model.predict(X_test)
  print(classification_report(y_test,y_pred))

SVC()
              precision    recall  f1-score   support

           0       0.90      0.93      0.92      2802
           1       0.92      0.90      0.91      2794

    accuracy                           0.91      5596
   macro avg       0.91      0.91      0.91      5596
weighted avg       0.91      0.91      0.91      5596

RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2802
           1       0.88      0.90      0.89      2794

    accuracy                           0.89      5596
   macro avg       0.89      0.89      0.89      5596
weighted avg       0.89      0.89      0.89      5596

AdaBoostClassifier()
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      2802
           1       0.90      0.85      0.87      2794

    accuracy                           0.88      5596
   macro avg       0.88      0.88      0.88      5596
weighted avg       0.8

In [73]:
#Classification
y_new=model.predict(tfidf_vectorizer.transform(["i feel good"]))
if y_new==1:
    print("positive")
if y_new==0:
    print("negative")

negative


In [77]:
#ChatBot
def chatbot_response(text):
    processed_text = preprocess_text(text)
    vectorized_text = tfidf_vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)[0]
    prediction_proba = model.predict_proba(vectorized_text)[0]

    # Provide responses based on the confidence level of the predictions
    if prediction == 1 and max(prediction_proba) > 0.6:
        return "It sounds like you're having a tough day. What can you do to take your mind off things?"
    elif prediction == 0 and max(prediction_proba) > 0.6:
        return "That's great to hear! What's been the best part of your day?"
    else:
        return "I'm here to listen. Tell me more about how you're feeling."

# Adjust the main loop to handle conversations
if __name__ == "__main__":
    main()

Hello, I'm here to help you with your mood. How are you feeling today?
You: good
Chatbot: I'm here to listen. Tell me more about how you're feeling.
You: exit
Chatbot: Goodbye! Take care.


**Integrating the Q/A csv into the chatbot**

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize text by removing punctuation, tokenizing, removing stopwords, and lemmatizing."""
    # Lowercase and remove punctuation
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return cleaned_tokens

def match_question(user_input):
    """Match user input to the closest question in the dataset and return the corresponding answer."""
    # Clean user input
    cleaned_input = set(clean_text(user_input))

    # Attempt to find the best match in the dataset
    best_match = None
    max_overlap = 0
    for idx, row in mental_health_data.iterrows():
        # Clean the question from the dataset
        cleaned_question = set(clean_text(row['Questions']))
        # Calculate overlap using intersection of sets
        overlap = len(cleaned_input & cleaned_question)
        if overlap > max_overlap:
            max_overlap = overlap
            best_match = row['Answers']

    if best_match:
        return best_match
    else:
        return "I'm sorry, I don't have an answer to that question."

# Test the function with a sample user input
test_input = "What should I do if someone has a mental problem?"
match_question(test_input)
