In [27]:
!pip install nltk scikit-learn



In [28]:
import nltk
import pandas as pd
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import string

In [29]:
# NLTK resources used below: the English stop-word list and the Punkt
# sentence-tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
# Recent NLTK releases load Punkt from the 'punkt_tab' package instead of the
# pickled 'punkt' models; without it word_tokenize raises LookupError on a
# fresh install. Older releases simply do not use it.
nltk.download('punkt_tab')
# Kept last so the cell's displayed value is still the result of this call.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Load the intents definition from its JSON file into `data`.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's default encoding (which can mangle non-ASCII characters).
with open('/content/intents.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [31]:
# Flatten the intents into two parallel lists: every pattern of every intent,
# each paired with the first response listed for that same intent.
patterns, responses = [], []

for intent in data['intents']:
    intent_patterns = intent['patterns']
    patterns.extend(intent_patterns)
    # The generator is evaluated once per pattern, so the first response is
    # only looked up for intents that actually have patterns.
    responses.extend(intent['responses'][0] for _ in intent_patterns)

In [32]:
# Tabulate the parallel lists: one row per pattern with its paired response.
df = pd.DataFrame(dict(patterns=patterns, responses=responses))
df

Unnamed: 0,patterns,responses
0,Hi,Hello there. Tell me how are you feeling today?
1,Hey,Hello there. Tell me how are you feeling today?
2,Is anyone there?,Hello there. Tell me how are you feeling today?
3,Hi there,Hello there. Tell me how are you feeling today?
4,Hello,Hello there. Tell me how are you feeling today?
...,...,...
227,How do I know if I'm unwell?,"If your beliefs , thoughts , feelings or behav..."
228,How can I maintain social connections? What if...,"A lot of people are alone right now, but we do..."
229,What's the difference between anxiety and stress?,Stress and anxiety are often used interchangea...
230,What's the difference between sadness and depr...,"Sadness is a normal reaction to a loss, disapp..."


In [33]:
# Text preprocessing: tokenize, lowercase, drop stop words and punctuation.
stop_words = set(stopwords.words('english'))
# Individual punctuation characters as a set. Testing a token against the
# string constant itself would be a substring match, not a membership test.
punctuation_chars = set(string.punctuation)


def preprocess_text(text):
    """Normalize one pattern string before vectorization.

    The text is split with ``word_tokenize``, each token is lowercased, and
    tokens that are English stop words or consist solely of punctuation
    characters are discarded.

    Args:
        text: Raw pattern string.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    lowered = (token.lower() for token in word_tokenize(text))
    kept = [
        token for token in lowered
        if token not in stop_words
        # Checking every character also removes multi-character punctuation
        # tokens (e.g. "..." or "''"), which a substring test against
        # string.punctuation lets through.
        and not all(char in punctuation_chars for char in token)
    ]
    return ' '.join(kept)


df['preprocessed_patterns'] = df['patterns'].apply(preprocess_text)
df

Unnamed: 0,patterns,responses,preprocessed_patterns
0,Hi,Hello there. Tell me how are you feeling today?,hi
1,Hey,Hello there. Tell me how are you feeling today?,hey
2,Is anyone there?,Hello there. Tell me how are you feeling today?,anyone
3,Hi there,Hello there. Tell me how are you feeling today?,hi
4,Hello,Hello there. Tell me how are you feeling today?,hello
...,...,...,...
227,How do I know if I'm unwell?,"If your beliefs , thoughts , feelings or behav...",know 'm unwell
228,How can I maintain social connections? What if...,"A lot of people are alone right now, but we do...",maintain social connections feel lonely
229,What's the difference between anxiety and stress?,Stress and anxiety are often used interchangea...,'s difference anxiety stress
230,What's the difference between sadness and depr...,"Sadness is a normal reaction to a loss, disapp...",'s difference sadness depression


In [63]:
# Features are the cleaned patterns, targets are the paired responses.
# 10% of the rows are held out for testing; the fixed seed keeps the split
# repeatable.
X, y = df['preprocessed_patterns'], df['responses']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=40,
)

In [64]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [65]:
# Naive Bayes text classifier: token counts feeding MultinomialNB.
# The step names 'vect' and 'clf' are the prefixes used by the parameter grid.
nb_steps = [
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
]
nb_pipeline = Pipeline(steps=nb_steps)

In [66]:
# Search space for the Naive Bayes pipeline; keys follow the
# <step name>__<parameter> convention.
param_grid_nb = dict(
    vect__stop_words=[None, 'english'],
    vect__ngram_range=[(1, 1), (1, 2)],
    clf__alpha=[0.1, 0.5, 1.0],
)

In [67]:
# Exhaustive search over param_grid_nb, scored by accuracy with cv=5,
# fitted on the training split only.
grid_search_nb = GridSearchCV(
    estimator=nb_pipeline,
    param_grid=param_grid_nb,
    scoring='accuracy',
    cv=5,
)
grid_search_nb.fit(X_train, y_train)



In [68]:
# Report the winning hyper-parameters and their cross-validation score.
nb_best_params = grid_search_nb.best_params_
nb_best_cv_score = grid_search_nb.best_score_
print("Best parameters for Naive Bayes:", nb_best_params)
print("Best cross-validation score for Naive Bayes: {:.2f}".format(nb_best_cv_score))

Best parameters for Naive Bayes: {'clf__alpha': 0.1, 'vect__ngram_range': (1, 1), 'vect__stop_words': None}
Best cross-validation score for Naive Bayes: 0.37


In [69]:
# Score the best pipeline found by the search on the held-out test split.
best_nb_pipeline = grid_search_nb.best_estimator_
nb_test_accuracy = best_nb_pipeline.score(X=X_test, y=y_test)
nb_test_report = "Test set accuracy for Naive Bayes: {:.2f}".format(nb_test_accuracy)
print(nb_test_report)

Test set accuracy for Naive Bayes: 0.54


In [70]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [71]:
# Decision Tree text classifier: token counts feeding DecisionTreeClassifier.
# The step names 'vect' and 'clf' are the prefixes used by the parameter grid.
dt_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    # Tree fitting permutes features and breaks ties between equally good
    # splits at random, so an unseeded tree (and therefore the grid-search
    # winner) can change from run to run. Seed it with the same value used
    # for the train/test split.
    ('clf', DecisionTreeClassifier(random_state=40)),
])

In [72]:
# Search space for the Decision Tree pipeline; keys follow the
# <step name>__<parameter> convention.
param_grid_dt = dict(
    vect__stop_words=[None, 'english'],
    vect__ngram_range=[(1, 1), (1, 2)],
    clf__criterion=['gini', 'entropy'],
    clf__max_depth=[None, 10, 20, 30],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)

In [73]:
# Exhaustive search over param_grid_dt, scored by accuracy with cv=5,
# fitted on the training split only.
grid_search_dt = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=5,
)
grid_search_dt.fit(X_train, y_train)



In [74]:
# Report the winning hyper-parameters and their cross-validation score.
dt_best_params = grid_search_dt.best_params_
dt_best_cv_score = grid_search_dt.best_score_
print("Best parameters for Decision Tree:", dt_best_params)
print("Best cross-validation score for Decision Tree: {:.2f}".format(dt_best_cv_score))

Best parameters for Decision Tree: {'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'vect__ngram_range': (1, 1), 'vect__stop_words': 'english'}
Best cross-validation score for Decision Tree: 0.26


In [75]:
# Score the best pipeline found by the search on the held-out test split.
best_dt_pipeline = grid_search_dt.best_estimator_
dt_test_accuracy = best_dt_pipeline.score(X=X_test, y=y_test)
dt_test_report = "Test set accuracy for Decision Tree: {:.2f}".format(dt_test_accuracy)
print(dt_test_report)

Test set accuracy for Decision Tree: 0.54
