In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import pickle

In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def load_and_preprocess_data():
    # Load the dataset
    with open('intents.json', 'r') as file:
        data = json.load(file)

    # Create a DataFrame from the JSON data
    intents_data = []
    for intent in data:
        tag = intent['tag']
        for pattern in intent['patterns']:
            for response in intent['responses']:
                intents_data.append({
                    'tag': tag,
                    'pattern': pattern,
                    'response': response
                })

    df = pd.DataFrame(intents_data)

    # Basic data cleaning
    df['pattern'] = df['pattern'].str.lower()
    df['pattern'] = df['pattern'].str.replace('[_, *]', '')

    # Perform EDA
    print(df.head())
    print(df['tag'].value_counts())

    # Visualize tag distribution
    plt.figure(figsize=(12, 6))
    df['tag'].value_counts().plot(kind='bar')
    plt.title('Distribution of Intent Tags')
    plt.xlabel('Tags')
    plt.ylabel('Count')
    plt.savefig('tag_distribution.png')
    plt.close()

    # Tokenize and lemmatize patterns
    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        tokens = word_tokenize(text)
        return ' '.join([lemmatizer.lemmatize(word.lower()) for word in tokens])

    df['processed_pattern'] = df['pattern'].apply(preprocess_text)

    # Split data into training, validation, and test sets
    X = df['processed_pattern']
    y = df['tag']

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    print("Training set size:", len(X_train))
    print("Validation set size:", len(X_val))
    print("Test set size:", len(X_test))

    # Save the processed data and splits
    data_dict = {
        'df': df,
        'X_train': X_train,
        'X_val': X_val,
        'X_test': X_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test
    }

    with open('processed_data.pkl', 'wb') as f:
        pickle.dump(data_dict, f)

if __name__ == "__main__":
    load_and_preprocess_data()

        tag pattern                                           response
0  greeting      hi                     Hi there, How can I help you ?
1  greeting      hi                        Hello, How can I help you ?
2  greeting      hi                    Hey there, How can I help you ?
3  greeting      hi  Hi, I am a chatbot!, About which topic you are...
4  greeting   hello                     Hi there, How can I help you ?
tag
greeting                  12
movies                    12
mindfulness               12
emotional_intelligence    12
books                     12
                          ..
favorite_color             4
weather                    4
goodbye                    4
name                       3
credit_score               3
Name: count, Length: 273, dtype: int64
Training set size: 1176
Validation set size: 252
Test set size: 253


In [4]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
def train_models():
    # Load processed data
    with open('processed_data.pkl', 'rb') as f:
        data_dict = pickle.load(f)

    X_train = data_dict['X_train']
    X_val = data_dict['X_val']
    y_train = data_dict['y_train']
    y_val = data_dict['y_val']

    # Create TF-IDF vectorizer
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_val_tfidf = tfidf.transform(X_val)

    # Implement baseline model (Naive Bayes)
    nb_model = MultinomialNB()
    nb_model.fit(X_train_tfidf, y_train)

    # Evaluate baseline model
    nb_pred = nb_model.predict(X_val_tfidf)
    print("Naive Bayes Baseline Model:")
    print(classification_report(y_val, nb_pred))

    # Train different models and tune hyperparameters
    models = {
        'SVM': SVC(),
        'RandomForest': RandomForestClassifier()
    }

    param_grids = {
        'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'RandomForest': {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}
    }

    best_models = {}

    for name, model in models.items():
        grid_search = GridSearchCV(model, param_grids[name], cv=5, n_jobs=-1)
        grid_search.fit(X_train_tfidf, y_train)
        best_models[name] = grid_search.best_estimator_
        
        print(f"\nBest {name} Model:")
        print(f"Best parameters: {grid_search.best_params_}")
        y_pred = grid_search.predict(X_val_tfidf)
        print(classification_report(y_val, y_pred))

    # Save the best models and vectorizer
    model_dict = {
        'tfidf': tfidf,
        'best_models': best_models
    }

    with open('trained_models.pkl', 'wb') as f:
        pickle.dump(model_dict, f)

if __name__ == "__main__":
    train_models()

Naive Bayes Baseline Model:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                          precision    recall  f1-score   support

                                   about       0.50      1.00      0.67         1
                                     age       0.00      0.00      0.00         1
                          alien_invasion       1.00      1.00      1.00         1
                         alien_languages       0.00      0.00      0.00         3
                          arcane_alchemy       1.00      1.00      1.00         1
                                     art       0.00      0.00      0.00         1
                 artificial_intelligence       0.00      0.00      0.00         2
                         astral_artistry       0.00      0.00      0.00         1
                       astral_projection       0.00      0.00      0.00         3
                          book_of_spells       1.00      1.00      1.00         1
                                   books       0.00      0.00      0.00         0
               




Best SVM Model:
Best parameters: {'C': 10, 'kernel': 'linear'}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                          precision    recall  f1-score   support

                                   about       1.00      1.00      1.00         1
                                     age       1.00      1.00      1.00         1
                          alien_invasion       1.00      1.00      1.00         1
                         alien_languages       1.00      1.00      1.00         3
                          arcane_alchemy       1.00      1.00      1.00         1
                                     art       0.50      1.00      0.67         1
                          art_and_crafts       0.00      0.00      0.00         0
                 artificial_intelligence       1.00      0.50      0.67         2
            artificial_superintelligence       0.00      0.00      0.00         0
                         astral_artistry       1.00      1.00      1.00         1
                       astral_projection       1.00      0.33      0.50         3
               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                          precision    recall  f1-score   support

                                   about       1.00      1.00      1.00         1
                                     age       1.00      1.00      1.00         1
                          alien_invasion       1.00      1.00      1.00         1
                         alien_languages       1.00      1.00      1.00         3
                          arcane_alchemy       0.33      1.00      0.50         1
                                     art       0.50      1.00      0.67         1
                 artificial_intelligence       1.00      0.50      0.67         2
                         astral_artistry       1.00      1.00      1.00         1
                       astral_projection       1.00      0.33      0.50         3
                          book_of_spells       1.00      1.00      1.00         1
                                   books       0.00      0.00      0.00         0
               

In [6]:
import pickle
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [7]:
def evaluate_models():
    # Load processed data and trained models
    with open('processed_data.pkl', 'rb') as f:
        data_dict = pickle.load(f)

    with open('trained_models.pkl', 'rb') as f:
        model_dict = pickle.load(f)

    X_test = data_dict['X_test']
    y_test = data_dict['y_test']
    X_train = data_dict['X_train']
    y_train = data_dict['y_train']
    tfidf = model_dict['tfidf']
    best_models = model_dict['best_models']

    X_test_tfidf = tfidf.transform(X_test)
    X_train_tfidf = tfidf.transform(X_train)

    # Evaluate models using relevant metrics
    def evaluate_model(model, X, y):
        y_pred = model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average='weighted')
        return accuracy, precision, recall, f1

    # Evaluate best model on test set
    best_model = max(best_models.values(), key=lambda m: m.score(X_test_tfidf, y_test))
    test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(best_model, X_test_tfidf, y_test)

    print("Best Model Performance on Test Set:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"Precision: {test_precision:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1-score: {test_f1:.4f}")

    # Implement ensemble techniques
    # Note: We're creating new instances of the models to ensure they all have predict_proba
    ensemble_model = VotingClassifier(
        estimators=[
            ('nb', MultinomialNB()),
            ('svm', SVC(probability=True, kernel='linear')),
            ('rf', RandomForestClassifier())
        ],
        voting='soft'
    )

    ensemble_model.fit(X_train_tfidf, y_train)
    ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1 = evaluate_model(ensemble_model, X_test_tfidf, y_test)

    print("\nEnsemble Model Performance on Test Set:")
    print(f"Accuracy: {ensemble_accuracy:.4f}")
    print(f"Precision: {ensemble_precision:.4f}")
    print(f"Recall: {ensemble_recall:.4f}")
    print(f"F1-score: {ensemble_f1:.4f}")

    # Save the final model
    final_model = ensemble_model if ensemble_accuracy > test_accuracy else best_model
    
    final_model_dict = {
        'tfidf': tfidf,
        'model': final_model,
        'df': data_dict['df']
    }

    with open('final_model.pkl', 'wb') as f:
        pickle.dump(final_model_dict, f)

if __name__ == "__main__":
    evaluate_models()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Model Performance on Test Set:
Accuracy: 0.7510
Precision: 0.8123
Recall: 0.7510
F1-score: 0.7514


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Ensemble Model Performance on Test Set:
Accuracy: 0.7510
Precision: 0.8063
Recall: 0.7510
F1-score: 0.7493


In [8]:
# In the models dictionary in model_training.py
models = {
    'SVM': SVC(probability=True),
    'RandomForest': RandomForestClassifier()
}