In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sklearn
import seaborn as sns
# Natural Language Processing (NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Text Vectorization and Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix,
    f1_score, precision_score, recall_score,
    roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score
)
from xgboost import XGBClassifier



In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dennis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dennis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dennis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
with open('../Final_Intents.json', 'r') as file:
    data = json.load(file)

In [None]:
df = pd.

In [4]:
# Function to pair each question with its corresponding response
def pair_questions_responses(data):
    paired_data = []
    for item in data:
        tag = item.get('tag', 'Unknown')
        questions = item.get('questions', [])
        responses = item.get('responses', [])

        for question, response in zip(questions, responses):
            paired_data.append({'tag': tag, 'question': question, 'response': response})

    return paired_data

In [5]:
# Convert the JSON data into a DataFrame
paired_data = pair_questions_responses(data)
df = pd.DataFrame(paired_data)
df.head ()

Unnamed: 0,tag,question,response
0,cloud_computing,What is the fundamental starting point for und...,The fundamental starting point for understandi...
1,cloud_computing,How does Moringa School introduce the concept ...,Moringa School introduces the concept of Cloud...
2,cloud_computing,Why is Cloud Computing considered essential in...,Cloud Computing is considered essential in the...
3,cloud_computing,Can you provide an overview of the Cloud Compu...,The Cloud Computing program at Moringa School ...
4,cloud_computing,What are the eligibility requirements for the ...,To be eligible for the Cloud Computing program...


In [8]:
# Text Cleaning
# Remove any unnecessary characters, symbols, or special characters.
# Convert text to lowercase to ensure uniformity.
df['question'] = df['question'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
df['response'] = df['response'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)

# Display the cleaned DataFrame
df.head()

Unnamed: 0,tag,question,response
0,cloud_computing,what is the fundamental starting point for und...,the fundamental starting point for understandi...
1,cloud_computing,how does moringa school introduce the concept ...,moringa school introduces the concept of cloud...
2,cloud_computing,why is cloud computing considered essential in...,cloud computing is considered essential in the...
3,cloud_computing,can you provide an overview of the cloud compu...,the cloud computing program at moringa school ...
4,cloud_computing,what are the eligibility requirements for the ...,to be eligible for the cloud computing program...


In [None]:
#df = qna_df.reindex(np.random.permutation(qna_df.index))

In [9]:
# Tokenization
# Tokenization of the 'Question' column
df['Tokenized_Question'] = df['question'].apply(lambda x: nltk.word_tokenize(x))
df['Tokenized_Response'] = df['response'].apply(lambda x: nltk.word_tokenize(x))

In [10]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))
df['Tokenized_Question'] = df['Tokenized_Question'].apply(lambda x: [word for word in x if word not in stop_words])
df['Tokenized_Response'] = df['Tokenized_Response'].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
df['Tokenized_Question'] = df['Tokenized_Question'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['Tokenized_Response'] = df['Tokenized_Response'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x]) 

In [13]:
# Display the DataFrame with tokenized and processed 'question' and 'response'
df[['tag', 'Tokenized_Question', 'Tokenized_Response']].head()

Unnamed: 0,tag,Tokenized_Question,Tokenized_Response
0,cloud_computing,"[fundamental, starting, point, understanding, ...","[fundamental, starting, point, understanding, ..."
1,cloud_computing,"[moringa, school, introduce, concept, cloud, c...","[moringa, school, introduces, concept, cloud, ..."
2,cloud_computing,"[cloud, computing, considered, essential, real...","[cloud, computing, considered, essential, real..."
3,cloud_computing,"[provide, overview, cloud, computing, program,...","[cloud, computing, program, moringa, school, o..."
4,cloud_computing,"[eligibility, requirement, cloud, computing, p...","[eligible, cloud, computing, program, student,..."


In [45]:
df['Tokenized_Question'].iloc[0]

['fundamental', 'starting', 'point', 'understanding', 'cloud', 'computing']

In [14]:
# Vectorization using TF-IDF for both 'question' and 'response'
tfidf_vectorizer_combined = TfidfVectorizer()
X_tfidf_combined = tfidf_vectorizer_combined.fit_transform(
    df.apply(lambda row: ' '.join(row['Tokenized_Question'] + row['Tokenized_Response']), axis=1)
)

## XGBOOST MODEL

In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_combined, df['tag'], test_size=0.2, random_state=42)

# Train a model 
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.39166666666666666
Classification Report:
                                                          precision    recall  f1-score   support

                           Advanced_Data_Science_Topics       0.00      0.00      0.00         1
                                                Contact       0.50      0.50      0.50         2
                     Cybersecurity_Career_Opportunities       0.50      0.33      0.40         3
                           Cybersecurity_Certifications       0.00      0.00      0.00         2
                       Cybersecurity_Course_Information       0.00      0.00      0.00         1
                                Data_Ethics_and_Privacy       0.00      0.00      0.00         2
            Data_Science_Admission_Eligibility_Criteria       1.00      0.50      0.67         2
                       Data_Science_Application_Process       0.00      0.00      0.00         0
                  Data_Science_Applications_in_Business       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar_response(user_input, df):
    preprocessed_input = preprocess_input(user_input)

    # Vectorize the user input
    vectorized_input = tfidf_vectorizer_combined.transform([preprocessed_input])

    # Vectorize all questions in the DataFrame
    vectorized_questions = tfidf_vectorizer_combined.transform(
        df.apply(lambda row: ' '.join(row['Tokenized_Question'] + row['Tokenized_Response']), axis=1)
    )

    # Calculate cosine similarity between the user input and all questions
    similarities = cosine_similarity(vectorized_input, vectorized_questions).flatten()

    # Get the index of the most similar question
    most_similar_index = similarities.argmax()

    # Retrieve the corresponding response
    most_similar_response = df.loc[most_similar_index, 'response']

    return most_similar_response

# Example usage
user_input = "How does Moringa School support graduates in their journey post-education?"
chatbot_response = get_most_similar_response(user_input, df)
print("Chatbot Response:", chatbot_response)

Chatbot Response: moringa school supports graduates in their journey post education by providing ongoing mentorship  access to resources  and a supportive alumni community 


## Hyperparameter Tuning for XGBoost

In [29]:
# Define the parameter grid
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Initialize the XGBoost classifier
xgb_model = XGBClassifier()

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=5, scoring='accuracy')

# Perform grid search
grid_search_xgb.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best Parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best Accuracy for XGBoost:", grid_search_xgb.best_score_)



Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
Best Accuracy for XGBoost: 0.4385087719298246


In [42]:
def get_best_response(user_input, model, vectorizer, df):
    preprocessed_input = preprocess_input(user_input)

    # Vectorize the user input
    vectorized_input = vectorizer.transform([preprocessed_input])

    # Predict the category (tag) using the XGBoost model
    predicted_category = model.predict(vectorized_input)[0]

    # Filter based on the predicted category in the original DataFrame
    response_df = df[df['tag'] == predicted_category]

    if not response_df.empty:
        # Retrieve the response associated with the predicted category
        best_response = response_df.iloc[0]['response']
        return best_response
    else:
        return "I'm sorry, I don't have information on that topic."

# Example usage
user_input = "What is the AWS Cloud course at Moringa School about?"
best_response = get_best_response(user_input, grid_search_xgb.best_estimator_, tfidf_vectorizer_combined, df)
print("Best Response:", best_response)

Best Response: the aws cloud course at moringa school is designed to provide participants with a comprehensive understanding of cloud computing using amazon web services  aws  


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar_response(user_input, df):
    preprocessed_input = preprocess_input(user_input)

    # Vectorize the user input
    vectorized_input = tfidf_vectorizer_combined.transform([preprocessed_input])

    # Vectorize all questions in the DataFrame
    vectorized_questions = tfidf_vectorizer_combined.transform(
        df.apply(lambda row: ' '.join(row['Tokenized_Question'] + row['Tokenized_Response']), axis=1)
    )

    # Calculate cosine similarity between the user input and all questions
    similarities = cosine_similarity(vectorized_input, vectorized_questions).flatten()

    # Get the index of the most similar question
    most_similar_index = similarities.argmax()

    # Retrieve the corresponding response
    most_similar_response = df.loc[most_similar_index, 'response']

    return most_similar_response

# Example usage
user_input = "How does Moringa School support graduates in their journey post-education?"
chatbot_response = get_most_similar_response(user_input, df)
print("Chatbot Response:", chatbot_response)

In [32]:
# Predictions on the test set
y_pred_xgb = grid_search_xgb.predict(X_test)

# Evaluate the XGBoost model
accuracy_xgb = grid_search_xgb.score(X_test, y_test)
print("Accuracy of the Best XGBoost Model:", accuracy_xgb)

# Classification Report
print("\nClassification Report for XGBoost:")
print(classification_report(y_test, y_pred_xgb))

Accuracy of the Best XGBoost Model: 0.39166666666666666

Classification Report for XGBoost:
                                                         precision    recall  f1-score   support

                           Advanced_Data_Science_Topics       0.00      0.00      0.00         1
                                                Contact       0.50      0.50      0.50         2
                     Cybersecurity_Career_Opportunities       0.50      0.33      0.40         3
                           Cybersecurity_Certifications       0.00      0.00      0.00         2
                       Cybersecurity_Course_Information       0.00      0.00      0.00         1
                                Data_Ethics_and_Privacy       0.00      0.00      0.00         2
            Data_Science_Admission_Eligibility_Criteria       1.00      0.50      0.67         2
                       Data_Science_Application_Process       0.00      0.00      0.00         0
                  Data_Science_App

## MULTINOMIAL MODEL

In [23]:
# Train a multinomial model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [24]:
# Make predictions
predictions = model.predict(X_test)

In [25]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.2833333333333333
Classification Report:
                                                          precision    recall  f1-score   support

                           Advanced_Data_Science_Topics       0.00      0.00      0.00         1
                                                Contact       0.00      0.00      0.00         2
                     Cybersecurity_Career_Opportunities       0.00      0.00      0.00         3
                           Cybersecurity_Certifications       0.00      0.00      0.00         2
                       Cybersecurity_Course_Information       0.20      1.00      0.33         1
                                Data_Ethics_and_Privacy       0.00      0.00      0.00         2
            Data_Science_Admission_Eligibility_Criteria       0.00      0.00      0.00         2
                  Data_Science_Applications_in_Business       0.00      0.00      0.00         2
                                    Data_Science_Career       0.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
def preprocess_input(user_input):
    user_input = re.sub('[^a-zA-Z0-9]', ' ', user_input.lower())
    tokenized_input = word_tokenize(user_input)
    filtered_input = [word for word in tokenized_input if word.lower() not in stop_words]
    lemmatized_input = [lemmatizer.lemmatize(word) for word in filtered_input]
    return ' '.join(lemmatized_input)

In [45]:
def get_chatbot_response(user_input, df):
    preprocessed_input = preprocess_input(user_input)
    vectorized_input = tfidf_vectorizer.transform([preprocessed_input])
    category_prediction = model.predict(vectorized_input)[0]

    # Filter based on the predicted category in the original DataFrame
    response_df = df[df['Tag'] == category_prediction]

    if not response_df.empty:
        # Assuming you want a random response from the possible responses
        chatbot_response = np.random.choice(response_df['Response'].iloc[0])
        return chatbot_response
    else:
        return "I'm sorry, I don't have information on that topic."

# Example usage
user_input = "How does Moringa School support graduates in their journey post-education?"
chatbot_response = get_chatbot_response(user_input, df)
print("Chatbot Response:", chatbot_response)

Chatbot Response: Moringa School can help assess your needs and find tech talent for your business by connecting you with our pool of skilled graduates.


## Hyperparameter Tuning for Multinomial Naive Bayes

In [33]:
# Define the parameter grid
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 2.0]
}

# Initialize the Multinomial Naive Bayes classifier
nb_model = MultinomialNB()

# Initialize GridSearchCV
grid_search_nb = GridSearchCV(nb_model, param_grid=param_grid_nb, cv=5, scoring='accuracy')

# Perform grid search
grid_search_nb.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best Parameters for Multinomial Naive Bayes:", grid_search_nb.best_params_)
print("Best Accuracy for Multinomial Naive Bayes:", grid_search_nb.best_score_)



Best Parameters for Multinomial Naive Bayes: {'alpha': 0.1}
Best Accuracy for Multinomial Naive Bayes: 0.39677631578947364


## RandomForest Model

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_combined, df['tag'], test_size=0.2, random_state=42)

In [69]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Model Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

Random Forest Model Accuracy: 0.4583333333333333
Classification Report:
                                                          precision    recall  f1-score   support

                           Advanced_Data_Science_Topics       0.50      1.00      0.67         1
                                                Contact       0.50      0.50      0.50         2
                     Cybersecurity_Career_Opportunities       0.00      0.00      0.00         3
                           Cybersecurity_Certifications       0.50      0.50      0.50         2
                       Cybersecurity_Course_Information       0.50      1.00      0.67         1
                                Data_Ethics_and_Privacy       1.00      0.50      0.67         2
            Data_Science_Admission_Eligibility_Criteria       1.00      0.50      0.67         2
                       Data_Science_Application_Process       0.00      0.00      0.00         0
                  Data_Science_Applications_in_Busine

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter Tuning for Random Forest

In [34]:
# Define the parameter grid
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier()

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(rf_model, param_grid=param_grid_rf, cv=5, scoring='accuracy')

# Perform grid search
grid_search_rf.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Accuracy for Random Forest:", grid_search_rf.best_score_)



Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Accuracy for Random Forest: 0.5114912280701753


In [46]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(**grid_search_rf.best_params_)
rf_model.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=5, n_estimators=200)

In [47]:
y_pred = rf_model.predict(X_test)
rf_model.score(X_test, y_test)

0.43333333333333335

In [None]:
# Make predictions
rf_predictions = rf_model.predict(X_test)

In [68]:
# Evaluate the model
print("Random Forest Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Random Forest Model Accuracy: 0.43333333333333335
Classification Report:
                                                          precision    recall  f1-score   support

                           Advanced_Data_Science_Topics       0.50      1.00      0.67         1
                                                Contact       1.00      0.50      0.67         2
                     Cybersecurity_Career_Opportunities       0.00      0.00      0.00         3
                           Cybersecurity_Certifications       0.50      0.50      0.50         2
                       Cybersecurity_Course_Information       0.50      1.00      0.67         1
                                Data_Ethics_and_Privacy       1.00      0.50      0.67         2
            Data_Science_Admission_Eligibility_Criteria       0.50      0.50      0.50         2
                  Data_Science_Applications_in_Business       0.00      0.00      0.00         2
                                    Data_Science_Car

In [52]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(min_samples_split=5, n_estimators=200)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Model Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

Random Forest Model Accuracy: 0.4666666666666667
Classification Report:
                                                          precision    recall  f1-score   support

                           Advanced_Data_Science_Topics       0.50      1.00      0.67         1
                                                Contact       1.00      0.50      0.67         2
                     Cybersecurity_Career_Opportunities       0.00      0.00      0.00         3
                           Cybersecurity_Certifications       0.50      0.50      0.50         2
                       Cybersecurity_Course_Information       0.50      1.00      0.67         1
                                Data_Ethics_and_Privacy       1.00      0.50      0.67         2
            Data_Science_Admission_Eligibility_Criteria       1.00      0.50      0.67         2
                  Data_Science_Applications_in_Business       0.00      0.00      0.00         2
                                    Data_Science_Care

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Vectorization using TF-IDF for both 'question' and 'response'
tfidf_vectorizer_combined = TfidfVectorizer()
X_tfidf_combined = tfidf_vectorizer_combined.fit_transform(
    df.apply(lambda row: ' '.join(row['Tokenized_Question'] + row['Tokenized_Response']), axis=1)
)

In [65]:
text = ['how does moringa admision process work']
# Tokenize each string in the list
tokenized_text = [nltk.word_tokenize(sentence) for sentence in text]

# Flatten the list of lists to get a list of words
flattened_text = [word for sentence_tokens in tokenized_text for word in sentence_tokens]

text = ' '.join([lemmatizer.lemmatize(word) for word in flattened_text])
#text = text.lower().replace('[^a-zA-Z0-9]', ' ')
text = ' '.join([word for word in text if word not in stop_words])
                            
vectorized = tfidf_vectorizer_combined.transform(text)

ValueError: Iterable over raw text documents expected, string object received.

In [67]:
texts = ['how does moringa admission process work']

# Tokenize and preprocess each string in the list
tokenized_texts = [
    ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(sentence) if word not in stop_words])
    for sentence in texts
]

# Vectorize using TF-IDF
vectorized = tfidf_vectorizer_combined.transform(tokenized_texts)
vectorized

<1x1604 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>