In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sklearn
import seaborn as sns
# Natural Language Processing (NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Text Vectorization and Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix,
    f1_score, precision_score, recall_score,
    roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score
)
from xgboost import XGBClassifier



In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dennis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dennis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dennis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
with open('../Final_Intents.json', 'r') as file:
    data = json.load(file)

In [4]:
# Function to pair each question with its corresponding response
def pair_questions_responses(data):
    paired_data = []
    for item in data:
        tag = item.get('tag', 'Unknown')
        questions = item.get('questions', [])
        responses = item.get('responses', [])

        for question, response in zip(questions, responses):
            paired_data.append({'tag': tag, 'question': question, 'response': response})

    return paired_data

In [5]:
# Convert the JSON data into a DataFrame
paired_data = pair_questions_responses(data)
df = pd.DataFrame(paired_data)
df

Unnamed: 0,tag,question,response
0,Data Science,What does the data science course at Moringa S...,The data science course at Moringa School cove...
1,Data Science,Can you provide details about the curriculum a...,The curriculum and modules in the data science...
2,Data Science,"How long is the data science course, and what ...",The duration of the data science course is fle...
3,Data Science,Tell me about the practical aspects of the dat...,Practical aspects of the data science learning...
4,Data Science,Are there any prerequisites for enrolling in t...,Prerequisites for enrolling in the data scienc...
...,...,...,...
510,Miscellaneous,Is there a limit on the number of financial ai...,The teaching model is hands-on and project-bas...
511,Miscellaneous,How does the Soma Education Loan work?,Students will gain insight into cross-platform...
512,Miscellaneous,Are there repayment options for student loans?,"Upon completion of the course, students will r..."
513,Miscellaneous,What support is available for students applyin...,All classes are fully online. Lecture sessions...


In [6]:
# Text Cleaning
# Remove any unnecessary characters, symbols, or special characters.
# Convert text to lowercase to ensure uniformity.
df['question'] = df['question'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
df['response'] = df['response'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
df['tag'] = df['tag'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)

# Display the cleaned DataFrame
df.head()

Unnamed: 0,tag,question,response
0,data science,what does the data science course at moringa s...,the data science course at moringa school cove...
1,data science,can you provide details about the curriculum a...,the curriculum and modules in the data science...
2,data science,how long is the data science course and what ...,the duration of the data science course is fle...
3,data science,tell me about the practical aspects of the dat...,practical aspects of the data science learning...
4,data science,are there any prerequisites for enrolling in t...,prerequisites for enrolling in the data scienc...


In [7]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

QnAs = []
tags = []

for intent in data:
    for question in intent["questions"]:
        QnAs.append(question)
        tags.append(intent['tag'])
    for response in intent["responses"]:
        QnAs.append(response)
        tags.append(intent['tag'])
# Preprocess the text by lemmatizing and lowercasing

qna_df = pd.DataFrame({"QnAs": QnAs, "tags": tags})

qna_df["QnAs"] = qna_df["QnAs"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word.lower()) for word in x.split()]))

qna_df = qna_df.reindex(np.random.permutation(qna_df.index))
#qna_df = preprocess_qnas(qna_df)
qna_df.head()

Unnamed: 0,QnAs,tags
347,"while maintaining a structured approach, we pr...",Data Science
730,"while there may not be strict prerequisites, a...",Cloud Computing
445,are there any discount available for early pay...,DevOps
250,applicant must be above 18 year of age to enro...,Data Science
43,what document do i need to apply for the course?,Data Science


In [8]:
# Tokenize the 'QnAs' column
qna_df["QnAs"] = qna_df["QnAs"].apply(word_tokenize)

# Display the DataFrame with tokenized text
qna_df.head()

Unnamed: 0,QnAs,tags
347,"[while, maintaining, a, structured, approach, ...",Data Science
730,"[while, there, may, not, be, strict, prerequis...",Cloud Computing
445,"[are, there, any, discount, available, for, ea...",DevOps
250,"[applicant, must, be, above, 18, year, of, age...",Data Science
43,"[what, document, do, i, need, to, apply, for, ...",Data Science


In [9]:
# Load the English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from the 'QnAs_tokens' column
qna_df["QnAs"] = qna_df["QnAs"].apply(lambda tokens: [word for word in tokens if word.lower() not in stop_words])

# Display the DataFrame with removed stopwords
qna_df

Unnamed: 0,QnAs,tags
347,"[maintaining, structured, approach, ,, provide...",Data Science
730,"[may, strict, prerequisites, ,, basic, underst...",Cloud Computing
445,"[discount, available, early, payment, tuition,...",DevOps
250,"[applicant, must, 18, year, age, enroll, data,...",Data Science
43,"[document, need, apply, course, ?]",Data Science
...,...,...
911,"[tell, software, engineering, mobile, track, m...",Miscellaneous
365,"[certification, well-regarded, industry, ,, re...",Data Science
933,"[find, testimonial, individual, completed, spe...",Miscellaneous
435,"[specific, role, devops, professionals, ?]",DevOps


In [10]:
# Vectorization using TF-IDF for QnAs
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(
    qna_df["QnAs"].apply(lambda row: ' '.join(row))
)

## XGBOOST MODEL

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, qna_df['tags'], test_size=0.2, random_state=42)

# Train a model 
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.8317307692307693
Classification Report:
                       precision    recall  f1-score   support

     Cloud Computing       0.93      1.00      0.97        14
            Contacts       0.25      0.50      0.33         2
       Cybersecurity       0.86      0.92      0.89        13
        Data Science       0.87      0.84      0.86        90
              DevOps       1.00      0.56      0.71         9
          Enrollment       0.86      0.92      0.89        13
       Miscellaneous       0.72      0.64      0.68        28
  Mobile Development       0.91      0.91      0.91        11
Software Engineering       0.76      0.92      0.83        24
               UI/UX       0.75      0.75      0.75         4

            accuracy                           0.83       208
           macro avg       0.79      0.80      0.78       208
        weighted avg       0.84      0.83      0.83       208



In [12]:
model.__dict__

{'n_estimators': 100,
 'objective': 'multi:softprob',
 'max_depth': None,
 'learning_rate': None,
 'verbosity': None,
 'booster': None,
 'tree_method': None,
 'gamma': None,
 'min_child_weight': None,
 'max_delta_step': None,
 'subsample': None,
 'colsample_bytree': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'base_score': None,
 'missing': nan,
 'num_parallel_tree': None,
 'kwargs': {},
 'random_state': None,
 'n_jobs': None,
 'monotone_constraints': None,
 'interaction_constraints': None,
 'importance_type': 'gain',
 'gpu_id': None,
 'validate_parameters': None,
 'classes_': array(['Cloud Computing', 'Contacts', 'Cybersecurity', 'Data Science',
        'DevOps', 'Enrollment', 'Miscellaneous', 'Mobile Development',
        'Software Engineering', 'UI/UX'], dtype=object),
 'n_classes_': 10,
 '_le': XGBoostLabelEncoder(),
 '_features_count': 1637,
 'n_features_in_': 1637,
 '_Booster': <xgboost.core.Boo

## Hyperparameter Tuning for XGBoost

In [13]:
# Define the parameter grid
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Initialize the XGBoost classifier
xgb_model = XGBClassifier()

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=5, scoring='accuracy')

# Perform grid search
grid_search_xgb.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best Parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best Accuracy for XGBoost:", grid_search_xgb.best_score_)

Best Parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best Accuracy for XGBoost: 0.829326888391891


## MULTINOMIAL MODEL

In [14]:
# Train a multinomial model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [15]:
# Make predictions
predictions = model.predict(X_test)

In [16]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.5865384615384616
Classification Report:
                       precision    recall  f1-score   support

     Cloud Computing       1.00      0.29      0.44        14
            Contacts       0.00      0.00      0.00         2
       Cybersecurity       0.00      0.00      0.00        13
        Data Science       0.53      0.99      0.69        90
              DevOps       1.00      0.33      0.50         9
          Enrollment       0.90      0.69      0.78        13
       Miscellaneous       0.78      0.25      0.38        28
  Mobile Development       1.00      0.09      0.17        11
Software Engineering       0.64      0.38      0.47        24
               UI/UX       0.00      0.00      0.00         4

            accuracy                           0.59       208
           macro avg       0.59      0.30      0.34       208
        weighted avg       0.63      0.59      0.51       208



  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter Tuning for Multinomial Naive Bayes

In [17]:
# Define the parameter grid
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 2.0]
}

# Initialize the Multinomial Naive Bayes classifier
nb_model = MultinomialNB()

# Initialize GridSearchCV
grid_search_nb = GridSearchCV(nb_model, param_grid=param_grid_nb, cv=5, scoring='accuracy')

# Perform grid search
grid_search_nb.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best Parameters for Multinomial Naive Bayes:", grid_search_nb.best_params_)
print("Best Accuracy for Multinomial Naive Bayes:", grid_search_nb.best_score_)

Best Parameters for Multinomial Naive Bayes: {'alpha': 0.1}
Best Accuracy for Multinomial Naive Bayes: 0.7535747781545343


## RandomForest Model

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, qna_df['tags'], test_size=0.2, random_state=42)

In [19]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Model Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

Random Forest Model Accuracy: 0.8461538461538461
Classification Report:
                       precision    recall  f1-score   support

     Cloud Computing       1.00      1.00      1.00        14
            Contacts       0.50      0.50      0.50         2
       Cybersecurity       1.00      0.77      0.87        13
        Data Science       0.85      0.93      0.89        90
              DevOps       1.00      0.67      0.80         9
          Enrollment       0.92      0.92      0.92        13
       Miscellaneous       0.73      0.68      0.70        28
  Mobile Development       0.71      0.91      0.80        11
Software Engineering       0.84      0.67      0.74        24
               UI/UX       0.80      1.00      0.89         4

            accuracy                           0.85       208
           macro avg       0.84      0.80      0.81       208
        weighted avg       0.85      0.85      0.84       208



## Hyperparameter Tuning for Random Forest

In [20]:
# Define the parameter grid
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier()

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(rf_model, param_grid=param_grid_rf, cv=5, scoring='accuracy')

# Perform grid search
grid_search_rf.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Accuracy for Random Forest:", grid_search_rf.best_score_)

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy for Random Forest: 0.842558256980016


In [21]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(**grid_search_rf.best_params_)
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [22]:
y_pred = rf_model.predict(X_test)
rf_model.score(X_test, y_test)

0.8365384615384616

In [23]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(min_samples_split=5, n_estimators=200)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Model Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

Random Forest Model Accuracy: 0.8557692307692307
Classification Report:
                       precision    recall  f1-score   support

     Cloud Computing       1.00      1.00      1.00        14
            Contacts       1.00      0.50      0.67         2
       Cybersecurity       1.00      0.77      0.87        13
        Data Science       0.85      0.94      0.89        90
              DevOps       1.00      0.67      0.80         9
          Enrollment       0.92      0.92      0.92        13
       Miscellaneous       0.79      0.68      0.73        28
  Mobile Development       0.83      0.91      0.87        11
Software Engineering       0.75      0.75      0.75        24
               UI/UX       0.75      0.75      0.75         4

            accuracy                           0.86       208
           macro avg       0.89      0.79      0.83       208
        weighted avg       0.86      0.86      0.85       208



In [24]:
import string

def clean_text(sentence):
    sentence = sentence.lower()
    sentence = word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    
    sentence = [word for word in sentence if word not in stop_words]
    
    return sentence

# Test text
text = "How long does cyber security take ?"

# Clean and preprocess the test text
prep_text = clean_text(text)

# Use the same fitted TfidfVectorizer from your training code
vectorized_txt = tfidf_vectorizer.transform([' '.join(prep_text)])

# Predict using the trained model
pred = model.predict(vectorized_txt)

print(pred)

['Data Science']


In [28]:
# Vectorization using TF-IDF for both 'question' and 'response'
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(
    qna_df["QnAs"].apply(lambda row: ' '.join(row))
)

In [54]:
text = ['How is the curriculum at Moringa School tailored to market-aligned skills in Cloud Computing ?']
# Tokenize each string in the list
tokenized_text = [nltk.word_tokenize(sentence) for sentence in text]

# Flatten the list of lists to get a list of words
flattened_text = [word for sentence_tokens in tokenized_text for word in sentence_tokens]

text = ' '.join([lemmatizer.lemmatize(word) for word in flattened_text])
#text = text.lower().replace('[^a-zA-Z0-9]', ' ')
text = ' '.join([word for word in text if word not in stop_words])
                            
vectorized = tfidf_vectorizer.transform([text])

In [55]:
model.predict(vectorized)

array(['Data Science'], dtype='<U20')

In [46]:
# Example for checking tokenized input
input_text = 'How is the curriculum at Moringa School tailored to market-aligned skills in Cloud Computing ?'
tokenized_input = word_tokenize(input_text.lower())
lemmatized_input = [lemmatizer.lemmatize(word) for word in tokenized_input if word not in stop_words]

print("Tokenized and Lemmatized Input:", lemmatized_input)

# Vectorize the input
vectorized_input = tfidf_vectorizer.transform([' '.join(lemmatized_input)])

# Make predictions
predictions = model.predict(vectorized_input)
print("Predicted Category:", predictions)

Tokenized and Lemmatized Input: ['curriculum', 'moringa', 'school', 'tailored', 'market-aligned', 'skill', 'cloud', 'computing', '?']
Predicted Category: ['Data Science']


In [51]:
# Example input text
text = ['How long does cyber security take ?']

# Tokenize and lemmatize
tokenized_text = [nltk.word_tokenize(sentence) for sentence in text]
flattened_text = [word for sentence_tokens in tokenized_text for word in sentence_tokens]
processed_text = ' '.join([lemmatizer.lemmatize(word) for word in flattened_text])

# Remove stopwords
processed_text = ' '.join([word for word in processed_text.split() if word not in stop_words])

# Transform using the same TF-IDF vectorizer
vectorized_text = tfidf_vectorizer.transform([processed_text])

# Make predictions
predictions = model.predict(vectorized_text)
print(predictions)

['Data Science']


In [49]:
probabilities = model.predict_proba(vectorized_input)
print("Raw Probabilities:", probabilities)

Raw Probabilities: [[0.07300233 0.00998406 0.04632034 0.533577   0.03112474 0.05381455
  0.1375134  0.03749257 0.05902109 0.01814993]]


In [50]:
print("Class Labels:", model.classes_)

Class Labels: ['Cloud Computing' 'Contacts' 'Cybersecurity' 'Data Science' 'DevOps'
 'Enrollment' 'Miscellaneous' 'Mobile Development' 'Software Engineering'
 'UI/UX']
