# **PSYCHOMETRIC MODEL**


In [1]:
# installing the necessary library
import pandas as pd
import numpy as  np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import joblib
from joblib import dump,load

In [2]:
# Getting the file path
def get_filepath(sub_dir,filename):
    base_dir = os.getcwd()
    full_path = os.path.join(base_dir, sub_dir,filename)
    return full_path

In [3]:
# Reading in the file

filepath = get_filepath("DATA", "Data_Psychometric_1000_updated.csv")
df = pd.read_csv(filepath)

In [4]:
# df = pd.read_csv('/content/Data_Psychometric_1000 (2).csv')

In [5]:
#printing the first 5 columns
df.head()

Unnamed: 0,Mathematical Aptitude,Logical Reasoning,Openness,Conscientiousness,Emotional Stability,Agreeableness/ Assertiveness,Creativity,Aptitude,Programming,Design Software,3D Skills,Web Development Confidence,Suggested Tracks
0,Average,Excellent,Open,Moderate,Calm,In-Between,Creative,Moderately Techincally Inclined,Advanced,Proficient,Not Experienced,Highly Confident,Product Management
1,Excellent,Bad,Somewhat Open,Not,Calm,Not Cooperative,Not Creative,Moderately Techincally Inclined,Advanced,Not Proficient,Not Experienced,Moderately Confident,Blockchain
2,Bad,Excellent,Not Open,Not,Overwhelmed,In-Between,Highly Creative,Not Technically Inclined,Advanced,Not Proficient,Highly Experienced,Moderately Confident,Blockchain
3,Bad,Excellent,Not Open,Not,Overwhelmed,Cooperative,Not Creative,Technically Inclined,Basic,Not Proficient,Highly Experienced,Not Confident,Blockchain
4,Excellent,Bad,Not Open,Moderate,Calm,In-Between,Not Creative,Moderately Techincally Inclined,Basic,Not Proficient,Moderately Experienced,Not Confident,Blockchain


In [6]:
#the shape of the data
df.shape

(1000, 13)

In [7]:
#printing the general information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Mathematical Aptitude         1000 non-null   object
 1   Logical Reasoning             1000 non-null   object
 2   Openness                      1000 non-null   object
 3   Conscientiousness             1000 non-null   object
 4   Emotional Stability           1000 non-null   object
 5   Agreeableness/ Assertiveness  1000 non-null   object
 6   Creativity                    1000 non-null   object
 7   Aptitude                      1000 non-null   object
 8   Programming                   1000 non-null   object
 9   Design Software               1000 non-null   object
 10  3D Skills                     1000 non-null   object
 11  Web Development Confidence    1000 non-null   object
 12  Suggested Tracks              1000 non-null   object
dtypes: object(13)
memor

In [8]:
# General description of the columns
df.describe()

Unnamed: 0,Mathematical Aptitude,Logical Reasoning,Openness,Conscientiousness,Emotional Stability,Agreeableness/ Assertiveness,Creativity,Aptitude,Programming,Design Software,3D Skills,Web Development Confidence,Suggested Tracks
count,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000
unique,3,3,3,3,3,3,3,3,3,3,3,3,5
top,Excellent,Bad,Open,Moderate,Overwhelmed,In-Between,Creative,Technically Inclined,Intermediate,Highly Proficient,Not Experienced,Highly Confident,Blockchain
freq,345,351,341,360,340,348,357,357,348,341,347,340,556


In [9]:
# Checking for null values
df.isnull().sum()

Mathematical Aptitude           0
Logical Reasoning               0
Openness                        0
Conscientiousness               0
Emotional Stability             0
Agreeableness/ Assertiveness    0
Creativity                      0
Aptitude                        0
Programming                     0
Design Software                 0
3D Skills                       0
Web Development Confidence      0
Suggested Tracks                0
dtype: int64

In [10]:
df['Aptitude'].unique()

array(['Moderately Techincally Inclined', 'Not Technically Inclined',
       'Technically Inclined'], dtype=object)

In [11]:
# Duplicating the data to make use of the copy df2
df2 = df

In [12]:
#splitting the data into Feature `X` and Target `y`
X = df.drop(columns=["Suggested Tracks"])
y = df["Suggested Tracks"]

In [13]:
print(y.unique())

['Product Management' 'Blockchain' 'Cyber Security, Data Science'
 '3D Animation Skills, Product Design'
 'Mobile Application, Software Development']


In [14]:
# Encode each categorical column using LabelEncoder
for col in X.columns:
    if X[col].dtype == object:  # Check for object type (categorical data)
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col])
        #sanitize column name to remove spaces or slashes that could lead to error while saving
        safe_col_name = col.replace(' ','_').replace('/','_').replace('\\','_').replace('-','_')
        # Construct the file path using the get_filepath function for better compatibility
        file_path = get_filepath("DATA", f"{safe_col_name}_encoder.joblib")
        # save each encoder with sanitized name
        joblib.dump(encoder, file_path)

        # Get encoding mapping for the current column
        encoding_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        print(f"Encoding mapping for {col}: {encoding_mapping}")


Encoding mapping for Mathematical Aptitude: {'Average': 0, 'Bad': 1, 'Excellent': 2}
Encoding mapping for Logical Reasoning: {'Average': 0, 'Bad': 1, 'Excellent': 2}
Encoding mapping for Openness: {'Not Open': 0, 'Open': 1, 'Somewhat Open': 2}
Encoding mapping for Conscientiousness: {'Highly': 0, 'Moderate': 1, 'Not': 2}
Encoding mapping for Emotional Stability: {'Calm': 0, 'Overwhelmed': 1, 'Pressured (Composed)': 2}
Encoding mapping for Agreeableness/ Assertiveness: {'Cooperative': 0, 'In-Between': 1, 'Not Cooperative': 2}
Encoding mapping for Creativity: {'Creative': 0, 'Highly Creative': 1, 'Not Creative': 2}
Encoding mapping for Aptitude: {'Moderately Techincally Inclined': 0, 'Not Technically Inclined': 1, 'Technically Inclined': 2}
Encoding mapping for Programming: {'Advanced': 0, 'Basic': 1, 'Intermediate': 2}
Encoding mapping for Design Software: {'Highly Proficient': 0, 'Not Proficient': 1, 'Proficient': 2}
Encoding mapping for 3D Skills: {'Highly Experienced': 0, 'Moderately

In [15]:
# Initialize RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Convert the resampled data back to DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Suggested Tracks'] = y_resampled

In [16]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_resampled.drop(columns=['Suggested Tracks']), y_resampled, test_size=0.2, random_state=42)

In [48]:
print(X_train.dtypes)
#print(y_train.dtypes)

Mathematical Aptitude           int64
Logical Reasoning               int64
Openness                        int64
Conscientiousness               int64
Emotional Stability             int64
Agreeableness/ Assertiveness    int64
Creativity                      int64
Aptitude                        int64
Programming                     int64
Design Software                 int64
3D Skills                       int64
Web Development Confidence      int64
dtype: object


# MODEL TRAINING

# RANDOM FOREST

In [19]:
#initializing the model
rfc = RandomForestClassifier()

In [20]:
# Stating the parameters for hyperparameter tuning
param_grid = {
    'n_estimators': [150, 200, 300, 500],
    'min_samples_split': [5, 10, 15],
    'max_depth': [10, 13, 15, 17, 20],
    'min_samples_leaf': [2, 4, 5, 6],
    'criterion': ['gini', 'entropy'],
}

In [21]:
# fitting the model to the data
rf_grid_search = RandomizedSearchCV(RandomForestClassifier(),param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)

# Using the best estimator to predict the test data
rf_best_model = rf_grid_search.best_estimator_
rf_predictions = rf_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", rf_grid_search.best_params_)


Best Random Forest Model Parameters: {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10, 'criterion': 'entropy'}


In [22]:
Classification_Report = classification_report(y_test, rf_predictions)
print(Classification_Report)

                                          precision    recall  f1-score   support

     3D Animation Skills, Product Design       0.99      1.00      1.00       112
                              Blockchain       1.00      0.90      0.95       110
            Cyber Security, Data Science       0.94      1.00      0.97       112
Mobile Application, Software Development       0.98      1.00      0.99       113
                      Product Management       0.99      1.00      1.00       109

                                accuracy                           0.98       556
                               macro avg       0.98      0.98      0.98       556
                            weighted avg       0.98      0.98      0.98       556



# DECISION TREE

In [23]:
dtc = DecisionTreeClassifier()

In [24]:
param_grid = {
    'min_samples_split': [5, 10, 15],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

In [25]:
dtc_grid_search = RandomizedSearchCV(DecisionTreeClassifier(),param_grid, cv=5)
dtc_grid_search.fit(X_train, y_train)

#Predicting the test data
dtc_best_model = dtc_grid_search.best_estimator_
dtc_predictions = dtc_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", dtc_grid_search.best_params_)

Best Random Forest Model Parameters: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'criterion': 'entropy'}


In [26]:
Classification_Report = classification_report(y_test, dtc_predictions)
print(Classification_Report)

                                          precision    recall  f1-score   support

     3D Animation Skills, Product Design       0.97      1.00      0.98       112
                              Blockchain       1.00      0.89      0.94       110
            Cyber Security, Data Science       0.96      1.00      0.98       112
Mobile Application, Software Development       0.97      1.00      0.99       113
                      Product Management       1.00      1.00      1.00       109

                                accuracy                           0.98       556
                               macro avg       0.98      0.98      0.98       556
                            weighted avg       0.98      0.98      0.98       556



# KNN

In [27]:
knn = KNeighborsClassifier()

In [28]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1,2],
}

In [29]:
knn_grid_search = RandomizedSearchCV(KNeighborsClassifier(),param_grid, cv=5)
knn_grid_search.fit(X_train, y_train)
knn_best_model = knn_grid_search.best_estimator_
knn_predictions = knn_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", knn_grid_search.best_params_)

Best Random Forest Model Parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 5, 'algorithm': 'brute'}


In [30]:
Classification_Report = classification_report(y_test, knn_predictions)
print(Classification_Report)

                                          precision    recall  f1-score   support

     3D Animation Skills, Product Design       0.88      1.00      0.93       112
                              Blockchain       0.98      0.53      0.69       110
            Cyber Security, Data Science       0.90      1.00      0.95       112
Mobile Application, Software Development       0.86      0.97      0.91       113
                      Product Management       0.94      1.00      0.97       109

                                accuracy                           0.90       556
                               macro avg       0.91      0.90      0.89       556
                            weighted avg       0.91      0.90      0.89       556



# SVM

In [31]:
svm = SVC()

In [32]:
param_grid = {
    'C': [100,10, 1.0, 0.1, 0.001, 0.001],
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
}

In [33]:
svm_grid_search = RandomizedSearchCV(SVC(),param_grid, cv=5)
svm_grid_search.fit(X_train, y_train)
svm_best_model = svm_grid_search.best_estimator_
svm_predictions = svm_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", svm_grid_search.best_params_)

Best Random Forest Model Parameters: {'kernel': 'rbf', 'C': 10}


In [34]:
Classification_Report = classification_report(y_test, svm_predictions)
print(Classification_Report)

                                          precision    recall  f1-score   support

     3D Animation Skills, Product Design       0.98      1.00      0.99       112
                              Blockchain       1.00      0.93      0.96       110
            Cyber Security, Data Science       0.97      1.00      0.98       112
Mobile Application, Software Development       0.98      0.99      0.99       113
                      Product Management       0.99      1.00      1.00       109

                                accuracy                           0.98       556
                               macro avg       0.98      0.98      0.98       556
                            weighted avg       0.98      0.98      0.98       556



# LOGISTIC REGRESSION

In [35]:
lr = LogisticRegression()

In [36]:
param_grid = {
    'C': [100,10, 1.0, 0.1, 0.001, 0.001],
}

In [37]:
lr_grid_search = RandomizedSearchCV(LogisticRegression(),param_grid, cv=5)
lr_grid_search.fit(X_train, y_train)
lr_best_model = lr_grid_search.best_estimator_
lr_predictions = lr_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", lr_grid_search.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Random Forest Model Parameters: {'C': 0.1}


In [38]:
Classification_Report = classification_report(y_test, lr_predictions)
print(Classification_Report)

                                          precision    recall  f1-score   support

     3D Animation Skills, Product Design       0.44      0.24      0.31       112
                              Blockchain       0.28      0.09      0.14       110
            Cyber Security, Data Science       0.61      0.89      0.72       112
Mobile Application, Software Development       0.48      0.45      0.47       113
                      Product Management       0.53      0.91      0.67       109

                                accuracy                           0.52       556
                               macro avg       0.47      0.52      0.46       556
                            weighted avg       0.47      0.52      0.46       556



# NAIVE BAYES

In [39]:
# Train Naive Bayes (Multinomial) Classifier
nb_clf = MultinomialNB()

In [40]:

# Naive Bayes (Multinomial) Classifier hyperparameters grid
nb_param_grid = {'alpha': [0.1, 1, 10], 'fit_prior': [True, False]}

In [41]:

# Perform GridSearchCV for Naive Bayes (Multinomial)
nb_grid_search = RandomizedSearchCV(MultinomialNB(), nb_param_grid, cv=5)
nb_grid_search.fit(X_train, y_train)

nb_best_model = nb_grid_search.best_estimator_
nb_predictions = nb_best_model.predict(X_test)

print("Best Naive Bayes (Multinomial) Model Parameters:", nb_grid_search.best_params_)



Best Naive Bayes (Multinomial) Model Parameters: {'fit_prior': True, 'alpha': 10}


In [42]:
Classification_Report = classification_report(y_test, nb_predictions)
print(Classification_Report)

                                          precision    recall  f1-score   support

     3D Animation Skills, Product Design       0.40      0.25      0.31       112
                              Blockchain       0.18      0.05      0.07       110
            Cyber Security, Data Science       0.57      0.92      0.71       112
Mobile Application, Software Development       0.47      0.42      0.44       113
                      Product Management       0.48      0.77      0.59       109

                                accuracy                           0.48       556
                               macro avg       0.42      0.48      0.42       556
                            weighted avg       0.42      0.48      0.42       556



## ☄ **Model Deplyment**

---

Before deployment, the best model (Random Forest) will be serialized (saved) for export and deployment.

In [44]:
# #serializing and saving the best model

 # Construct the path to the file
file_path = os.path.join('DATA', 'rf_best_model.joblib')

# Save the best RandomForest model to a file
dump(rf_best_model, file_path)

['DATA/rf_best_model.joblib']