In [22]:
import pandas as pd
import numpy as  np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import joblib
from joblib import dump,load

In [23]:
import os
def get_filepath(sub_dir,filename):
    base_dir = os.getcwd()
    full_path = os.path.join(base_dir, sub_dir,filename)
    return full_path

In [24]:
filepath = get_filepath("DATA", "Data_Psychometric_1000_updated.csv")
df = pd.read_csv(filepath)

In [25]:
df.head()

Unnamed: 0,Mathematical Aptitude,Logical Reasoning,Openness,Conscientiousness,Emotional Stability,Agreeableness/ Assertiveness,Creativity,Aptitude,Programming,Design Software,3D Skills,Web Development Confidence,Suggested Tracks
0,Average,Excellent,Open,Moderate,Calm,In-Between,Creative,Moderately Techincally Inclined,Advanced,Proficient,Not Experienced,Highly Confident,Product Management
1,Excellent,Bad,Somewhat Open,Not,Calm,Not Cooperative,Not Creative,Moderately Techincally Inclined,Advanced,Not Proficient,Not Experienced,Moderately Confident,Blockchain
2,Bad,Excellent,Not Open,Not,Overwhelmed,In-Between,Highly Creative,Not Technically Inclined,Advanced,Not Proficient,Highly Experienced,Moderately Confident,Blockchain
3,Bad,Excellent,Not Open,Not,Overwhelmed,Cooperative,Not Creative,Technically Inclined,Basic,Not Proficient,Highly Experienced,Not Confident,Blockchain
4,Excellent,Bad,Not Open,Moderate,Calm,In-Between,Not Creative,Moderately Techincally Inclined,Basic,Not Proficient,Moderately Experienced,Not Confident,Blockchain


In [26]:
df.shape

(1000, 13)

In [27]:
#printing the general information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Mathematical Aptitude         1000 non-null   object
 1   Logical Reasoning             1000 non-null   object
 2   Openness                      1000 non-null   object
 3   Conscientiousness             1000 non-null   object
 4   Emotional Stability           1000 non-null   object
 5   Agreeableness/ Assertiveness  1000 non-null   object
 6   Creativity                    1000 non-null   object
 7   Aptitude                      1000 non-null   object
 8   Programming                   1000 non-null   object
 9   Design Software               1000 non-null   object
 10  3D Skills                     1000 non-null   object
 11  Web Development Confidence    1000 non-null   object
 12  Suggested Tracks              1000 non-null   object
dtypes: object(13)
memor

In [28]:
# General description of the columns
df.describe()

Unnamed: 0,Mathematical Aptitude,Logical Reasoning,Openness,Conscientiousness,Emotional Stability,Agreeableness/ Assertiveness,Creativity,Aptitude,Programming,Design Software,3D Skills,Web Development Confidence,Suggested Tracks
count,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000
unique,3,3,3,3,3,3,3,3,3,3,3,3,5
top,Excellent,Bad,Open,Moderate,Overwhelmed,In-Between,Creative,Technically Inclined,Intermediate,Highly Proficient,Not Experienced,Highly Confident,Blockchain
freq,345,351,341,360,340,348,357,357,348,341,347,340,556


In [29]:
# Checking for null values
df.isnull().sum()

Mathematical Aptitude           0
Logical Reasoning               0
Openness                        0
Conscientiousness               0
Emotional Stability             0
Agreeableness/ Assertiveness    0
Creativity                      0
Aptitude                        0
Programming                     0
Design Software                 0
3D Skills                       0
Web Development Confidence      0
Suggested Tracks                0
dtype: int64

In [30]:
df['Suggested Tracks'].unique() 

array(['Product Management', 'Blockchain', 'Cyber Security, Data Science',
       '3D Animation Skills, Product Design',
       'Mobile Application, Software Development'], dtype=object)

In [31]:
df2 = df

In [32]:
X = df2.drop(columns=["Suggested Tracks"])
y = df2["Suggested Tracks"]

In [33]:
print(y.unique())

['Product Management' 'Blockchain' 'Cyber Security, Data Science'
 '3D Animation Skills, Product Design'
 'Mobile Application, Software Development']


In [34]:
from sklearn.preprocessing import LabelEncoder


# Assuming X is your DataFrame after label encoding
# Encode each categorical column using LabelEncoder
for col in X.columns:
    if X[col].dtype == object:  # Check for object type (categorical data)
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col])
        
        #sanitize column name to remove spaces or slashes that could lead to error while saving
        safe_col_name = col.replace(' ','_').replace('/','_').replace('\\','_').replace('-','_')
        
        # Construct the file path using the get_filepath function for better compatibility
        file_path = get_filepath("DATA", f"{safe_col_name}_encoder.joblib")

        # save each encoder with sanitized name
        joblib.dump(encoder, file_path)
        
        # Get encoding mapping for the current column
        encoding_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        print(f"Encoding mapping for {col}: {encoding_mapping}")


Encoding mapping for Mathematical Aptitude: {'Average': 0, 'Bad': 1, 'Excellent': 2}
Encoding mapping for Logical Reasoning: {'Average': 0, 'Bad': 1, 'Excellent': 2}
Encoding mapping for Openness: {'Not Open': 0, 'Open': 1, 'Somewhat Open': 2}
Encoding mapping for Conscientiousness: {'Highly': 0, 'Moderate': 1, 'Not': 2}
Encoding mapping for Emotional Stability: {'Calm': 0, 'Overwhelmed': 1, 'Pressured (Composed)': 2}
Encoding mapping for Agreeableness/ Assertiveness: {'Cooperative': 0, 'In-Between': 1, 'Not Cooperative': 2}
Encoding mapping for Creativity: {'Creative': 0, 'Highly Creative': 1, 'Not Creative': 2}
Encoding mapping for Aptitude: {'Moderately Techincally Inclined': 0, 'Not Technically Inclined': 1, 'Technically Inclined': 2}
Encoding mapping for Programming: {'Advanced': 0, 'Basic': 1, 'Intermediate': 2}
Encoding mapping for Design Software: {'Highly Proficient': 0, 'Not Proficient': 1, 'Proficient': 2}
Encoding mapping for 3D Skills: {'Highly Experienced': 0, 'Moderately

In [35]:
X.head()

Unnamed: 0,Mathematical Aptitude,Logical Reasoning,Openness,Conscientiousness,Emotional Stability,Agreeableness/ Assertiveness,Creativity,Aptitude,Programming,Design Software,3D Skills,Web Development Confidence
0,0,2,1,1,0,1,0,0,0,2,2,0
1,2,1,2,2,0,2,2,0,0,1,2,1
2,1,2,0,2,1,1,1,1,0,1,0,1
3,1,2,0,2,1,0,2,2,1,1,0,2
4,2,1,0,1,0,1,2,0,1,1,1,2


In [40]:
# Initialize RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Convert the resampled data back to DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Suggested Tracks'] = y_resampled

In [41]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [42]:
#initializing the model
rfc = RandomForestClassifier()

In [43]:
# Stating the parameters for hyperparameter tuning
param_grid = {
    'n_estimators': [150, 200, 300, 500],
    'min_samples_split': [5, 10, 15],
    'max_depth': [10, 13, 15, 17, 20],
    'min_samples_leaf': [2, 4, 5, 6],
    'criterion': ['gini', 'entropy'],
}

In [44]:
# fitting the model to the data
rf_grid_search = RandomizedSearchCV(RandomForestClassifier(),param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)

# Using the best estimator to predict the test data
rf_best_model = rf_grid_search.best_estimator_
rf_predictions = rf_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", rf_grid_search.best_params_)

ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Cyber Security, Data Science'

--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codeibinabo/anaconda3/lib/python3.11/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '3D Animation Skills, Product Design'


In [None]:
Classification_Report = classification_report(y_test, rf_predictions)
print(Classification_Report)

                                                precision    recall  f1-score   support

     ['3D Animation Skills', 'Product Design']       1.00      1.00      1.00        74
            ['Cyber Security', 'Data Science']       0.95      1.00      0.97        36
['Mobile Application', 'Software Development']       1.00      0.88      0.93         8
          ['Product Management', 'Mobile App']       1.00      0.86      0.92         7
        ['Software Development', 'Blockchain']       0.99      0.99      0.99        75

                                      accuracy                           0.98       200
                                     macro avg       0.99      0.94      0.96       200
                                  weighted avg       0.99      0.98      0.98       200



In [None]:
# Train Naive Bayes (Multinomial) Classifier
nb_clf = MultinomialNB()

In [None]:

# Naive Bayes (Multinomial) Classifier hyperparameters grid
nb_param_grid = {'alpha': [0.1, 1, 10], 'fit_prior': [True, False]}

In [None]:

# Perform GridSearchCV for Naive Bayes (Multinomial)
nb_grid_search = RandomizedSearchCV(MultinomialNB(), nb_param_grid, cv=5)
nb_grid_search.fit(X_train, y_train)

nb_best_model = nb_grid_search.best_estimator_
nb_predictions = nb_best_model.predict(X_test)

print("Best Naive Bayes (Multinomial) Model Parameters:", nb_grid_search.best_params_)



ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 745, in fit
    X, y = self._check_X_y(X, y)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 578, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Mobile Application, Software Development'

--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 745, in fit
    X, y = self._check_X_y(X, y)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 578, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Product Management'


In [None]:
Classification_Report = classification_report(y_test, nb_predictions)
print(Classification_Report)

In [None]:
dtc = DecisionTreeClassifier()

In [None]:
param_grid = {
    'min_samples_split': [5, 10, 15],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

In [None]:
dtc_grid_search = RandomizedSearchCV(DecisionTreeClassifier(),param_grid, cv=5)
dtc_grid_search.fit(X_train, y_train)

#Predicting the test data
dtc_best_model = dtc_grid_search.best_estimator_
dtc_predictions = dtc_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", dtc_grid_search.best_params_)

In [None]:
Classification_Report = classification_report(y_test, dtc_predictions)
print(Classification_Report)

In [None]:
knn = KNeighborsClassifier()

In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1,2],
}

In [None]:
knn_grid_search = RandomizedSearchCV(KNeighborsClassifier(),param_grid, cv=5)
knn_grid_search.fit(X_train, y_train)
knn_best_model = knn_grid_search.best_estimator_
knn_predictions = knn_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", knn_grid_search.best_params_)

In [None]:
Classification_Report = classification_report(y_test, knn_predictions)
print(Classification_Report)

In [None]:
svm = SVC()

In [None]:
param_grid = {
    'C': [100,10, 1.0, 0.1, 0.001, 0.001],
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
}

In [None]:
svm_grid_search = RandomizedSearchCV(SVC(),param_grid, cv=5)
svm_grid_search.fit(X_train, y_train)
svm_best_model = svm_grid_search.best_estimator_
svm_predictions = svm_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", svm_grid_search.best_params_)

In [None]:
Classification_Report = classification_report(y_test, svm_predictions)
print(Classification_Report)

In [None]:
lr = LogisticRegression()

In [None]:
param_grid = {
    'C': [100,10, 1.0, 0.1, 0.001, 0.001],
}

In [None]:
lr_grid_search = RandomizedSearchCV(LogisticRegression(),param_grid, cv=5)
lr_grid_search.fit(X_train, y_train)
lr_best_model = lr_grid_search.best_estimator_
lr_predictions = lr_best_model.predict(X_test)

#Printing the Best estimator and the Evaluation Matrix
print("Best Random Forest Model Parameters:", lr_grid_search.best_params_)

In [None]:
Classification_Report = classification_report(y_test, lr_predictions)
print(Classification_Report)

## â˜„ **Model Deplyment**

---

Before deployment, the best model (Random Forest) will be serialized (saved) for export and deployment.

In [None]:
# #serializing and saving the best model

# # Construct the path to the file
# file_path = os.path.join('DATA', 'rf_best_model.joblib')

# dump(rf_best_model, file_path)

['DATA/rf_best_model.joblib']

In [None]:
# #to load the model

# loaded_model = load('rf_best_model.joblib')