In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import joblib


In [2]:
data = pd.read_csv('Extended_RESPONSES.csv')


In [3]:
data.head()

Unnamed: 0,GENDER,COLLEGE,COURSES,LEVEL,CGPA,Openness Score,Conscientiousness Score
0,MALE,COE,Comp. Engineering,200,3.45,18,33
1,FEMALE,COE,Comp. Engineering,200,3.53,16,31
2,MALE,CST,Comp. Science,400,4.21,32,39
3,FEMALE,COE,Elec. Elect. Engineering,300,2.51,17,18
4,MALE,COE,Comp. Engineering,400,4.15,19,40


In [4]:
def convert_cgpa_to_class(cgpa):
    if cgpa >= 4.5:
        return 'First Class'
    elif 3.5 <= cgpa < 4.5:
        return 'Second Class Upper'
    elif 2.5 <= cgpa < 3.5:
        return 'Second Class Lower'
    else:
        return 'Third Class'

In [5]:
data['Academic Class'] = data['CGPA'].apply(convert_cgpa_to_class)
data = data.drop(columns=['CGPA'])


In [6]:
data.head()

Unnamed: 0,GENDER,COLLEGE,COURSES,LEVEL,Openness Score,Conscientiousness Score,Academic Class
0,MALE,COE,Comp. Engineering,200,18,33,Second Class Lower
1,FEMALE,COE,Comp. Engineering,200,16,31,Second Class Upper
2,MALE,CST,Comp. Science,400,32,39,Second Class Upper
3,FEMALE,COE,Elec. Elect. Engineering,300,17,18,Second Class Lower
4,MALE,COE,Comp. Engineering,400,19,40,Second Class Upper


In [7]:
Q1 = data[['Openness Score', 'Conscientiousness Score']].quantile(0.25)
Q3 = data[['Openness Score', 'Conscientiousness Score']].quantile(0.75)
IQR = Q3 - Q1

outliers = ((data[['Openness Score', 'Conscientiousness Score']] < (Q1 - 1.5 * IQR)) | (data[['Openness Score', 'Conscientiousness Score']] > (Q3 + 1.5 * IQR)))
data_cleaned = data[~(outliers.any(axis=1))]

In [8]:
label_encoder = LabelEncoder()
data_cleaned['Academic Class Encoded'] = label_encoder.fit_transform(data_cleaned['Academic Class'])

In [9]:
X = data_cleaned[['Openness Score', 'Conscientiousness Score', 'LEVEL', 'COLLEGE', 'GENDER']]
y = data_cleaned['Academic Class Encoded']

In [10]:
X = pd.get_dummies(X, columns=['COLLEGE', 'GENDER'])


In [11]:
feature_names = X.columns.tolist()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [13]:
rf_clf = RandomForestClassifier(random_state=42)
xgb_clf = XGBClassifier(random_state=42)
voting_clf = VotingClassifier(estimators=[('rf', rf_clf), ('xgb', xgb_clf)], voting='soft')


In [14]:
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'xgb__n_estimators': [100, 200],
    'xgb__learning_rate': [0.01, 0.1],
    'xgb__max_depth': [3, 6]
}

In [15]:
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_smote, y_train_smote)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [16]:
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [17]:
print("Best parameters for model:", best_params)
print("Classification report for model:\n", report)


Best parameters for model: {'rf__max_depth': 10, 'rf__n_estimators': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 6, 'xgb__n_estimators': 200}
Classification report for model:
                     precision    recall  f1-score   support

       First Class       0.79      0.94      0.86        16
Second Class Lower       0.33      0.40      0.36        15
Second Class Upper       0.82      0.70      0.75        46
       Third Class       0.86      0.90      0.88        21

          accuracy                           0.73        98
         macro avg       0.70      0.73      0.71        98
      weighted avg       0.75      0.73      0.74        98



In [18]:
joblib_file = "trained_model_combined.pkl"
joblib.dump(best_estimator, joblib_file)

['trained_model_combined.pkl']

In [19]:
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [20]:
joblib.dump(feature_names, "feature_names.pkl")

['feature_names.pkl']