In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report ,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [3]:
df1 = pd.read_csv("student-scores.csv")
df = df1.copy()
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id                          2000 non-null   int64 
 1   first_name                  2000 non-null   object
 2   last_name                   2000 non-null   object
 3   email                       2000 non-null   object
 4   gender                      2000 non-null   object
 5   part_time_job               2000 non-null   bool  
 6   absence_days                2000 non-null   int64 
 7   extracurricular_activities  2000 non-null   bool  
 8   weekly_self_study_hours     2000 non-null   int64 
 9   career_aspiration           2000 non-null   object
 10  math_score                  2000 non-null   int64 
 11  history_score               2000 non-null   int64 
 12  physics_score               2000 non-null   int64 
 13  chemistry_score             2000 non-null   int6

In [5]:
df = df.drop(columns=["id","first_name","last_name","email"])
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [6]:
gender_map = {"male": 0, "female" :1}
part_time_map = {False : 0 , True : 1}
extracurricular_map = {False : 0,True : 1}
career_map = {"Lawyer" : 0 ,"Doctor" : 1 ,"Government Officer" : 2 , "Artist" : 3 ,"Unknown" : 4
              ,"Software Engineer" : 5 , "Teacher" : 6 ,"Business Owner" : 7 ,"Scientist" : 8,
              "Banker" : 9 ,"Writer" : 10 ,"Accountant" : 11 , "Designer" : 12 ,"Construction Engineer" : 13,
              "Game Developer" : 14 , "Stock Investor" : 15 ,"Real Estate Developer" : 16 }

df["gender"] = df["gender"].map(gender_map)
df["part_time_job"] = df["part_time_job"].map(part_time_map)
df["extracurricular_activities"] = df["extracurricular_activities"].map(extracurricular_map)
df["career_aspiration"] = df["career_aspiration"].map(career_map)

In [7]:
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,0,0,3,0,27,0,73,81,93,97,63,80,87
1,1,0,2,0,47,1,90,86,96,100,90,88,90
2,1,0,9,1,13,2,81,97,95,96,65,77,94
3,1,0,5,0,3,3,71,74,88,80,89,63,86
4,0,0,5,0,10,4,84,77,65,65,80,74,76


In [8]:
df["Total_Score"] = df["math_score"] + df["physics_score"] + df["history_score"] + df["geography_score"] + df["english_score"] + df["chemistry_score"] + df["biology_score"]
df["Average_Score"] = df["Total_Score"] / 7
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,Total_Score,Average_Score
0,0,0,3,0,27,0,73,81,93,97,63,80,87,574,82.0
1,1,0,2,0,47,1,90,86,96,100,90,88,90,640,91.428571
2,1,0,9,1,13,2,81,97,95,96,65,77,94,605,86.428571
3,1,0,5,0,3,3,71,74,88,80,89,63,86,551,78.714286
4,0,0,5,0,10,4,84,77,65,65,80,74,76,521,74.428571


In [9]:
df.shape

(2000, 15)

## Splitting

In [None]:
x = df.drop(columns=["career_aspiration"],axis = 1)
y = df["career_aspiration"]



###  Data Balance

In [13]:
smote = SMOTE(random_state=42)

x_resampled , y_resampled= smote.fit_resample(x,y)



In [14]:
x_train , x_test , y_train , y_test = train_test_split(x_resampled,y_resampled,test_size=0.2,random_state=42)

In [15]:
x_resampled.value_counts()

gender  part_time_job  absence_days  extracurricular_activities  weekly_self_study_hours  math_score  history_score  physics_score  chemistry_score  biology_score  english_score  geography_score  Total_Score  Average_Score
0       0              4             0                           27                       78          95             70             70               86             96             92               591          84.428571        2
1       0              4             0                           13                       95          67             88             82               89             73             95               593          84.714286        2
0       0              0             0                           2                        80          80             93             64               65             68             70               523          74.840672        1
                                                                                             

## Feature Scaling

In [16]:
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.transform(x_test)

## Model Selection

In [17]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Gradient Boosting Classifier": GradientBoostingClassifier() 
    }

# Train and evaluate each model
for name, model in models.items():
    print("="*50)
    print("Model:", name)
    # Train the model
    model.fit(scaled_x_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(scaled_x_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Print metrics
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)

Model: Logistic Regression
Accuracy: 0.48739495798319327
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.54      0.49        68
           1       0.49      0.62      0.55        72
           2       0.42      0.44      0.43        57
           3       0.52      0.57      0.55        58
           4       0.31      0.17      0.22        66
           5       0.32      0.32      0.32        76
           6       0.58      0.92      0.71        71
           7       0.83      0.90      0.87        61
           8       0.41      0.45      0.43        53
           9       0.29      0.10      0.15        61
          10       0.59      0.71      0.65        63
          11       0.44      0.45      0.45        53
          12       0.31      0.16      0.21        68
          13       0.38      0.49      0.43        55
          14       0.61      0.93      0.74        57
          15       0.37      0.24      0.29        63


In [18]:
model = RandomForestClassifier()

model.fit(scaled_x_train, y_train)
# Predict on test set
y_pred = model.predict(scaled_x_test)

# Calculate metrics
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("Report: ",classification_report(y_test, y_pred))
print("Confusion Matrix: ",confusion_matrix(y_test, y_pred))

Accuracy:  0.838468720821662
Report:                precision    recall  f1-score   support

           0       0.77      0.88      0.82        68
           1       0.80      1.00      0.89        72
           2       0.78      0.98      0.87        57
           3       0.92      0.93      0.92        58
           4       0.73      0.45      0.56        66
           5       0.61      0.36      0.45        76
           6       0.93      1.00      0.97        71
           7       0.97      0.93      0.95        61
           8       0.75      0.98      0.85        53
           9       0.73      0.75      0.74        61
          10       0.91      0.98      0.95        63
          11       0.89      0.74      0.80        53
          12       0.88      0.88      0.88        68
          13       0.77      0.93      0.84        55
          14       0.90      1.00      0.95        57
          15       0.94      0.73      0.82        63
          16       0.94      0.84      0.89

### Single input prediction

In [19]:
print("Predicted labels : " , model.predict(scaled_x_test[10].reshape(1,-1))[0])
print("Actual : ",y_test.iloc[10])

Predicted labels :  12
Actual :  12


In [20]:
print("Predicted labels : " , model.predict(scaled_x_test[6].reshape(1,-1)))
print("Actual : ",y_test.iloc[6])

Predicted labels :  [6]
Actual :  6


In [21]:
print("Predicted labels : " , model.predict(scaled_x_test[50].reshape(1,-1)))
print("Actual : ",y_test.iloc[50])

Predicted labels :  [12]
Actual :  12


### Save model

In [22]:
import pickle
pickle.dump(scaler,open("Models/scaler.pkl","wb"))
pickle.dump(model,open("Models/model.pkl","wb"))

### Recommendation system

In [23]:
import pickle
import numpy as np

scaler = pickle.load(open("Models/scaler.pkl", 'rb'))
model = pickle.load(open("Models/model.pkl", 'rb'))
class_names = ['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
               'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
               'Banker', 'Writer', 'Accountant', 'Designer',
               'Construction Engineer', 'Game Developer', 'Stock Investor',
               'Real Estate Developer']

def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score,average_score):
  
    gender_encoded = 1 if gender.lower() == 'female' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0
    
    feature_array = np.array([[gender_encoded, part_time_job_encoded, absence_days, extracurricular_activities_encoded,
                               weekly_self_study_hours, math_score, history_score, physics_score,
                               chemistry_score, biology_score, english_score, geography_score,total_score,average_score]])
    
    
    scaled_features = scaler.transform(feature_array)
    probabilities = model.predict_proba(scaled_features)
    
    top_classes_idx = np.argsort(-probabilities[0])[:5]
    top_classes_names_probs = [(class_names[idx], probabilities[0][idx]) for idx in top_classes_idx]
    
    return top_classes_names_probs

In [24]:
# Example 1
final_recommendations = Recommendations(gender='female',
                                        part_time_job=False,
                                        absence_days=2,
                                        extracurricular_activities=False,
                                        weekly_self_study_hours=7,
                                        math_score=65,
                                        history_score=60,
                                        physics_score=97,
                                        chemistry_score=94,
                                        biology_score=71,
                                        english_score=81,
                                        geography_score=66,
                                        total_score=534,
                                        average_score=76.285714)

print("Top recommended studies with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability}")

Top recommended studies with probabilities:
Teacher with probability 0.72
Government Officer with probability 0.12
Unknown with probability 0.09
Real Estate Developer with probability 0.04
Designer with probability 0.01


