Imports


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Load Dataset


In [2]:
df = pd.read_csv("student-scores.csv")
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


Data Cleaning and Preprocessing


In [3]:
# drop columns
df.drop(columns=['id','first_name','last_name','email'], inplace=True)
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [4]:
df.isnull().sum()


Unnamed: 0,0
gender,0
part_time_job,0
absence_days,0
extracurricular_activities,0
weekly_self_study_hours,0
career_aspiration,0
math_score,0
history_score,0
physics_score,0
chemistry_score,0


Create New Feature


In [5]:
df["total_score"] = df["math_score"] + df["history_score"] + df["physics_score"] + df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
df["average_score"] = df["total_score"] / 7
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


Encoding


In [6]:
encoder = LabelEncoder()

df['gender'] = encoder.fit_transform(df['gender'])
df['part_time_job'] = encoder.fit_transform(df['part_time_job'])
df['extracurricular_activities'] = encoder.fit_transform(df['extracurricular_activities'])

df

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,1,0,3,0,27,Lawyer,73,81,93,97,63,80,87,574,82.000000
1,0,0,2,0,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,0,0,9,1,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,0,0,5,0,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,1,0,5,0,10,Unknown,84,77,65,65,80,74,76,521,74.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0,2,0,30,Construction Engineer,83,77,84,73,75,84,82,558,79.714286
1996,1,0,2,0,20,Software Engineer,89,65,73,80,87,67,73,534,76.285714
1997,0,0,5,0,14,Software Engineer,97,85,63,93,68,94,78,578,82.571429
1998,0,1,10,1,5,Business Owner,51,96,72,89,95,88,75,566,80.857143


In [7]:
df['career_aspiration'] = df['career_aspiration'].map( {
    'Software Engineer': 0,
    'Business Owner': 1,
    'Unknown': 2,
    'Banker': 3,
    'Lawyer': 4,
    'Accountant': 5,
    'Doctor': 6,
    'Real Estate Developer': 7,
    'Stock Investor': 8,
    'Construction Engineer': 9,
    'Artist': 10,
    'Game Developer': 11,
    'Government Officer': 12,
    'Teacher': 13,
    'Designer': 14,
    'Scientist': 15,
    'Writer': 16
})

Scaling: Normalization


In [8]:
# Define the columns to scale
scale_cols = ['absence_days', 'weekly_self_study_hours', 'math_score', 'history_score',
              'physics_score', 'chemistry_score', 'biology_score',
              'english_score', 'geography_score']

scaler = StandardScaler()

df[scale_cols] = scaler.fit_transform(df[scale_cols])
df

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,1,0,-0.253175,0,0.762334,4,-0.790525,0.052463,0.930377,1.331147,-1.208673,-0.106245,0.525321,574,82.000000
1,0,0,-0.633604,0,2.411605,6,0.495250,0.445147,1.169682,1.565986,0.759435,0.559086,0.783168,640,91.428571
2,0,0,2.029397,1,-0.392155,12,-0.185454,1.309054,1.089913,1.252867,-1.062888,-0.355744,1.126964,605,86.428571
3,0,0,0.507682,0,-1.216791,10,-0.941792,-0.497296,0.531536,0.000391,0.686542,-1.520075,0.439372,551,78.714286
4,1,0,0.507682,0,-0.639546,2,0.041447,-0.261685,-1.303134,-1.173804,0.030506,-0.605244,-0.420119,521,74.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0,-0.633604,0,1.009725,9,-0.034186,-0.261685,0.212463,-0.547567,-0.333959,0.226421,0.095575,558,79.714286
1996,1,0,-0.633604,0,0.185089,0,0.419616,-1.204128,-0.664988,0.000391,0.540756,-1.187409,-0.677966,534,76.285714
1997,0,0,0.507682,0,-0.309692,0,1.024687,0.366610,-1.462670,1.018028,-0.844209,1.058085,-0.248221,578,82.571429
1998,0,1,2.409825,1,-1.051864,1,-2.454469,1.230517,-0.744756,0.704909,1.123899,0.559086,-0.506068,566,80.857143


Data Balancing


In [9]:
df['career_aspiration'].value_counts()


Unnamed: 0_level_0,count
career_aspiration,Unnamed: 1_level_1
0,315
1,309
2,223
3,169
4,138
5,126
6,119
7,83
8,73
9,68


In [10]:
classes = df['career_aspiration'].unique()
df_list = [df[df['career_aspiration'] == label] for label in classes]

max_size = max([len(sub_df) for sub_df in df_list])

df_upsampled = [resample(sub_df,
                         replace=True,
                         n_samples=max_size,
                         random_state=42)
                for sub_df in df_list]

df = pd.concat(df_upsampled)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['career_aspiration'].value_counts()

Unnamed: 0_level_0,count
career_aspiration,Unnamed: 1_level_1
8,315
0,315
6,315
9,315
4,315
16,315
7,315
5,315
10,315
12,315


Train Test Split


In [12]:
X = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (4284, 14)
Test shape: (1071, 14)


Training, Testing Multiple Classifiers


In [13]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
}

for name, model in models.items():
    print("="*50)
    print("Model:", name)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)

Model: Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.40149393090569563
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.23      0.20        48
           1       0.58      0.82      0.68        51
           2       0.23      0.09      0.13        75
           3       0.27      0.13      0.18        60
           4       0.39      0.41      0.40        74
           5       0.42      0.46      0.44        61
           6       0.45      0.48      0.47        71
           7       0.31      0.25      0.28        64
           8       0.36      0.16      0.22        57
           9       0.40      0.47      0.43        62
          10       0.41      0.47      0.44        68
          11       0.57      0.82      0.67        68
          12       0.38      0.27      0.32        67
          13       0.48      0.77      0.59        57
          14       0.15      0.07      0.10        70
          15       0.36      0.42      0.39        62
          16       0.39    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.12791783380018673
Classification Report:
               precision    recall  f1-score   support

           0       0.05      0.46      0.09        48
           1       0.24      0.65      0.35        51
           2       0.00      0.00      0.00        75
           3       0.00      0.00      0.00        60
           4       0.00      0.00      0.00        74
           5       0.07      0.21      0.11        61
           6       0.30      0.76      0.43        71
           7       0.00      0.00      0.00        64
           8       0.00      0.00      0.00        57
           9       0.00      0.00      0.00        62
          10       0.00      0.00      0.00        68
          11       0.00      0.00      0.00        68
          12       0.00      0.00      0.00        67
          13       0.00      0.00      0.00        57
          14       0.00      0.00      0.00        70
          15       0.12      0.24      0.16        62
          16       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.21942110177404295
Classification Report:
               precision    recall  f1-score   support

           0       0.12      0.77      0.21        48
           1       0.19      1.00      0.32        51
           2       0.17      0.11      0.13        75
           3       0.00      0.00      0.00        60
           4       0.17      0.05      0.08        74
           5       0.17      0.25      0.20        61
           6       1.00      0.48      0.65        71
           7       0.00      0.00      0.00        64
           8       0.00      0.00      0.00        57
           9       0.00      0.00      0.00        62
          10       0.00      0.00      0.00        68
          11       0.00      0.00      0.00        68
          12       0.26      0.55      0.35        67
          13       0.00      0.00      0.00        57
          14       0.00      0.00      0.00        70
          15       0.28      0.13      0.18        62
          16       0.37    

Selecting Best Model


In [14]:
model_rfc = RandomForestClassifier()

model_rfc.fit(X_train,y_train)

y_pred = model_rfc.predict(X_test)



print("confusion matrix \n: ", confusion_matrix(y_test,y_pred))
print("classification report \n: ", classification_report(y_test, y_pred))

confusion matrix 
:  [[38  0  2  3  0  0  1  0  1  2  0  0  0  1  0  0  0]
 [ 0 49  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0]
 [ 6  0 57  2  0  1  0  2  0  2  2  0  0  1  0  0  2]
 [ 0  0  2 52  3  0  1  0  1  0  0  0  0  1  0  0  0]
 [ 0  0  2  0 71  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 2  0  0  0  0 58  0  0  1  0  0  0  0  0  0  0  0]
 [ 1  0  1  1  1  0 65  0  0  1  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0 56  0  0  7  0  1  0  0  0  0]
 [ 0  0  2  0  0  0  0  0 55  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 62  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 68  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  3 65  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0 66  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 57  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0  0 68  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0  0 61  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 56]]
classification report 
:                precision    recall 

Saving Model, Encoder, Scaler for production


In [17]:
import pickle

pickle.dump(scaler, open("scaler.pkl",'wb'))
pickle.dump(encoder, open("encoder.pkl",'wb'))
pickle.dump(model_rfc, open("model_rfc.pkl",'wb'))

Inference (Prediction on new data)


In [21]:
import pandas as pd
import pickle

encoder = pickle.load(open("encoder.pkl", 'rb'))
scaler = pickle.load(open("scaler.pkl", 'rb'))
model_rfc = pickle.load(open("model_rfc.pkl", 'rb'))

career_aspiration_map = {
    0: 'Software Engineer',
    1: 'Business Owner',
    2: 'Unknown',
    3: 'Banker',
    4: 'Lawyer',
    5: 'Accountant',
    6: 'Doctor',
    7: 'Real Estate Developer',
    8: 'Stock Investor',
    9: 'Construction Engineer',
    10: 'Artist',
    11: 'Game Developer',
    12: 'Government Officer',
    13: 'Teacher',
    14: 'Designer',
    15: 'Scientist',
    16: 'Writer'
}

def predict_career(gender, part_time_job, absence_days, extracurricular_activities, weekly_self_study_hours,
                   math_score, history_score, physics_score, chemistry_score, biology_score, english_score,
                   geography_score, total_score, average_score):

    input_data = pd.DataFrame([[gender, part_time_job, absence_days, extracurricular_activities,
                                weekly_self_study_hours, math_score, history_score, physics_score,
                                chemistry_score, biology_score, english_score, geography_score,
                                total_score, average_score]],
                              columns=['gender', 'part_time_job', 'absence_days', 'extracurricular_activities',
                                       'weekly_self_study_hours', 'math_score', 'history_score', 'physics_score',
                                       'chemistry_score', 'biology_score', 'english_score', 'geography_score',
                                       'total_score', 'average_score'])

    input_data['gender'] = encoder.transform(input_data['gender'])
    input_data['part_time_job'] = encoder.transform(input_data['part_time_job'])
    input_data['extracurricular_activities'] = encoder.transform(input_data['extracurricular_activities'])

    input_data[['absence_days', 'weekly_self_study_hours', 'math_score', 'history_score',
              'physics_score', 'chemistry_score', 'biology_score',
              'english_score', 'geography_score']] = scaler.transform(
        input_data[['absence_days', 'weekly_self_study_hours', 'math_score', 'history_score',
              'physics_score', 'chemistry_score', 'biology_score',
              'english_score', 'geography_score']])

    result = model_rfc.predict(input_data)
    career_aspiration = career_aspiration_map[result[0]]

    return career_aspiration

In [27]:
# Predefined values (you can modify these based on your needs)
gender = 'male'
part_time_job = 'True'
absence_days = 2
extracurricular_activities = 'False'
weekly_self_study_hours = 8
math_score = 87
history_score = 76
physics_score = 84
chemistry_score = 80
biology_score = 90
english_score = 85
geography_score = 78
total_score = 640
average_score = 91.428571

predicted_career = predict_career(gender, part_time_job, absence_days, extracurricular_activities,
                                  weekly_self_study_hours, math_score, history_score, physics_score,
                                  chemistry_score, biology_score, english_score, geography_score,
                                  total_score, average_score)

print(f"Predicted Career Aspiration: {predicted_career}")

Predicted Career Aspiration: Doctor
