In [1]:
# 1pip install openpyxl

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib

In [3]:
df1=pd.read_csv("../data/career/Admission_Predict.csv")
df2=pd.read_csv("../data/career/Admission_Predict_Ver1.1.csv")
df3=pd.read_excel("../data/career/Dataset Project 404.xlsx")
df4=pd.read_csv("../data/career/education_career_success.csv")

for i, df in enumerate([df1, df2, df3, df4], start=1):
    print(f"\nDataset {i}: shape={df.shape}")
    print(df.head())


Dataset 1: shape=(400, 9)
   Serial No.  GRE Score  TOEFL Score  University Rating  SOP  LOR   CGPA  \
0           1        337          118                  4  4.5   4.5  9.65   
1           2        324          107                  4  4.0   4.5  8.87   
2           3        316          104                  3  3.0   3.5  8.00   
3           4        322          110                  3  3.5   2.5  8.67   
4           5        314          103                  2  2.0   3.0  8.21   

   Research  Chance of Admit   
0         1              0.92  
1         1              0.76  
2         1              0.72  
3         1              0.80  
4         0              0.65  

Dataset 2: shape=(500, 9)
   Serial No.  GRE Score  TOEFL Score  University Rating  SOP  LOR   CGPA  \
0           1        337          118                  4  4.5   4.5  9.65   
1           2        324          107                  4  4.0   4.5  8.87   
2           3        316          104                  3  3.

### Dataset 1 and 2 

In [4]:
df1 = pd.read_csv("../data/career/Admission_Predict.csv")
df2 = pd.read_csv("../data/career/Admission_Predict_Ver1.1.csv")

df = pd.concat([df1, df2], ignore_index=True)

print("Combined Dataset Shape:", df.shape)
print(df.head())

Combined Dataset Shape: (900, 9)
   Serial No.  GRE Score  TOEFL Score  University Rating  SOP  LOR   CGPA  \
0           1        337          118                  4  4.5   4.5  9.65   
1           2        324          107                  4  4.0   4.5  8.87   
2           3        316          104                  3  3.0   3.5  8.00   
3           4        322          110                  3  3.5   2.5  8.67   
4           5        314          103                  2  2.0   3.0  8.21   

   Research  Chance of Admit   
0         1              0.92  
1         1              0.76  
2         1              0.72  
3         1              0.80  
4         0              0.65  


In [5]:
X = df.drop(columns=["Serial No.", "Chance of Admit "])
y = df["Chance of Admit "]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
num_features = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features)
    ]
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42))
])

In [8]:
pipeline.fit(X_train, y_train)
print("Model trained successfully on combined df1 + df2")

Model trained successfully on combined df1 + df2


In [9]:
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("R2:", r2)

MAE: 0.027322499999999906
MSE: 0.0020093891249999985
R2: 0.9011381323087189


In [10]:
import os

os.makedirs("../models", exist_ok=True)
joblib.dump(pipeline, "../models/career_admission_model.pkl")
print("Saved pipeline (scaler + model) in one file!")

Saved pipeline (scaler + model) in one file!


### Dataset 3

In [11]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [12]:
df3 = pd.read_excel("../data/career/Dataset Project 404.xlsx")

print("Dataset 3 shape:", df3.shape)
print(df3.head())

Dataset 3 shape: (3600, 21)
   Sr.No.  Course Job profession Student  Linguistic  Musical  Bodily  \
0     1.0     NaN   Astronomer\n      S1          11        5      12   
1     NaN     NaN   Astronomer\n      S2          12        6      12   
2     NaN     NaN   Astronomer\n      S3          13        7      12   
3     NaN     NaN   Astronomer\n      S4          14        8      12   
4     NaN     NaN   Astronomer\n      S5          13        9      12   

   Logical - Mathematical  Spatial-Visualization  Interpersonal  ...  \
0                      16                     17             11  ...   
1                      16                     16             11  ...   
2                      16                     15             11  ...   
3                      16                     19             11  ...   
4                      16                     20             11  ...   

   Naturalist  s/p   P1    P2   P3    P4    P5   P6    P7    P8  
0          19   s1  AVG  POOR  AVG

In [13]:
if "Sr.No." in df3.columns:
    df3 = df3.drop(columns=["Sr.No."])

df3.columns = df3.columns.str.strip()

target = "Job profession"

X = df3.drop(columns=[target])
y = df3[target]

X = X.dropna(axis=1, how="all")

In [14]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical cols:", categorical_cols)
print("Numerical cols:", numerical_cols)

Categorical cols: ['Student', 's/p', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8']
Numerical cols: ['Linguistic', 'Musical', 'Bodily', 'Logical - Mathematical', 'Spatial-Visualization', 'Interpersonal', 'Intrapersonal', 'Naturalist']


In [15]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),   
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

### Define transformers

In [16]:
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

### Column transformer & pipeline

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9611111111111111

Classification Report:
                                                                                                 precision    recall  f1-score   support

                                                                               Actor / Actress       1.00      1.00      1.00        10
                                                                                       Actuary       1.00      1.00      1.00        10
                                                                                Anthropologist       1.00      1.00      1.00        10
                                                                                  Archeologist       1.00      1.00      1.00        10
                                                                                        Artist       1.00      1.00      1.00        10
                                                                                   Astronomer
       1.00      1.00      1.00    

In [19]:
os.makedirs("../models", exist_ok=True)

joblib.dump(pipeline, "../models/career_profession_model.pkl")

print("Saved Dataset 3 model (full pipeline) as ONE file!")

Saved Dataset 3 model (full pipeline) as ONE file!


### Dataset 4

In [20]:
df4=pd.read_csv("../data/career/education_career_success.csv")
print("Dataset 4 shape:", df4.shape)
print(df4.head())

Dataset 4 shape: (400, 19)
  Student_ID  Age  Gender  High_School_GPA  SAT_Score  University_GPA  \
0       S001   22    Male              3.8       1450             3.6   
1       S002   24  Female              3.6       1380             3.4   
2       S003   21    Male              3.9       1520             3.8   
3       S004   23  Female              3.5       1300             3.2   
4       S005   25    Male              3.7       1420             3.5   

     Field_of_Study  Internships_Completed  Projects_Completed  \
0  Computer Science                      3                   7   
1          Business                      2                   5   
2       Engineering                      4                   9   
3        Psychology                      1                   3   
4          Medicine                      2                   6   

   Certifications  Soft_Skills_Score  Networking_Score  Job_Offers  \
0               2                  8                 7           3 

In [21]:
if "Student_ID" in df4.columns:
    df4 = df4.drop(columns=["Student_ID"])

df4.columns = df4.columns.str.strip()

target = "Current_Job_Level"

X = df4.drop(columns=[target])
y = df4[target]

In [22]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical cols:", categorical_cols)
print("Numerical cols:", numerical_cols)

Categorical cols: ['Gender', 'Field_of_Study', 'Entrepreneurship']
Numerical cols: ['Age', 'High_School_GPA', 'SAT_Score', 'University_GPA', 'Internships_Completed', 'Projects_Completed', 'Certifications', 'Soft_Skills_Score', 'Networking_Score', 'Job_Offers', 'Starting_Salary', 'Career_Satisfaction', 'Years_to_Promotion', 'Work_Life_Balance']


In [23]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),   
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [24]:
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

       Entry       1.00      1.00      1.00        16
         Mid       1.00      1.00      1.00        52
      Senior       1.00      1.00      1.00        12

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



In [27]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy")
print("CV scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

CV scores: [0.975 1.    1.    1.    1.   ]
Mean CV accuracy: 0.9949999999999999


In [28]:
import os, joblib

os.makedirs("../models", exist_ok=True)

joblib.dump(pipeline, "../models/career_growth_model.pkl")
print("Saved pipeline (scaler + encoder + model) in one file: career_growth_model.pkl")

Saved pipeline (scaler + encoder + model) in one file: career_growth_model.pkl
