In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [2]:
# Load the job-placement dataset from a CSV file located relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Job_Placement_Data.csv')
# Bare last expression: renders the frame as this cell's output.
df

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,hsc_subject,degree_percentage,undergrad_degree,work_experience,emp_test_percentage,specialisation,mba_percent,status
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed


In [3]:
# Replace the text target column with integer class codes.
# LabelEncoder numbers classes in sorted order, so with the two labels visible
# in the preview this presumably maps 'Not Placed' -> 0 and 'Placed' -> 1
# (assumes no other label values exist in the column -- TODO confirm).
# NOTE: `df['status']` is overwritten in place; re-running this cell refits the
# encoder on the already-encoded integers rather than on the original strings.
le = LabelEncoder()
df['status'] = le.fit_transform(y=df['status'])

In [4]:
# Split the frame into the target vector and the feature matrix.
y = df['status']                 # encoded placement outcome
X = df.drop(columns=['status'])  # every remaining column is a candidate feature

In [6]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the target in both splits; `random_state` fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Column-name lists, partitioned by dtype of the feature matrix. Only dtypes
# are inspected here; no statistics are computed from the data.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

In [8]:
# Numeric branch: a single standardisation step wrapped in a Pipeline.
nums_trans = Pipeline([('sc', StandardScaler())])

In [9]:
# Categorical branch: one-hot encode into a dense array.
# handle_unknown='ignore' lets transform accept categories that were not
# present when the encoder was fitted instead of raising.
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_trans = Pipeline([('ohe', one_hot)])

In [10]:
# Route each column list to its branch: numeric columns to the scaler
# pipeline, categorical columns to the one-hot pipeline. `remainder` is left
# at its default, so any column in neither list is not passed through.
preprocessor = ColumnTransformer(
    [
        ('num', nums_trans, numeric_features),
        ('cat', cat_trans, categorical_features),
    ]
)

In [11]:
# Logistic Regression pipeline: shared preprocessing followed by the classifier.
pipe_lr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('clf', LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# Random Forest pipeline: same preprocessing, seeded forest using all cores.
pipe_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)

# Registry of candidate pipelines keyed by a readable model name.
# NOTE: the name `pipe_rf` is rebound by the following cell; this dict keeps
# pointing at the pipeline objects built here.
models = {
    'LogisticRegression': pipe_lr,
    'RandomForest': pipe_rf,
}

In [12]:
# Pipelines that the later cells actually fit and evaluate.
# NOTE: this rebinds `pipe_rf`; the pipeline stored earlier under
# models['RandomForest'] is a separate object and is not affected.
#
# Estimator settings mirror the earlier model definitions (max_iter=1000,
# random_state=42). Seeding the random forest matters: without it every run
# grows different trees, so the reported hold-out and cross-validation
# accuracies would change from run to run.
pipe_lgr = Pipeline(steps=[
    ('prep',preprocessor),
    ('lgr',LogisticRegression(max_iter=1000, random_state=42))
])

pipe_rf = Pipeline(steps=[
    ('prep',preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

In [18]:
# Fit both pipelines on the training split only; the preprocessing steps are
# fitted as part of each pipeline's fit call.
pipe_lgr.fit(X=X_train, y=y_train)
# Kept as the last expression so the fitted random-forest pipeline is rendered
# as this cell's output.
pipe_rf.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# Class predictions for the held-out rows from each fitted pipeline.
y_pred_lgr = pipe_lgr.predict(X=X_test)
y_pred_rf = pipe_rf.predict(X=X_test)

In [20]:
# Hold-out accuracy of each model, computed from the stored predictions.
# `accuracy_score` comes from the notebook's top import cell, so it is not
# re-imported here.
acc_lgr = accuracy_score(y_test, y_pred_lgr)
acc_rf  = accuracy_score(y_test, y_pred_rf)

print("Logistic Regression Accuracy:", acc_lgr)
print("Random Forest Accuracy:", acc_rf)

Logistic Regression Accuracy: 0.8604651162790697
Random Forest Accuracy: 0.8372093023255814


In [21]:
# Same hold-out accuracy, this time obtained through each pipeline's own
# `score` method (predicts on X_test and compares with y_test).
print("Logistic Regression Accuracy:", pipe_lgr.score(X=X_test, y=y_test))
print("Random Forest Accuracy:", pipe_rf.score(X=X_test, y=y_test))

Logistic Regression Accuracy: 0.8604651162790697
Random Forest Accuracy: 0.8372093023255814


In [23]:
# 5-fold cross-validated accuracy of the random-forest pipeline.
# `cross_val_score` comes from the notebook's top import cell, so it is not
# re-imported here.
# NOTE(review): this runs on the full X / y, which includes the rows held out
# in X_test, so it is an independent estimate rather than a check of the
# hold-out score above. cross_val_score refits a fresh copy of the pipeline on
# each fold; the already-fitted `pipe_rf` is not reused.
cv_acc = cross_val_score(pipe_rf, X, y, cv=5, scoring='accuracy')
print("CV Accuracy:", cv_acc.mean())

CV Accuracy: 0.8604651162790699
