In [153]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [154]:
# Load the placement data set; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('placement-Copy1.csv')

In [155]:
df.head()

Unnamed: 0,cgpa,placement_exam_marks,placed
0,7.19,26.0,1
1,7.46,38.0,1
2,7.54,40.0,1
3,6.42,8.0,1
4,7.23,17.0,0


In [156]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cgpa                  1000 non-null   float64
 1   placement_exam_marks  1000 non-null   float64
 2   placed                1000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 23.6 KB


In [157]:
df.describe()

Unnamed: 0,cgpa,placement_exam_marks,placed
count,1000.0,1000.0,1000.0
mean,6.96124,32.225,0.489
std,0.615898,19.130822,0.500129
min,4.89,0.0,0.0
25%,6.55,17.0,0.0
50%,6.96,28.0,0.0
75%,7.37,44.0,1.0
max,9.12,100.0,1.0


In [158]:
df.dtypes

cgpa                    float64
placement_exam_marks    float64
placed                    int64
dtype: object

In [159]:
df.columns

Index(['cgpa', 'placement_exam_marks', 'placed'], dtype='object')

In [160]:
# Flag rows that repeat an earlier row (one boolean per row).
# NOTE(review): `df.duplicated().sum()` would report the number of duplicates
# directly, mirroring the `df.isnull().sum()` check in the next cell — consider
# switching, since the per-row listing is truncated in the display anyway.
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [161]:
df.isnull().sum()

cgpa                    0
placement_exam_marks    0
placed                  0
dtype: int64

In [162]:
df.head()

Unnamed: 0,cgpa,placement_exam_marks,placed
0,7.19,26.0,1
1,7.46,38.0,1
2,7.54,40.0,1
3,6.42,8.0,1
4,7.23,17.0,0


In [163]:
# Predictors are the two score columns; the target is the `placed` label column.
X = df.loc[:, ['cgpa', 'placement_exam_marks']]
y = df.loc[:, 'placed']

# Train Test split


In [164]:
# Hold out 30% of the rows for testing. The fixed random_state makes the shuffle,
# and therefore the split, identical on every run.
# train_test_split is already imported in the first cell, so it is not
# re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=121
)


In [165]:
X_train.head()

Unnamed: 0,cgpa,placement_exam_marks
337,6.95,68.0
321,6.82,47.0
349,8.12,16.0
620,7.07,18.0
93,7.0,30.0


In [166]:
X_test.head()

Unnamed: 0,cgpa,placement_exam_marks
380,7.12,31.0
288,7.07,21.0
328,7.4,28.0
208,7.25,12.0
284,6.99,62.0


In [167]:
y_train.head()

337    0
321    0
349    0
620    1
93     0
Name: placed, dtype: int64

In [168]:
y_test.head()

380    1
288    0
328    1
208    1
284    1
Name: placed, dtype: int64

In [169]:
# Candidate classifiers to compare.
# RandomForestClassifier is a randomized estimator; pinning random_state makes its
# reported accuracy reproducible from run to run.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=121),
]

In [170]:
def compare_models_train_test(estimators=None, split=None):
    """Fit each classifier on a training split and print its test-set accuracy.

    Parameters
    ----------
    estimators : iterable of scikit-learn estimators, optional
        Models to evaluate. When omitted, the notebook-level ``models`` list
        is used.
    split : tuple, optional
        ``(X_train, X_test, y_train, y_test)``, in the order returned by
        ``train_test_split``. When omitted, the notebook-level variables of
        those names are used, looked up at call time.

    Returns
    -------
    None
        One line per model is printed; nothing is returned.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if estimators is None:
        estimators = models
    if split is None:
        split = (X_train, X_test, y_train, y_test)
    features_train, features_test, labels_train, labels_test = split

    for model in estimators:
        # fit() mutates the estimator, so the objects in `estimators` are left
        # in their fitted state after this call.
        model.fit(features_train, labels_train)

        test_data_prediction = model.predict(features_test)
        accuracy = accuracy_score(labels_test, test_data_prediction)

        print('Accuracy score of the ', model, ' = ', accuracy)



In [171]:
# Evaluate every model on the plain (non-stratified) 70/30 split created above.
compare_models_train_test()

Accuracy score of the  LogisticRegression(max_iter=1000)  =  0.48
Accuracy score of the  SVC(kernel='linear')  =  0.48
Accuracy score of the  KNeighborsClassifier()  =  0.5233333333333333
Accuracy score of the  RandomForestClassifier()  =  0.5133333333333333


# Stratified Train Test split

In [172]:
# Same 70/30 split and seed as the earlier split, but stratify=y keeps the class
# proportions of `placed` (nearly) equal in the train and test parts.
# train_test_split is already imported in the first cell, so it is not
# re-imported here.
# NOTE: this rebinds X_train/X_test/y_train/y_test, replacing the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=121
)

In [173]:
# NOTE(review): an earlier cell already defines a function with this exact name;
# running this cell replaces that definition. The body only reads the
# notebook-level names when it is called, so calling the existing function after
# re-splitting the data works without redefining it — consider removing this cell.
def compare_models_train_test():
  """Fit every estimator in the notebook-level ``models`` list and print its accuracy.

  Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from the notebook
  namespace at call time; takes no arguments and returns ``None``.
  """

  for model in models:

    # training the model
    model.fit(X_train, y_train)
    
    # evaluating the model
    test_data_prediction = model.predict(X_test)

    accuracy = accuracy_score(y_test, test_data_prediction)

    print('Accuracy score of the ', model, ' = ', accuracy)

In [174]:
# Re-run the comparison; X_train/X_test/y_train/y_test now hold the stratified split.
compare_models_train_test()

Accuracy score of the  LogisticRegression(max_iter=1000)  =  0.5066666666666667
Accuracy score of the  SVC(kernel='linear')  =  0.51
Accuracy score of the  KNeighborsClassifier()  =  0.52
Accuracy score of the  RandomForestClassifier()  =  0.47


# K Fold Cross Validation

In [175]:
# 5-fold cross-validation of logistic regression on the full X, y.
cv_score_lr = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
print(cv_score_lr)

# Mean fold accuracy expressed as a percentage, rounded to two decimals.
mean_accuracy_lr = round(sum(cv_score_lr) / len(cv_score_lr) * 100, 2)
print(mean_accuracy_lr)

[0.475 0.47  0.47  0.525 0.395]
46.7


In [176]:
# 5-fold cross-validation of a linear-kernel SVC on the full X, y.
cv_score_svc = cross_val_score(SVC(kernel='linear'), X, y, cv=5)
print(cv_score_svc)

# Mean fold accuracy expressed as a percentage, rounded to two decimals.
mean_accuracy_svc = round(sum(cv_score_svc) / len(cv_score_svc) * 100, 2)
print(mean_accuracy_svc)

[0.51  0.475 0.475 0.505 0.425]
47.8


In [177]:
# Candidate classifiers for the cross-validation comparison.
# RandomForestClassifier is a randomized estimator; pinning random_state makes its
# fold accuracies reproducible from run to run.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=121),
]

In [178]:
def compare_models_cross_validation(estimators=None, features=None, target=None, cv=5):
    """Print per-fold and mean cross-validation accuracy for each classifier.

    Parameters
    ----------
    estimators : iterable of scikit-learn estimators, optional
        Models to evaluate. When omitted, the notebook-level ``models`` list
        is used.
    features : array-like or DataFrame, optional
        Predictor matrix. When omitted, the notebook-level ``X`` is used.
    target : array-like or Series, optional
        Labels. When omitted, the notebook-level ``y`` is used.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    None
        Three lines per model are printed; nothing is returned.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if estimators is None:
        estimators = models
    if features is None:
        features = X
    if target is None:
        target = y

    for model in estimators:
        cv_score = cross_val_score(model, features, target, cv=cv)

        # Mean fold accuracy expressed as a percentage, rounded to two decimals.
        mean_accuracy = round(cv_score.mean() * 100, 2)

        print('Cross Validation accuracies for ', model, '=  ', cv_score)
        print('Accuracy % of the ', model, mean_accuracy)
        print('----------------------------------------------')


In [179]:
# Run 5-fold cross-validation for every model in `models` on the full X, y.
compare_models_cross_validation()

Cross Validation accuracies for  LogisticRegression(max_iter=1000) =   [0.475 0.47  0.47  0.525 0.395]
Accuracy % of the  LogisticRegression(max_iter=1000) 46.7
----------------------------------------------
Cross Validation accuracies for  SVC(kernel='linear') =   [0.51  0.475 0.475 0.505 0.425]
Accuracy % of the  SVC(kernel='linear') 47.8
----------------------------------------------
Cross Validation accuracies for  KNeighborsClassifier() =   [0.5   0.47  0.56  0.475 0.47 ]
Accuracy % of the  KNeighborsClassifier() 49.5
----------------------------------------------
Cross Validation accuracies for  RandomForestClassifier() =   [0.48  0.475 0.43  0.47  0.51 ]
Accuracy % of the  RandomForestClassifier() 47.3
----------------------------------------------
