In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('cleaned_data.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research', 'Chance of Admit'],
      dtype='object')

In [4]:
df.drop(columns='Unnamed: 0',inplace=True)
df

Unnamed: 0,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,118,4,4.5,4.5,9.65,1,0.92
1,107,4,4.0,4.5,8.87,1,0.76
2,104,3,3.0,3.5,8.00,1,0.72
3,110,3,3.5,2.5,8.67,1,0.80
4,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...
495,108,5,4.5,4.0,9.02,1,0.87
496,117,5,5.0,5.0,9.87,1,0.96
497,120,5,4.5,5.0,9.56,1,0.93
498,103,4,4.0,5.0,8.43,0,0.73


### X and y

In [5]:
X=df.drop(columns='Chance of Admit')
y=df['Chance of Admit']

### Train_test_split

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Modelling and Evaluation


### Hyperparameter Tuning

# Lasso

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

estimator=Lasso()
param_grid=[{'alpha':list(range(1,100))}]

hpt_model=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_root_mean_squared_error')

hpt_model.fit(X_train,y_train)

hpt_model.best_params_

{'alpha': 1}

In [8]:
# alpha = 1 modelling 

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

model_0=Lasso(alpha=1)

model_0.fit(X_train,y_train)

# predicting train values
ypred_train_0=model_0.predict(X_train)

# Train evaulation
print('Cross Validation score ',cross_val_score(model_0,X_train,y_train,cv=5).mean())
print('R2 train value ',r2_score(y_train,ypred_train_0))


# predicting test values
ypred_test_0=model_0.predict(X_test)

# Test evaulation
print('R2 test value ',r2_score(y_test,ypred_test_0))

Cross Validation score  -0.006168552353700951
R2 train value  0.0
R2 test value  -0.00724844132029312


In [9]:
print('Coefficients of lasso ',model_0.coef_)
print('Intercepts of Coefficients ',model_0.intercept_)

Coefficients of lasso  [0. 0. 0. 0. 0. 0.]
Intercepts of Coefficients  0.7241749999999999


In [10]:
# Lasso Regression not working, since all the coefficients are 0

# Ridge Regression




In [11]:
from sklearn.linear_model import Ridge

estimator=Ridge()
param_grid=[{'alpha':list(range(1,100))}]

hpt_model=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_root_mean_squared_error')

hpt_model.fit(X_train,y_train)

hpt_model.best_params_

{'alpha': 1}

In [12]:
# alpha = 1 modelling 

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

model_1=Ridge(alpha=1)

model_1.fit(X_train,y_train)

# predicting train values
ypred_train_0=model_1.predict(X_train)

# Train evaulation
print('Cross Validation score ',cross_val_score(model_1,X_train,y_train,cv=5).mean())
print('R2 train value ',r2_score(y_train,ypred_train_0))


# predicting test values
ypred_test_0=model_1.predict(X_test)

# Test evaulation
print('R2 test value ',r2_score(y_test,ypred_test_0))

Cross Validation score  0.8001145365944804
R2 train value  0.8129766331422732
R2 test value  0.8284308425195921


In [13]:
df_train,df_test=train_test_split(df,test_size=0.2,random_state=42)

from statsmodels.formula.api import ols

model_ols=ols('y_train~X_train',data=df_train).fit()
model_ols.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.813
Model:,OLS,Adj. R-squared:,0.81
Method:,Least Squares,F-statistic:,284.8
Date:,"Wed, 01 Jan 2025",Prob (F-statistic):,1.04e-139
Time:,12:14:41,Log-Likelihood:,553.12
No. Observations:,400,AIC:,-1092.0
Df Residuals:,393,BIC:,-1064.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.0024,0.073,-13.647,0.000,-1.147,-0.858
X_train[0],0.0047,0.001,5.491,0.000,0.003,0.006
X_train[1],0.0037,0.004,0.861,0.390,-0.005,0.012
X_train[2],0.0012,0.005,0.224,0.823,-0.009,0.011
X_train[3],0.0161,0.005,3.448,0.001,0.007,0.025
X_train[4],0.1313,0.010,13.129,0.000,0.112,0.151
X_train[5],0.0339,0.007,4.698,0.000,0.020,0.048

0,1,2,3
Omnibus:,77.65,Durbin-Watson:,2.067
Prob(Omnibus):,0.0,Jarque-Bera (JB):,150.254
Skew:,-1.058,Prob(JB):,2.36e-33
Kurtosis:,5.13,Cond. No.,2600.0


In [14]:
df

Unnamed: 0,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,118,4,4.5,4.5,9.65,1,0.92
1,107,4,4.0,4.5,8.87,1,0.76
2,104,3,3.0,3.5,8.00,1,0.72
3,110,3,3.5,2.5,8.67,1,0.80
4,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...
495,108,5,4.5,4.0,9.02,1,0.87
496,117,5,5.0,5.0,9.87,1,0.96
497,120,5,4.5,5.0,9.56,1,0.93
498,103,4,4.0,5.0,8.43,0,0.73


## Assumptions failed, independence of errors

## drop SOP

In [15]:
df.drop(columns='SOP',inplace=True)
X=df.drop(columns='Chance of Admit')
y=df['Chance of Admit']

## Train test split

In [16]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Modelling and Evaluation
## Hyperparameter Tuning


In [17]:
estimator=Ridge()
param_grid=[{'alpha':list(range(1,100))}]

hpt_model_1=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_root_mean_squared_error')

hpt_model_1.fit(X_train,y_train)

hpt_model_1.best_params_

{'alpha': 1}

In [18]:
# alpha = 1 modelling 

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

model_1=Ridge(alpha=1)

model_1.fit(X_train,y_train)

# predicting train values
ypred_train_0=model_1.predict(X_train)

# Train evaulation
print('Cross Validation score ',cross_val_score(model_1,X_train,y_train,cv=5).mean())
print('R2 train value ',r2_score(y_train,ypred_train_0))


# predicting test values
ypred_test_0=model_1.predict(X_test)

# Test evaulation
print('R2 test value ',r2_score(y_test,ypred_test_0))

Cross Validation score  0.8023354822270555
R2 train value  0.8129550053651619
R2 test value  0.828408249070475


In [19]:
model_ols=ols('y_train~X_train',data=df_train).fit()
model_ols.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.813
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,342.6
Date:,"Wed, 01 Jan 2025",Prob (F-statistic):,5.48e-141
Time:,12:14:44,Log-Likelihood:,553.1
No. Observations:,400,AIC:,-1094.0
Df Residuals:,394,BIC:,-1070.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.0068,0.071,-14.231,0.000,-1.146,-0.868
X_train[0],0.0048,0.001,5.558,0.000,0.003,0.006
X_train[1],0.0040,0.004,0.996,0.320,-0.004,0.012
X_train[2],0.0164,0.004,3.690,0.000,0.008,0.025
X_train[3],0.1318,0.010,13.462,0.000,0.113,0.151
X_train[4],0.0339,0.007,4.705,0.000,0.020,0.048

0,1,2,3
Omnibus:,77.203,Durbin-Watson:,2.065
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148.788
Skew:,-1.054,Prob(JB):,4.9100000000000005e-33
Kurtosis:,5.117,Cond. No.,2500.0


## Drop University rating

In [20]:
df.drop(columns='University Rating',inplace=True)
X=df.drop(columns='Chance of Admit')
y=df['Chance of Admit']

## train_test_split

In [21]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# modelling

In [22]:
estimator=Ridge()
param_grid=[{'alpha':list(range(1,100))}]

hpt_model_1=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_root_mean_squared_error')

hpt_model_1.fit(X_train,y_train)

hpt_model_1.best_params_

{'alpha': 1}

In [23]:
# alpha = 1 modelling 

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

model_1=Ridge(alpha=1)

model_1.fit(X_train,y_train)

# predicting train values
ypred_train_0=model_1.predict(X_train)

# Train evaulation
print('Cross Validation score ',cross_val_score(model_1,X_train,y_train,cv=5).mean())
print('R2 train value ',r2_score(y_train,ypred_train_0))


# predicting test values
ypred_test_0=model_1.predict(X_test)

# Test evaulation
print('R2 test value ',r2_score(y_test,ypred_test_0))

Cross Validation score  0.803482448174211
R2 train value  0.8124870729851568
R2 test value  0.8242190292466554


In [24]:
model_ols=ols('y_train~X_train',data=df_train).fit()
model_ols.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.813
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,428.0
Date:,"Wed, 01 Jan 2025",Prob (F-statistic):,4.07e-142
Time:,12:14:46,Log-Likelihood:,552.59
No. Observations:,400,AIC:,-1095.0
Df Residuals:,395,BIC:,-1075.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.0381,0.063,-16.378,0.000,-1.163,-0.913
X_train[0],0.0049,0.001,5.855,0.000,0.003,0.007
X_train[1],0.0177,0.004,4.139,0.000,0.009,0.026
X_train[2],0.1343,0.009,14.196,0.000,0.116,0.153
X_train[3],0.0347,0.007,4.854,0.000,0.021,0.049

0,1,2,3
Omnibus:,75.892,Durbin-Watson:,2.076
Prob(Omnibus):,0.0,Jarque-Bera (JB):,144.869
Skew:,-1.042,Prob(JB):,3.48e-32
Kurtosis:,5.085,Cond. No.,2240.0


## Accuracy : 82.42 % Ridge