In [22]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb



In [8]:
df = pd.read_csv('Admission_Prediction.csv')

In [9]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          485 non-null    float64
 2   TOEFL Score        490 non-null    float64
 3   University Rating  485 non-null    float64
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(7), int64(2)
memory usage: 35.3 KB


In [11]:
df.isnull().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [12]:
df.shape

(500, 9)

In [15]:
df['GRE Score'] = df['GRE Score'].fillna(df['GRE Score'].median())
df['TOEFL Score'] = df['TOEFL Score'].fillna(df['TOEFL Score'].median())
df['University Rating'] = df['University Rating'].fillna(df['University Rating'].median())

In [16]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [17]:
x = df.iloc[:, 1:-1]

In [18]:
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.0,118.0,4.0,4.5,4.5,9.65,1
1,324.0,107.0,4.0,4.0,4.5,8.87,1
2,317.0,104.0,3.0,3.0,3.5,8.00,1
3,322.0,110.0,3.0,3.5,2.5,8.67,1
4,314.0,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.0,108.0,5.0,4.5,4.0,9.02,1
496,337.0,117.0,5.0,5.0,5.0,9.87,1
497,330.0,120.0,5.0,4.5,5.0,9.56,1
498,312.0,103.0,4.0,4.0,5.0,8.43,0


In [19]:
y = df.iloc[:,-1]

In [20]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

In [23]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [24]:
scaler = StandardScaler()

In [25]:
scaled_x_train = scaler.fit_transform(x_train)

In [26]:
scaler_x_test = scaler.transform(x_test)

In [34]:
def objective(trail, data=x, target = y):
     x_train,x_test,y_train,y_test = train_test_split(data,target,test_size=0.25,random_state=42)
     param = {
          'tree_method':'gpu_hist',
          'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
          'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
          'colsample_bytree':trail.suggest_categorical('colsample_bytree', [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]),
          'subsample':trail.suggest_categorical('subsample',[.1,.2,.3,.4,.5,.6,.7,.8,.9,1]),
          'learning_rate':trail.suggest_categorical('learning_rate', [.00001,.0003,.008,.02,.01,1,8]),
          'n_estimators':3000,
          'max_depth':trail.suggest_categorical('max_depth',[3,4,5,6,7,8,9,10,11,12]),
          'random_state':trail.suggest_categorical('random_state',[10,20,30,2000,3454,243123]),
          'min_child_weight':trail.suggest_int('min_child_weight',1,200)
           
          
     }
     
     model = xgb.XGBRFRegressor()
     model.fit(x_train,y_train, eval_set = [(x_test,y_test)],verbose = True)
     pred = model.predict(x_test)
     mse = mean_squared_error(y_test,pred)
     return mse

In [35]:
find_params = optuna.create_study()
find_params.optimize(objective,n_trials=10)
find_params.best_trial.params

[I 2024-11-17 09:50:48,027] A new study created in memory with name: no-name-95e4e12e-5708-4207-b540-2d03c5ff4689
  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),


[0]	validation_0-rmse:0.06544


[I 2024-11-17 09:50:48,268] Trial 0 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.0031753379329223868, 'alpha': 0.07328385979289333, 'colsample_bytree': 1, 'subsample': 0.7, 'learning_rate': 0.0003, 'max_depth': 4, 'random_state': 10, 'min_child_weight': 123}. Best is trial 0 with value: 0.004282974289239958.
  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),


[0]	validation_0-rmse:0.06544


[I 2024-11-17 09:50:48,494] Trial 1 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.07343692039340279, 'alpha': 0.3880377025356733, 'colsample_bytree': 1, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'random_state': 2000, 'min_child_weight': 169}. Best is trial 0 with value: 0.004282974289239958.
  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),


[0]	validation_0-rmse:0.06544


[I 2024-11-17 09:50:48,716] Trial 2 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.00021435576256640908, 'alpha': 2.8034617992921387, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 9, 'random_state': 20, 'min_child_weight': 3}. Best is trial 0 with value: 0.004282974289239958.


[0]	validation_0-rmse:0.06544


  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
[I 2024-11-17 09:50:48,936] Trial 3 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.0040225938849253245, 'alpha': 0.0011320693534194117, 'colsample_bytree': 0.9, 'subsample': 0.9, 'learning_rate': 1, 'max_depth': 4, 'random_state': 243123, 'min_child_weight': 81}. Best is trial 0 with value: 0.004282974289239958.
  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),


[0]	validation_0-rmse:0.06544


[I 2024-11-17 09:50:49,161] Trial 4 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.03714153244753724, 'alpha': 2.4761423311967286, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.0003, 'max_depth': 4, 'random_state': 10, 'min_child_weight': 49}. Best is trial 0 with value: 0.004282974289239958.


[0]	validation_0-rmse:0.06544


  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
[I 2024-11-17 09:50:49,376] Trial 5 finished with value: 0.004282974289239958 and parameters: {'lambda': 4.220942144343207, 'alpha': 0.0031355753278869366, 'colsample_bytree': 0.1, 'subsample': 0.6, 'learning_rate': 0.0003, 'max_depth': 11, 'random_state': 2000, 'min_child_weight': 126}. Best is trial 0 with value: 0.004282974289239958.


[0]	validation_0-rmse:0.06544


  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
[I 2024-11-17 09:50:49,580] Trial 6 finished with value: 0.004282974289239958 and parameters: {'lambda': 1.968365914120865, 'alpha': 0.06428815591136897, 'colsample_bytree': 0.4, 'subsample': 0.9, 'learning_rate': 0.02, 'max_depth': 4, 'random_state': 2000, 'min_child_weight': 43}. Best is trial 0 with value: 0.004282974289239958.


[0]	validation_0-rmse:0.06544


  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
[I 2024-11-17 09:50:49,788] Trial 7 finished with value: 0.004282974289239958 and parameters: {'lambda': 4.97097483179545, 'alpha': 0.8161838679384209, 'colsample_bytree': 0.7, 'subsample': 0.2, 'learning_rate': 8, 'max_depth': 8, 'random_state': 243123, 'min_child_weight': 78}. Best is trial 0 with value: 0.004282974289239958.


[0]	validation_0-rmse:0.06544


  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
[I 2024-11-17 09:50:50,012] Trial 8 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.04485528669777853, 'alpha': 0.009783494615883436, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 9, 'random_state': 30, 'min_child_weight': 40}. Best is trial 0 with value: 0.004282974289239958.


[0]	validation_0-rmse:0.06544


  'lambda':trail.suggest_loguniform('lambda', 1e-4,10.0),
  'alpha':trail.suggest_loguniform('alpha',1e-4,10.4),
[I 2024-11-17 09:50:50,234] Trial 9 finished with value: 0.004282974289239958 and parameters: {'lambda': 0.0007781982712384326, 'alpha': 0.3755226339574379, 'colsample_bytree': 0.8, 'subsample': 0.9, 'learning_rate': 1e-05, 'max_depth': 11, 'random_state': 3454, 'min_child_weight': 127}. Best is trial 0 with value: 0.004282974289239958.


{'lambda': 0.0031753379329223868,
 'alpha': 0.07328385979289333,
 'colsample_bytree': 1,
 'subsample': 0.7,
 'learning_rate': 0.0003,
 'max_depth': 4,
 'random_state': 10,
 'min_child_weight': 123}

In [36]:
best_params = {'lambda': 0.0031753379329223868,
 'alpha': 0.07328385979289333,
 'colsample_bytree': 1,
 'subsample': 0.7,
 'learning_rate': 0.0003,
 'max_depth': 4,
 'random_state': 10,
 'min_child_weight': 123}

In [39]:
model = xgb.XGBRFRegressor(**best_params)

In [40]:
model.fit(scaled_x_train,y_train)

In [41]:
y_pred = model.predict(scaler_x_test)

In [42]:
from sklearn.metrics import r2_score

In [43]:
r2_score(y_test,y_pred)

-0.002800627552787427