In [29]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [4]:
path_of_input_file = r'D:\kaggle_trials\graduate-admissions\Admission_Predict_Ver1.1.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(4)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8


<div class="alert alert-block alert-info">
<b>Categorical and Numerical Columns Identification:</b> We identify categorical and numerical columns from the data. We do set a threshold that if any categorical value is classified as numerical, then it has to be classified back to categorical if the number of distinct values of that column in the dataframe is less than 10 
</div>

In [9]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('Chance of Admit ','Serial No.')

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Encoding and Feature Scaling:</b> We do the one hot encoding of categorical values and scale(by using MinMaxScaler) the numerical values to get the final feature matrix X. Subseqently, we consider the SalePrice column to be our target variable
</div>

In [15]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['Chance of Admit ']])


<div class="alert alert-block alert-info">
<b>Train Test split:</b> We perform train test split on the data
</div>

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Grid and tuning :</b> We set up the grid of values for parameter tuning 
</div>

In [18]:
lasso_reg_grid  = {'alpha' : hp.uniform('alpha',0.01,5),
                      'precompute' : hp.choice('precompute',[True,False])
                     }

In [20]:
def hyperopt_train_test(params):
    reg = Lasso(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, lasso_reg_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(lasso_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 30/30 [00:00<00:00, 205.33it/s, best loss: -0.08091812155801115]
The best parameter tuned on training set is given by :-  {'alpha': 0.049550797650968814, 'precompute': False}


In [23]:
model = Lasso(**best_parameters)
model.fit(X_train, y_train)

Lasso(alpha=0.049550797650968814, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  -101.73823153940464


<div class="alert alert-block alert-info">
<b>Conclusion :</b> LASSO gave a very bad fit to the data. We can clearly see that RSS wa higher than TSS resulting in negative Coefficient of determination. We move ahead and try to fit a random forest regressor on the data
</div>

In [25]:
random_forest_grid = {'n_estimators' : hp.choice('n_estimators',range(5,50)),
                      'min_samples_split' : hp.uniform('min_samples_split',0.01,0.95),
                      'min_samples_leaf'  : hp.choice('min_samples_leaf',range(1,10)),
                      'max_features'      : hp.choice('max_features',['auto','sqrt','log2',None])
                     }

In [26]:
def hyperopt_train_test(params):
    reg = RandomForestRegressor(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, random_forest_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(random_forest_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:02<00:00, 10.34it/s, best loss: -0.8222914287771547]
The best parameter tuned on training set is given by :-  {'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 0.011695696160138769, 'n_estimators': 42}


In [27]:
rf_regressor = RandomForestRegressor(**best_parameters)
rf_regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3,
                      min_samples_split=0.011695696160138769,
                      min_weight_fraction_leaf=0.0, n_estimators=42,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [28]:
y_pred = rf_regressor.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.8065497749146603


<div class="alert alert-block alert-info">
<b>Random Forest Conclusion:</b> Random Forest gave a relative better fit as compared to Lasso
</div>

In [31]:
k_neighbors_grid  = {'n_neighbors' : hp.choice('n_neighbors',range(5,50)),
                      'weights' : hp.choice('wrights',['uniform','distance']),
                      'algorithm'  : hp.choice('algorithm',['auto','ball_tree','kd_tree','brute']),
                      'leaf_size' : hp.choice('leaf_size',range(20,90)),
                     'metric'      : hp.choice('metric',['euclidean','manhattan','chebyshev','minkowski'])
                     }

In [33]:
def hyperopt_train_test(params):
    reg = KNeighborsRegressor(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, k_neighbors_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(k_neighbors_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:00<00:00, 75.98it/s, best loss: -0.7245810446250394]
The best parameter tuned on training set is given by :-  {'algorithm': 'ball_tree', 'leaf_size': 56, 'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}


In [34]:
knbrs_reg = KNeighborsRegressor(**best_parameters)
knbrs_reg.fit(X_train,y_train)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=56, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                    weights='distance')

In [35]:
y_pred = knbrs_reg.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.5868586691761426


<div class="alert alert-block alert-info">
<b>kneighbors Conclusion:</b> We still got better results than LASSO but Random Forest still remains the better model to fit in our scenario
</div>