In [1]:
!pip install xgboost==1.5.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost==1.5.2
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
[K     |████████████████████████████████| 173.6 MB 8.9 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.5.2


In [2]:
import pandas as pd 
import numpy as np
import seaborn as sb
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
import math
from xgboost import XGBRegressor,XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [3]:
# Already have asteroid dataset in drive. Downloading it from drive
!gdown --id 1-afEh_SyLdQh1tFH3mlcCgXLPj4FWSKm

Downloading...
From: https://drive.google.com/uc?id=1-afEh_SyLdQh1tFH3mlcCgXLPj4FWSKm
To: /content/Cleaned_Asteroid.csv
100% 174M/174M [00:02<00:00, 80.1MB/s]


In [4]:
# Load the cleaned asteroid table that the gdown cell above saved as Cleaned_Asteroid.csv.
df=pd.read_csv("Cleaned_Asteroid.csv")

In [5]:
# Cast the target column to float and the condition_code column to int,
# then preview the first rows.
df=df.astype({'diameter':float,'condition_code':int})
df.head(5)

Unnamed: 0,full_name,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,diameter,albedo,neo,pha,moid,diam_bin
0,1 Ceres,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0,1002,3.34,939.4,0.09,N,N,1.59478,Very Large
1,2 Pallas,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,0,8490,4.13,545.0,0.101,N,N,1.23324,Very Large
2,3 Juno,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,72684.0,0,7104,5.33,246.596,0.214,N,N,1.03454,Large
3,4 Vesta,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,0,9325,3.2,525.4,0.4228,N,N,1.13948,Very Large
4,5 Astraea,2.574249,0.191095,5.366988,141.576604,358.687608,2.082324,3.066174,4.130323,63431.0,0,2861,6.85,106.699,0.274,N,N,1.09589,Large


### Feature Selection

#### We are using Correlation Matrix to select Most Important Feature

In [None]:
# Pearson correlation of every numeric column with the target, strongest first.
# The text columns (full_name, neo, pha, diam_bin) are excluded explicitly:
# DataFrame.corr() no longer drops non-numeric columns silently on pandas >= 2.0
# and raises instead.
corr = df.select_dtypes(include='number').corr()
corr['diameter'].abs().sort_values(ascending=False)

diameter          1.000000
H                 0.566501
data_arc          0.492110
n_obs_used        0.386038
moid              0.332416
q                 0.329698
a                 0.144748
albedo            0.106077
ad                0.093440
condition_code    0.073546
i                 0.052540
e                 0.049107
per_y             0.048955
w                 0.002980
om                0.001155
Name: diameter, dtype: float64

After computing the correlation matrix, we find that H, data_arc, n_obs_used, moid, q, a and albedo have the highest absolute correlation with diameter.

Select the features with higher absolute correlation value.

In [6]:
# Keep the seven numeric features with the highest absolute correlation to
# diameter, the two flag columns (neo, pha) and the target itself.
df=df[['H','data_arc','n_obs_used','moid','q','a','albedo','neo','pha','diameter']]

## Splitting the dataset

* Since diameter has missing values, the rows with a missing diameter form the test (prediction) set, and the rows with a known diameter form the training set.

* We can also perform Validation Split for better convenience

In [None]:
# Labelled subset: keep only fully populated rows, then separate the
# feature matrix from the diameter target.
df1 = df.dropna()

x = df1.drop(columns="diameter")
y = df1["diameter"]

print(x.shape, y.shape)

(137681, 9) (137681,)


In [None]:
def get_gen_grp(diam):
    """Map a diameter to its size class.

    Returns 'Small' (<= 10), 'Medium' (<= 100), 'Large' (<= 500) or
    'Very Large' (> 500). A value for which every comparison is false
    (e.g. NaN) yields 'Missing'.
    """
    # Guard clauses in ascending order: each check only runs when all the
    # smaller bounds have already failed.
    if diam <= 10:
        return 'Small'
    if diam <= 100:
        return 'Medium'
    if diam <= 500:
        return 'Large'
    if diam > 500:
        return 'Very Large'
    return 'Missing'

In [None]:
# Label every diameter with its size class; used below as the stratification key.
y_binned = y.apply(get_gen_grp)

In [None]:
# Two-stage stratified split: 67/33 train/test, then 80/20 train/validation.
# Passing the size-class labels to `stratify` keeps the share of each class
# the same in every split.
xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.33, stratify=y_binned,random_state=42)

# The training-only bins get their own name: overwriting y_binned here would
# leave it shorter than x, so re-running this cell would fail in the first split.
ytr_binned = ytr.apply(get_gen_grp)
xtr, xcv, ytr, ycv = train_test_split(xtr, ytr, test_size=0.2, stratify=ytr_binned,random_state=42)

In [7]:
# Making this For missing Diameter Points
# Rows without a diameter are the ones to predict; fully populated rows are
# the training data.
test_data = df[df["diameter"].isna()]
data = df.dropna()

x_train = data.drop(columns="diameter")
y_train = data["diameter"]
x_test = test_data.drop(columns="diameter")

In [None]:
# Report the size of each split of the labelled data.
for label, frame in (("Train data : ", xtr),
                     ("Test data : ", xte),
                     ("Validation Data : ", xcv)):
    print(label, frame.shape)

Train data :  (73796, 9)
Test data :  (45435, 9)
Validation Data :  (18450, 9)


In [8]:
# Taking missing diameter values as test data
for label, frame in (("Train data : ", x_train), ("Test data : ", x_test)):
    print(label, frame.shape)

Train data :  (137681, 9)
Test data :  (702055, 9)


## Encoding the data

* For Categorical Features

In [10]:
from sklearn.preprocessing import OneHotEncoder
# One shared encoder, re-fitted for each flag column in the cells below.
# drop='if_binary' keeps a single 0/1 column for a two-category feature,
# handle_unknown='error' raises on a category not seen during fit, and
# sparse=False returns a dense array.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 — confirm the installed version.
enc = OneHotEncoder(handle_unknown='error',drop='if_binary',sparse=False)

In [None]:
# NEO
# Learn the categories from the training split only, then reuse them
# unchanged for the test and validation splits.
xtr_neo_encode = enc.fit_transform(xtr[['neo']].to_numpy())
xte_neo_encode = enc.transform(xte[['neo']].to_numpy())
xcv_neo_encode = enc.transform(xcv[['neo']].to_numpy())

In [11]:
# NEO for missing diameter values
# The encoder is re-fitted on the labelled rows and applied to the rows to predict.
x_train_neo_encode = enc.fit_transform(x_train[['neo']].to_numpy())
x_test_neo_encode = enc.transform(x_test[['neo']].to_numpy())

In [None]:
# PHA
# Same pattern as NEO: fit on the training split, transform all three splits.
xtr_pha_encode = enc.fit_transform(xtr[['pha']].to_numpy())
xte_pha_encode = enc.transform(xte[['pha']].to_numpy())
xcv_pha_encode = enc.transform(xcv[['pha']].to_numpy())

In [12]:
# PHA for missing diameter values
x_train_pha_encode = enc.fit_transform(x_train[['pha']].to_numpy())
x_test_pha_encode = enc.transform(x_test[['pha']].to_numpy())

* For Numerical Features

In [13]:
# Shared scaler object; the cells below fit it more than once (on xtr and on
# x_train), so its statistics depend on which fit ran last.
sd=StandardScaler()

#### Train Data

In [None]:
# Standardise the numeric training features; the scaler is fitted on the
# training split here.
train_numeric = xtr[['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']]
xtr_norm = pd.DataFrame(data=sd.fit_transform(train_numeric),
                        columns=list(train_numeric.columns))
xtr_norm.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid
0,0.194575,-0.228851,-0.20396,-0.188852,-0.486813,-0.86697,-0.264413
1,0.192109,0.452161,2.454209,1.508338,-0.980177,-0.500787,0.426278
2,-0.092617,-0.217243,-0.143081,-0.963731,1.345683,-0.665569,-0.243129
3,0.230832,0.08436,-0.105456,-0.152851,-0.980177,-0.491632,0.240638
4,0.115235,0.386575,-0.47073,-0.279712,-0.204891,0.021025,0.413315


In [14]:
# for missing diameter values
# Re-fits the shared scaler `sd` on all labelled rows, replacing whatever
# statistics it held before.
labelled_numeric = x_train[['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']]
x_train_norm = pd.DataFrame(data=sd.fit_transform(labelled_numeric),
                            columns=list(labelled_numeric.columns))
x_train_norm.shape

(137681, 7)

#### Test Data

In [None]:
# Scale the test split with statistics learned from the training split only.
# `sd` cannot be reused here: the "for missing diameter values" cell above
# re-fits it on x_train (all labelled rows, test rows included), which would
# leak test data into the scaling and put xte on a different scale than
# xtr_norm. xtr is still the raw training frame at this point.
numeric_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']
split_scaler = StandardScaler().fit(xtr[numeric_cols])
xte_norm = pd.DataFrame(data=split_scaler.transform(xte[numeric_cols]),
                        columns=numeric_cols)
xte_norm.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid
0,-0.318625,-1.029335,-0.27575,-0.610052,0.925903,1.675331,-1.093568
1,0.119695,0.17096,-0.271366,-0.246037,0.00797,-0.526874,0.179751
2,0.02227,-0.270871,-1.447034,-1.107998,1.137734,-0.645665,-0.254651
3,0.099376,1.026296,-0.256913,0.627943,-0.909964,0.386904,1.002517
4,0.172358,-0.198264,-0.397864,0.210699,-0.556912,-0.536012,-0.168756


In [15]:
# for missing diameter values
# Apply the scaler (last fitted on x_train when cells run in order) to the
# rows whose diameter is to be predicted.
unlabelled_numeric = x_test[['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']]
x_test_norm = pd.DataFrame(data=sd.transform(unlabelled_numeric),
                           columns=list(unlabelled_numeric.columns))
x_test_norm.shape

(702055, 7)

#### Validation Data

In [None]:
# Scale the validation split with statistics learned from the training split
# only. `sd` was re-fitted on x_train (all labelled rows, validation rows
# included) by the "for missing diameter values" cell above, so reusing it
# would leak validation data into the scaling and mismatch xtr_norm.
# xtr is still the raw training frame at this point.
numeric_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']
split_scaler = StandardScaler().fit(xtr[numeric_cols])
xcv_norm = pd.DataFrame(data=split_scaler.transform(xcv[numeric_cols]),
                        columns=numeric_cols)
xcv_norm.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid
0,-0.31166,-1.147161,-0.276724,-0.536219,0.784683,-0.855834,-1.145684
1,-0.197834,-0.152285,-0.404846,-0.682169,0.855293,-0.618252,-0.176829
2,-0.320636,-0.884338,2.653353,2.597402,-0.627523,1.757572,-0.932238
3,-0.365295,-0.468528,0.960001,1.024582,-0.133251,3.603403,-0.476485
4,-0.15519,-0.495946,-0.108656,0.226152,0.14919,1.318958,-0.491498


#### Encoding Numerical and Categorical Features

* Train Data

In [None]:
# Final training matrix: scaled numeric columns plus the two encoded flags.
# assign() builds a new frame, so no column is written into a subset taken
# from xtr_norm (the pattern behind pandas' SettingWithCopyWarning).
xtr = xtr_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=xtr_neo_encode,
    pha=xtr_pha_encode,
)
print(xtr.shape)
xtr.head(5)

(73796, 9)


Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,0.194575,-0.228851,-0.20396,-0.188852,-0.486813,-0.86697,-0.264413,0.0,0.0
1,0.192109,0.452161,2.454209,1.508338,-0.980177,-0.500787,0.426278,0.0,0.0
2,-0.092617,-0.217243,-0.143081,-0.963731,1.345683,-0.665569,-0.243129,0.0,0.0
3,0.230832,0.08436,-0.105456,-0.152851,-0.980177,-0.491632,0.240638,0.0,0.0
4,0.115235,0.386575,-0.47073,-0.279712,-0.204891,0.021025,0.413315,0.0,0.0


In [16]:
# for missing diameter values
# Scaled numeric columns plus the encoded flags for all labelled rows.
# assign() returns a new frame instead of writing into a subset of
# x_train_norm, which avoids pandas' SettingWithCopyWarning.
x_train = x_train_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=x_train_neo_encode,
    pha=x_train_pha_encode,
)
print(x_train.shape)
x_train.head(5)

(137681, 9)


Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,-0.029792,0.302503,-0.012622,0.588733,-8.368553,-0.36117,0.341052,0.0,0.0
1,-0.027624,-0.520575,10.305415,13.457097,-7.810651,-0.260805,-0.364798,0.0,0.0
2,-0.095484,-0.812229,10.364889,11.075212,-6.963205,0.770221,-0.752728,0.0,0.0
3,-0.297605,-0.485614,2.500587,14.89207,-8.467421,2.675338,-0.547849,0.0,0.0
4,-0.157815,-0.620434,8.861286,3.783483,-5.889774,1.317669,-0.632952,0.0,0.0


* Test Data

In [None]:
# Final test matrix: scaled numeric columns plus the two encoded flags.
# assign() returns a new frame instead of writing into a subset of xte_norm,
# which avoids pandas' SettingWithCopyWarning.
xte = xte_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=xte_neo_encode,
    pha=xte_pha_encode,
)
xte.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,-0.318625,-1.029335,-0.27575,-0.610052,0.925903,1.675331,-1.093568,0.0,0.0
1,0.119695,0.17096,-0.271366,-0.246037,0.00797,-0.526874,0.179751,0.0,0.0
2,0.02227,-0.270871,-1.447034,-1.107998,1.137734,-0.645665,-0.254651,0.0,0.0
3,0.099376,1.026296,-0.256913,0.627943,-0.909964,0.386904,1.002517,0.0,0.0
4,0.172358,-0.198264,-0.397864,0.210699,-0.556912,-0.536012,-0.168756,0.0,0.0


In [17]:
# for missing diameter values
# Scaled numeric columns plus the encoded flags for the rows to predict.
# assign() returns a new frame instead of writing into a subset of
# x_test_norm, which avoids pandas' SettingWithCopyWarning.
x_test = x_test_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=x_test_neo_encode,
    pha=x_test_pha_encode,
)
print(x_test.shape)
x_test.head(5)

(702055, 9)


Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,-0.105408,-0.397108,5.067909,2.492866,-2.11158,-0.47066,-0.39953,0.0,0.0
1,-0.133678,-1.671668,5.466519,2.599415,-2.450558,-0.47066,-1.553292,0.0,0.0
2,-0.115431,-2.335321,4.968947,2.003084,0.148275,-0.47066,-2.375477,1.0,0.0
3,-0.375146,-1.079947,4.909473,2.618319,-0.981652,-0.47066,-1.129684,0.0,0.0
4,-0.351346,-1.16469,4.660362,1.717808,-0.981652,-0.47066,-1.214086,0.0,0.0


* Validation Data

In [None]:
# Final validation matrix: scaled numeric columns plus the two encoded flags.
# assign() returns a new frame instead of writing into a subset of xcv_norm,
# which avoids pandas' SettingWithCopyWarning.
xcv = xcv_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=xcv_neo_encode,
    pha=xcv_pha_encode,
)
xcv.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,-0.31166,-1.147161,-0.276724,-0.536219,0.784683,-0.855834,-1.145684,0.0,0.0
1,-0.197834,-0.152285,-0.404846,-0.682169,0.855293,-0.618252,-0.176829,0.0,0.0
2,-0.320636,-0.884338,2.653353,2.597402,-0.627523,1.757572,-0.932238,0.0,0.0
3,-0.365295,-0.468528,0.960001,1.024582,-0.133251,3.603403,-0.476485,0.0,0.0
4,-0.15519,-0.495946,-0.108656,0.226152,0.14919,1.318958,-0.491498,0.0,0.0


#### Building Models

### Using R Squared as performance metric for the problem

* Linear Regression

In [None]:
# Assumptions of multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    """Return a frame listing each column of ``X`` with its variance inflation factor.

    X: DataFrame of numeric features.
    Returns a DataFrame with the columns "variables" and "VIF", one row per
    column of ``X``, in column order.
    """
    design = X.values
    scores = []
    for col_idx in range(X.shape[1]):
        scores.append(variance_inflation_factor(design, col_idx))

    return pd.DataFrame({"variables": X.columns, "VIF": scores})

print(calc_vif(xtr))

  import pandas.util.testing as tm


    variables         VIF
0           a    1.133476
1           q  220.180090
2    data_arc    2.534710
3  n_obs_used    5.880809
4           H    6.111423
5      albedo    1.361707
6        moid  224.130210
7         neo    1.805184
8         pha    1.297135


After looking at the above observation, it can be concluded that:
1. ‘q’ and ‘moid’ have a high VIF value, meaning they can be predicted by other independent variables in the dataset.
2. VIF is preferred as it can show the correlation of a variable with a group of other variables.
3. We can drop one feature from each highly correlated pair (here `moid`, which is dropped below).

In [None]:
# Independent copies for the linear models, so that xtr/xte/xcv keep all
# nine columns for the tree-based models.
xtr1=xtr.copy() # Will have highest correlated feature dropped
xte1=xte.copy() # Will have highest correlated feature dropped
xcv1=xcv.copy() # Will have highest correlated feature dropped

In [18]:
# for missing diameter values
# Independent copies, so that x_train/x_test keep all nine columns.
x_tr1=x_train.copy() # Will have highest correlated feature dropped
x_te1=x_test.copy() # Will have highest correlated feature dropped

#### Training the model for dropped correlated feature

In [None]:

# Remove moid from the linear-model copies of all three splits.
xtr1 = xtr1.drop(columns=['moid'])
xte1 = xte1.drop(columns=['moid'])
xcv1 = xcv1.drop(columns=['moid'])

In [19]:
# for missing diameter values
x_tr1 = x_tr1.drop(columns=['moid'])
x_te1 = x_te1.drop(columns=['moid'])

In [None]:
# Re-check the VIFs now that moid has been removed from the training copy.
print(calc_vif(xtr1))

    variables       VIF
0           a  1.133118
1           q  2.173444
2    data_arc  2.522083
3  n_obs_used  5.552479
4           H  5.670471
5      albedo  1.361400
6         neo  1.404608
7         pha  1.296783


In [None]:
# for missing diameter values
# Same VIF check on the full labelled set with moid removed.
print(calc_vif(x_tr1))

    variables       VIF
0           a  1.128233
1           q  2.228755
2    data_arc  2.506320
3  n_obs_used  5.569443
4           H  5.770094
5      albedo  1.364866
6         neo  1.445694
7         pha  1.341693


In [None]:
# Ordinary least squares on the training split (moid removed), with
# predictions for the test split.
lr = LinearRegression().fit(xtr1, ytr)
ypred = lr.predict(xte1)

In [None]:
# R^2 of the linear model on each split.
for label, feats, target in (("R^2 of train data is : ", xtr1, ytr),
                             ("R^2 of test data is : ", xte1, yte),
                             ("R^2 of validation data is : ", xcv1, ycv)):
    print(label, lr.score(feats, target))

R^2 of train data is :  0.443761567392878
R^2 of test data is :  0.4003907440505936
R^2 of validation data is :  0.4964776063301628


In [20]:
# for missing diameter values
# Re-binds `lr` and `ypred`: the model is now fitted on all labelled rows and
# predicts the rows whose diameter is missing.
lr = LinearRegression().fit(x_tr1, y_train)
ypred = lr.predict(x_te1)

In [21]:
# for missing diameter values
# R^2 on the same rows the model was fitted on (no held-out labels exist here).
print("R^2 value is : ",lr.score(x_tr1,y_train))

R^2 value is :  0.43944935293366716


* Ridge Regression

In [None]:
# These imports stay in this cell because later cells rely on them
# (arange in the ElasticNet grid, RepeatedKFold in the Lasso/ElasticNet searches).
from numpy import arange
from pandas import read_csv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
# Tune the Ridge penalty strength on the training split (moid removed).
# define model
model = Ridge()
# define model evaluation method: 10-fold CV repeated 3 times, fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = [0.001,0.01,0.1,1,10]
# define search
search = GridSearchCV(model, grid, scoring='r2', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(xtr1, ytr)
# summarize: best_score_ is the mean cross-validated R^2 (scoring='r2'), not
# an error, so it is labelled the same way as in the Lasso/ElasticNet cells
print('R Squared : %.3f' % results.best_score_)
print('Best Params : %s' % results.best_params_)

R Squared Error: 0.393
Best Param: {'alpha': 0.001}


In [None]:
# Take the Ridge model with the best alpha found by the grid search.
# NOTE(review): GridSearchCV refits the best estimator on the whole training
# data by default (refit=True), so this extra fit looks redundant — confirm.
rd=results.best_estimator_
rd.fit(xtr1,ytr)

Ridge(alpha=0.001)

In [None]:
# R^2 of the tuned Ridge model on each split.
for label, feats, target in (("R^2 of train data is : ", xtr1, ytr),
                             ("R^2 of test data is : ", xte1, yte),
                             ("R^2 of validation data is : ", xcv1, ycv)):
    print(label, rd.score(feats, target))

R^2 of train data is :  0.4437615673928089
R^2 of test data is :  0.40039074503627026
R^2 of validation data is :  0.4964776057004572


In [22]:
# for missing diameter values
# Re-binds `rd` to a Ridge model fitted on all labelled rows.
rd = Ridge(alpha=0.001).fit(x_tr1, y_train)
print("R^2 value is : ", rd.score(x_tr1, y_train))

R^2 value is :  0.4394493529336466


* Lasso Regression

In [None]:
# Tune the Lasso penalty strength with 10-fold CV repeated 3 times (fixed seed).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = {'alpha': [0.001,0.01,0.1,1,10]}
model = Lasso()
search = GridSearchCV(model, grid, scoring='r2', cv=cv, n_jobs=-1)
# Run the search on the training split (moid removed) and report the winner.
results = search.fit(xtr1, ytr)
print('R Squared : %.3f' % results.best_score_)
print('Best Params : %s' % results.best_params_)

R Squared : 0.412
Best Params : {'alpha': 0.1}


In [None]:
# Best Lasso from the search, fitted on the training split and scored on every split.
ls = results.best_estimator_.fit(xtr1, ytr)
for label, feats, target in (("R^2 of train data is : ", xtr1, ytr),
                             ("R^2 of test data is : ", xte1, yte),
                             ("R^2 of validation data is : ", xcv1, ycv)):
    print(label, ls.score(feats, target))

R^2 of train data is :  0.4342583181781601
R^2 of test data is :  0.39384082265738307
R^2 of validation data is :  0.4883625213478223


In [23]:
# for missing diameter values
# Re-binds `ls` to a Lasso model fitted on all labelled rows.
ls = Lasso(alpha=0.1).fit(x_tr1, y_train)
print("R^2 value is : ", ls.score(x_tr1, y_train))

R^2 value is :  0.42916500696301507


* Elastic Net

In [None]:
# Grid search over ElasticNet penalty strength, L1/L2 mix and iteration cap.
# define model
model = ElasticNet()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = [0.01,0.1,1,10]
grid['max_iter']=[100,250,500]
# arange(0, 1, 0.1) yields 0.0 .. 0.9, so l1_ratio=0 (pure L2 penalty) is part of the grid
# NOTE(review): the truncated warning text under this cell looks like a
# coordinate-descent convergence warning — confirm, and consider a larger max_iter.
grid['l1_ratio']=arange(0, 1, 0.1)
search = GridSearchCV(model, grid, scoring='r2', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(xtr1, ytr)
# summarize
print('R Squared : %.3f' % results.best_score_)
print('Best Params : %s' % results.best_params_)

R Squared : 0.419
Best Params : {'alpha': 0.1, 'l1_ratio': 0.0, 'max_iter': 100}


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [None]:
# Best ElasticNet from the search, fitted on the training split and scored on every split.
el = results.best_estimator_.fit(xtr1, ytr)
for label, feats, target in (("R^2 of train data is : ", xtr1, ytr),
                             ("R^2 of test data is : ", xte1, yte),
                             ("R^2 of validation data is : ", xcv1, ycv)):
    print(label, el.score(feats, target))

R^2 of train data is :  0.43317548101820436
R^2 of test data is :  0.38968933506934433
R^2 of validation data is :  0.48289225079795317


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [24]:
# for missing diameter values
# Re-binds `el` to an ElasticNet model fitted on all labelled rows.
el = ElasticNet(alpha= 0.1, l1_ratio= 0.0, max_iter= 100).fit(x_tr1, y_train)
print("R^2 value is : ", el.score(x_tr1, y_train))

R^2 value is :  0.427357544016027


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


* Adaboost Regressor

In [None]:
from sklearn.model_selection import RepeatedKFold,KFold,cross_val_score
from sklearn.ensemble import AdaBoostRegressor
# Search the number of estimators and learning rate with 10-fold CV on the
# full nine-column training matrix (moid included).
# NOTE(review): no random_state is set on the regressor, so scores may differ between runs.
ada = AdaBoostRegressor()
grid = {
    'n_estimators': [100, 250, 500],
    'learning_rate': [0.0001, 0.001],
}
grid_search = GridSearchCV(estimator=ada, param_grid=grid, n_jobs=-1, cv=10, scoring='r2',verbose=2)
grid_result = grid_search.fit(xtr, ytr)
# Best mean CV score and the configuration that produced it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best: 0.835125 using {'learning_rate': 0.001, 'n_estimators': 500}


In [None]:
# Best AdaBoost model from the search, fitted on the training split and scored on every split.
ada = grid_result.best_estimator_.fit(xtr, ytr)
for label, feats, target in (("R^2 of train model is : ", xtr, ytr),
                             ("R^2 of test model is : ", xte, yte),
                             ("R^2 of validation model is : ", xcv, ycv)):
    print(label, ada.score(feats, target))

R^2 of train model is :  0.879941896541697
R^2 of test model is :  0.8351957465370801
R^2 of validation model is :  0.8647748518272709


In [None]:
# Sanity check: the tree models train on the full nine-column matrix.
xtr.shape

(73796, 9)

In [25]:
# for missing diameter values
from sklearn.ensemble import AdaBoostRegressor
# Re-binds `ada` to an AdaBoost model fitted on all labelled rows.
ada = AdaBoostRegressor(learning_rate= 0.001, n_estimators= 500).fit(x_train, y_train)
print("R^2 value is : ", ada.score(x_train, y_train))

R^2 value is :  0.8678820798050715


* Random Forest Regressor

In [None]:
# Grid over tree depth and forest size for the random forest.
param_grid = {
    'max_depth': [3,5],
    'n_estimators': [100,250,500]
}
# Create a based model
# NOTE(review): no random_state is set, so scores may differ between runs.
rf = RandomForestRegressor()
# Instantiate the grid search model (10-fold CV, R^2 scoring, all cores)
grid_search = GridSearchCV(estimator = rf,scoring="r2", param_grid = param_grid, 
                          cv = 10 , n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(xtr,ytr)
print('R Squared : %.3f' % grid_search.best_score_)
print('Best Params : %s' % grid_search.best_params_)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
R Squared : 0.940
Best Params : {'max_depth': 5, 'n_estimators': 500}


In [None]:
# Best random forest from the search, fitted on the training split and scored on every split.
rf = grid_search.best_estimator_.fit(xtr, ytr)
for label, feats, target in (("R^2 of train model is : ", xtr, ytr),
                             ("R^2 of test model is : ", xte, yte),
                             ("R^2 of validation model is : ", xcv, ycv)):
    print(label, rf.score(feats, target))

R^2 of train model is :  0.9686573148243734
R^2 of test model is :  0.9330159393661889
R^2 of validation model is :  0.9675411461195876


In [26]:
# for missing diameter values
# Re-binds `rf` to a random forest fitted on all labelled rows.
rf = RandomForestRegressor(max_depth=5,n_estimators=500).fit(x_train, y_train)
print("R^2 value is : ", rf.score(x_train, y_train))

R^2 value is :  0.969346040128229


#### XGBoost Regressor

In [None]:
# Grid over tree depth and number of boosting rounds for XGBoost.
param_grid = {
    'max_depth': [3,5],
    'n_estimators': [100,250,500]
}
# Create a based model
xgb = XGBRegressor()
# Instantiate the grid search model (10-fold CV, R^2 scoring, all cores)
grid_search = GridSearchCV(estimator = xgb,scoring="r2", param_grid = param_grid, 
                          cv = 10 , n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(xtr,ytr)
print('R Squared : %.3f' % grid_search.best_score_)
print('Best Params : %s' % grid_search.best_params_)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
R Squared : 0.954
Best Params : {'max_depth': 3, 'n_estimators': 500}


In [None]:
# Best XGBoost model from the search, fitted on the training split and scored on every split.
xgb = grid_search.best_estimator_.fit(xtr, ytr)
for label, feats, target in (("R^2 of train data is : ", xtr, ytr),
                             ("R^2 of test data is : ", xte, yte),
                             ("R^2 of validation data is : ", xcv, ycv)):
    print(label, xgb.score(feats, target))

R^2 of train data is :  0.9944674393109414
R^2 of test data is :  0.9496517986487836
R^2 of validation data is :  0.9839819565400414


In [27]:
# for missing diameter values
# Re-binds `xgb` to an XGBoost model fitted on all labelled rows.
xgb = XGBRegressor(max_depth=3,n_estimators=500).fit(x_train, y_train)
print("R^2 of train model is : ", xgb.score(x_train, y_train))

R^2 of train model is :  0.9954176833142238


## Negative Mean Absolute Error

* This is the negative value of mean absolute error.
* The best value is 0.
* Value closer to 0 indicates less error.
* Value farther from 0 indicates higher error.   

In [None]:
# Using the previously trained LR Model
# NOTE(review): `lr` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_tr1/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers,
# not the one fitted on xtr1 — confirm which model is intended.
trpred=lr.predict(xtr1)
tepred=lr.predict(xte1)
cvpred=lr.predict(xcv1)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -2.7245993242050397
Negative Mean Absolute Error of Test Data -2.729776323479229
Negative Mean Absolute Error of Validation Data -2.722722040864397


In [28]:
# Using the previously trained LR Model
# Error on the labelled rows the model was fitted on. tepred holds the
# predictions for the rows with a missing diameter and is not scored.
trpred=lr.predict(x_tr1)
tepred=lr.predict(x_te1)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -2.6154995998283495


#### Ridge Regression

In [None]:
# Using the previously trained Ridge Model
# NOTE(review): `rd` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_tr1/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers —
# confirm which model is intended.
trpred=rd.predict(xtr1)
tepred=rd.predict(xte1)
cvpred=rd.predict(xcv1)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -2.7245990642839795
Negative Mean Absolute Error of Test Data -2.7297760354204788
Negative Mean Absolute Error of Validation Data -2.722721765569699


In [29]:
# Using the previously trained Ridge Model
# for missing diameter values
# Error on the labelled rows the model was fitted on. tepred holds the
# predictions for the rows with a missing diameter and is not scored.
trpred=rd.predict(x_tr1)
tepred=rd.predict(x_te1)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -2.615499444077749


#### Lasso Regression

In [None]:
# Using Previously trained Lasso Model
# NOTE(review): `ls` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_tr1/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers —
# confirm which model is intended.
trpred=ls.predict(xtr1)
tepred=ls.predict(xte1)
cvpred=ls.predict(xcv1)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -2.598078900715931
Negative Mean Absolute Error of Test Data -2.596430925377259
Negative Mean Absolute Error of Validation Data -2.5887777551964737


In [30]:
# Using Previously trained Lasso Model
# for missing diameter values
# Error on the labelled rows the model was fitted on. tepred holds the
# predictions for the rows with a missing diameter and is not scored.
trpred=ls.predict(x_tr1)
tepred=ls.predict(x_te1)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -2.4857373518807693


#### ElasticNet Regression

In [None]:
# Using Previously trained Enet Model
# NOTE(review): `el` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_tr1/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers —
# confirm which model is intended.
trpred=el.predict(xtr1)
tepred=el.predict(xte1)
cvpred=el.predict(xcv1)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -2.5238968630157887
Negative Mean Absolute Error of Test Data -2.5202216921853493
Negative Mean Absolute Error of Validation Data -2.5194974727749058


In [31]:
# Using Previously trained Enet Model
# for missing diameter values
# Error on the labelled rows the model was fitted on. tepred holds the
# predictions for the rows with a missing diameter and is not scored.
trpred=el.predict(x_tr1)
tepred=el.predict(x_te1)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -2.4029816355595863


#### AdaBoost Regressor

In [None]:
# Using Previously trained AdaBoost Model
# NOTE(review): `ada` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_train/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers —
# confirm which model is intended.
trpred=ada.predict(xtr)
tepred=ada.predict(xte)
cvpred=ada.predict(xcv)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -1.7828337827086074
Negative Mean Absolute Error of Test Data -1.8054492244950557
Negative Mean Absolute Error of Validation Data -1.7975714407365178


In [32]:
# Using Previously trained AdaBoost Model
# for missing diameter values
# Error on the labelled rows the model was fitted on. The prediction for the
# rows with a missing diameter is left commented out here, so any tepred in
# scope still comes from an earlier cell.
trpred=ada.predict(x_train)
#tepred=ada.predict(x_test)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -1.7959175559182532


#### Random Forest Regressor

In [None]:
# Using Previously Trained Random Forest Model
# NOTE(review): `rf` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_train/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers —
# confirm which model is intended.
trpred=rf.predict(xtr)
tepred=rf.predict(xte)
cvpred=rf.predict(xcv)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -0.9117090614476029
Negative Mean Absolute Error of Test Data -0.920098430760104
Negative Mean Absolute Error of Validation Data -0.9172345556942838


In [33]:
# Using Previously Trained Random Forest Model
# for diameter missing values
# Error on the labelled rows the model was fitted on. tepred holds the
# predictions for the rows with a missing diameter and is not scored.
trpred=rf.predict(x_train)
tepred=rf.predict(x_test)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -0.9129400066668372


#### XGBoost Regressor

In [None]:
# Using Previously Trained XGBoost Model
# NOTE(review): `xgb` is re-bound by the "for missing diameter values" cell to a
# model fitted on x_train/y_train. When the notebook runs top to bottom, that
# model (which has seen the test and validation rows) produces these numbers —
# confirm which model is intended.
trpred=xgb.predict(xtr)
tepred=xgb.predict(xte)
cvpred=xgb.predict(xcv)
print("Negative Mean Absolute Error of Train Data",-(mean_absolute_error(ytr, trpred)))
print("Negative Mean Absolute Error of Test Data",-(mean_absolute_error(yte, tepred)))
print("Negative Mean Absolute Error of Validation Data",-(mean_absolute_error(ycv, cvpred)))

Negative Mean Absolute Error of Train Data -0.3807907607597713
Negative Mean Absolute Error of Test Data -0.42141800896918613
Negative Mean Absolute Error of Validation Data -0.4181520152589881


In [34]:
# Using Previously Trained XGBoost Model
# for diameter missing values
# Error on the labelled rows the model was fitted on. tepred holds the
# predictions for the rows with a missing diameter and is not scored.
trpred=xgb.predict(x_train)
tepred=xgb.predict(x_test)
print("Negative Mean Absolute Error is",-(mean_absolute_error(y_train, trpred)))

Negative Mean Absolute Error is -0.3630875416468981


### Pretty Table Observations

#### For R Squared Score

In [None]:
from prettytable import PrettyTable
# Summary of the R^2 scores printed by the model cells above (train / test /
# validation). The values are copied by hand, so they must be updated whenever
# the models are re-run.
tab = PrettyTable(["Sr No", "Model", "Best Hyperparameters","R Squared Scores of all data"])
# Add rows
tab.add_row(["1.", "Linear Regression", "-","1.Train Data :  0.444 \n 2.Test Data : 0.400 \n 3.Validation Data :  0.497\n"])
tab.add_row(["2.", "Ridge Regression", "{'alpha' : 0.001}","1.Train Data :  0.444 \n 2.Test Data : 0.400 \n 3.Validation Data :  0.497\n"])
tab.add_row(["3.", "Lasso Regression", "{'alpha' : 0.1}","1.Train Data :  0.434 \n 2.Test Data : 0.394 \n 3.Validation Data :  0.488\n"])
tab.add_row(["4.", "ElasticNet Regression", "{'alpha' : 0.1,'l1_ratio': 0.0, 'max_iter': 100}","1.Train Data :  0.433 \n 2.Test Data : 0.390 \n 3.Validation Data :  0.483\n"])
# AdaBoost train score rounded from the printed 0.8799.
tab.add_row(["5.","Adaboost Regressor","{'learning_rate': 0.001, 'n_estimators': 500}","1.Train Data :  0.880 \n 2.Test Data : 0.835 \n 3.Validation Data :  0.865\n"])
# The random forest grid search selected n_estimators=500 (not 100); the
# scores are rounded from the values printed by the random forest cell.
tab.add_row(["6.","Random Forest Regressor","{'max_depth': 5, 'n_estimators': 500}","1.Train Data :  0.969 \n 2.Test Data : 0.933 \n 3.Validation Data :  0.968\n"])
tab.add_row(["7.","XGBoost Regressor","{'max_depth': 3, 'n_estimators': 500}","1.Train Data :  0.995 \n 2.Test Data : 0.950 \n 3.Validation Data :  0.984\n"])
print(tab)

+-------+-------------------------+--------------------------------------------------+------------------------------+
| Sr No |          Model          |               Best Hyperparameters               | R Squared Scores of all data |
+-------+-------------------------+--------------------------------------------------+------------------------------+
|   1.  |    Linear Regression    |                        -                         |    1.Train Data :  0.444     |
|       |                         |                                                  |     2.Test Data : 0.400      |
|       |                         |                                                  |  3.Validation Data :  0.497  |
|       |                         |                                                  |                              |
|   2.  |     Ridge Regression    |                {'alpha' : 0.001}                 |    1.Train Data :  0.444     |
|       |                         |                     

#### For Missing Diameter values as Test Data

In [37]:
from prettytable import PrettyTable

# Summary table: one R squared value per model for the "missing diameter
# values as test data" case. The scores and hyperparameters are hard-coded
# strings; nothing is computed in this cell.
r2_missing_rows = [
    ["1.", "Linear Regression", "-", "0.440"],
    ["2.", "Ridge Regression", "{'alpha' : 0.001}", "0.440"],
    ["3.", "Lasso Regression", "{'alpha' : 0.1}", "0.429"],
    ["4.", "ElasticNet Regression", "{'alpha' : 0.1,'l1_ratio': 0.0, 'max_iter': 100}", "0.427"],
    ["5.", "Adaboost Regressor", "{'learning_rate': 0.001, 'n_estimators': 500}", "0.868"],
    ["6.", "Random Forest Regressor", "{'max_depth': 5, 'n_estimators': 100}", "0.969"],
    ["7.", "XGBoost Regressor", "{'max_depth': 3, 'n_estimators': 500}", "0.995"],
]

tab = PrettyTable(["Sr No", "Model", "Best Hyperparameters", "R Squared Value"])
# Rows are added in listing order, so the printed table keeps the 1..7 sequence.
for r2_missing_row in r2_missing_rows:
    tab.add_row(r2_missing_row)
print(tab)

+-------+-------------------------+--------------------------------------------------+-----------------+
| Sr No |          Model          |               Best Hyperparameters               | R Squared Value |
+-------+-------------------------+--------------------------------------------------+-----------------+
|   1.  |    Linear Regression    |                        -                         |      0.440      |
|   2.  |     Ridge Regression    |                {'alpha' : 0.001}                 |      0.440      |
|   3.  |     Lasso Regression    |                 {'alpha' : 0.1}                  |      0.429      |
|   4.  |  ElasticNet Regression  | {'alpha' : 0.1,'l1_ratio': 0.0, 'max_iter': 100} |      0.427      |
|   5.  |    Adaboost Regressor   |  {'learning_rate': 0.001, 'n_estimators': 500}   |      0.868      |
|   6.  | Random Forest Regressor |      {'max_depth': 5, 'n_estimators': 100}       |      0.969      |
|   7.  |    XGBoost Regressor    |      {'max_depth': 

#### Negative Mean Absolute Error

In [None]:
from prettytable import PrettyTable

# Summary table: negative mean absolute error of every model on the train,
# test and validation data. The scores and hyperparameters are hard-coded
# strings; nothing is computed in this cell.
# The last column header previously read "Negative Absolute Error of all Data";
# it now names the metric in full, matching the section heading and the
# missing-diameter table.
neg_mae_all_rows = [
    ["1.", "Linear Regression", "-",
     "1.Train Data :  -2.725 \n 2.Test Data : -2.723 \n 3.Validation Data :  -2.723\n"],
    ["2.", "Ridge Regression", "{'alpha' : 0.001}",
     "1.Train Data :  -2.725 \n 2.Test Data : -2.723 \n 3.Validation Data :  -2.723\n"],
    ["3.", "Lasso Regression", "{'alpha' : 0.1}",
     "1.Train Data :  -2.598 \n 2.Test Data : -2.596 \n 3.Validation Data :  -2.589\n"],
    ["4.", "ElasticNet Regression", "{'alpha' : 0.1,'l1_ratio': 0.0, 'max_iter': 100}",
     "1.Train Data :  -2.524 \n 2.Test Data : -2.520 \n 3.Validation Data :  -2.520\n"],
    ["5.", "Adaboost Regressor", "{'learning_rate': 0.001, 'n_estimators': 500}",
     "1.Train Data :  -1.783 \n 2.Test Data : -1.806 \n 3.Validation Data :  -1.798\n"],
    ["6.", "Random Forest Regressor", "{'max_depth': 5, 'n_estimators': 100}",
     "1.Train Data :  -0.912 \n 2.Test Data : -0.920 \n 3.Validation Data :  -0.917\n"],
    ["7.", "XGBoost Regressor", "{'max_depth': 3, 'n_estimators': 500}",
     "1.Train Data :  -0.381 \n 2.Test Data : -0.421 \n 3.Validation Data :  -0.418\n"],
]

tab = PrettyTable(["Sr No", "Model", "Best Hyperparameters", "Negative Mean Absolute Error of all Data"])
# Each score string carries embedded newlines, so one model spans several
# printed lines inside its table cell.
for neg_mae_all_row in neg_mae_all_rows:
    tab.add_row(neg_mae_all_row)
print(tab)

+-------+-------------------------+--------------------------------------------------+-------------------------------------+
| Sr No |          Model          |               Best Hyperparameters               | Negative Absolute Error of all Data |
+-------+-------------------------+--------------------------------------------------+-------------------------------------+
|   1.  |    Linear Regression    |                        -                         |       1.Train Data :  -2.725        |
|       |                         |                                                  |         2.Test Data : -2.723        |
|       |                         |                                                  |      3.Validation Data :  -2.723    |
|       |                         |                                                  |                                     |
|   2.  |     Ridge Regression    |                {'alpha' : 0.001}                 |       1.Train Data :  -2.725        |


#### For Missing Diameter values as Test Data

In [36]:
from prettytable import PrettyTable

# Summary table: negative mean absolute error per model for the "missing
# diameter values as test data" case. Each record is
# (model, best hyperparameters, score); all values are hard-coded strings.
neg_mae_missing_records = [
    ("Linear Regression", "-", "-2.616"),
    ("Ridge Regression", "{'alpha' : 0.001}", "-2.616"),
    ("Lasso Regression", "{'alpha' : 0.1}", "-2.486"),
    ("ElasticNet Regression", "{'alpha' : 0.1,'l1_ratio': 0.0, 'max_iter': 100}", "-2.403"),
    ("Adaboost Regressor", "{'learning_rate': 0.001, 'n_estimators': 500}", "-1.80"),
    ("Random Forest Regressor", "{'max_depth': 5, 'n_estimators': 100}", "-0.913"),
    ("XGBoost Regressor", "{'max_depth': 3, 'n_estimators': 500}", "-0.363"),
]

tab = PrettyTable(["Sr No", "Model", "Best Hyperparameters", "Negative Mean Absolute Error"])
# The serial number column ("1.", "2.", ...) is derived from the record position.
for sr_no, (model_label, hyperparams_text, neg_mae_text) in enumerate(neg_mae_missing_records, start=1):
    tab.add_row([f"{sr_no}.", model_label, hyperparams_text, neg_mae_text])
print(tab)

+-------+------------------------------------------+--------------------------------------------------+------------------------------+
| Sr No |                  Model                   |               Best Hyperparameters               | Negative Mean Absolute Error |
+-------+------------------------------------------+--------------------------------------------------+------------------------------+
|  **** | For Missing Diameter values as Test Data |                       ****                       |             ----             |
|   1.  |            Linear Regression             |                        -                         |            -2.616            |
|   2.  |             Ridge Regression             |                {'alpha' : 0.001}                 |            -2.616            |
|   3.  |             Lasso Regression             |                 {'alpha' : 0.1}                  |            -2.486            |
|   4.  |          ElasticNet Regression           | {'

### Conclusion:

<b>For R squared:</b>
1. After training many models, it can be observed that the XGBoost Regressor performs best out of all models.

2. Lasso, Ridge and ElasticNet Regression have almost the same performance.

3. Out of all Models, Linear Regression performs poorly.

4. The R squared score (coefficient of determination) is taken as the performance metric in this regression problem.

5. All of the ensembles perform very well.

6. A higher R squared value indicates a smaller difference between the predicted values and the actual values, and hence represents a better model.



<b> For Negative Mean Absolute Error : </b>

1. After training many models, it can be observed that the XGBoost Regressor performs best out of all models in terms of negative mean absolute error.

2. Lasso performs better than Ridge, whereas ElasticNet gives the best performance out of all the linear regression models.

3. Negative Mean Absolute Error is taken as the performance metric in this regression problem.

4. All of the ensembles perform very well.

5. A higher Negative Mean Absolute Error (i.e. a value closer to zero) indicates a smaller difference between the predicted values and the actual values, and hence represents a better model.


In [39]:
import pickle

# Persist `enc` and `xgb` (both defined in earlier cells) to the working
# directory with pickle.
# NOTE(review): going by the file names, `enc` is presumably the fitted
# one-hot encoder and `xgb` the trained XGBoost model -- confirm against the
# cells that create them.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, rather than being left open until garbage
# collection.
with open("One-Hot-Encoder.sav", "wb") as encoder_file:
    pickle.dump(enc, encoder_file)

with open("XGB_Updated_with_137681_pts.pkl", "wb") as model_file:
    pickle.dump(xgb, model_file)