In [None]:
!pip install xgboost==1.5.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost==1.5.2
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
[K     |████████████████████████████████| 173.6 MB 10 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.5.2


In [None]:
import pandas as pd 
import numpy as np
import seaborn as sb
from scipy import stats
import matplotlib.pyplot as plt
import math
from xgboost import XGBRegressor,XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# The cleaned asteroid dataset is stored on Google Drive; fetch it by file id
# into the current working directory (saved as Cleaned_Asteroid.csv).
!gdown --id 1-afEh_SyLdQh1tFH3mlcCgXLPj4FWSKm

Downloading...
From: https://drive.google.com/uc?id=1-afEh_SyLdQh1tFH3mlcCgXLPj4FWSKm
To: /content/Cleaned_Asteroid.csv
100% 174M/174M [00:01<00:00, 129MB/s]


In [None]:
# Load the downloaded CSV into the working DataFrame.
df=pd.read_csv("Cleaned_Asteroid.csv")

In [None]:
# Force explicit dtypes: diameter as float, condition_code as int.
df=df.astype({'diameter':float,'condition_code':int})
# Preview the first rows to confirm the load and the casts.
df.head(5)

Unnamed: 0,full_name,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,diameter,albedo,neo,pha,moid,diam_bin
0,1 Ceres,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0,1002,3.34,939.4,0.09,N,N,1.59478,Very Large
1,2 Pallas,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,0,8490,4.13,545.0,0.101,N,N,1.23324,Very Large
2,3 Juno,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,72684.0,0,7104,5.33,246.596,0.214,N,N,1.03454,Large
3,4 Vesta,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,0,9325,3.2,525.4,0.4228,N,N,1.13948,Very Large
4,5 Astraea,2.574249,0.191095,5.366988,141.576604,358.687608,2.082324,3.066174,4.130323,63431.0,0,2861,6.85,106.699,0.274,N,N,1.09589,Large


### Feature Selection

#### We use the correlation matrix to select the most important features

In [None]:
# Restrict to numeric columns before correlating: full_name, neo, pha and
# diam_bin hold strings, and DataFrame.corr() raises on such columns in
# pandas >= 2.0 (older versions silently dropped them).
corr = df.select_dtypes(include='number').corr()
# Rank every numeric feature by its absolute correlation with the target.
corr['diameter'].abs().sort_values(ascending=False)

diameter          1.000000
H                 0.566501
data_arc          0.492110
n_obs_used        0.386038
moid              0.332416
q                 0.329698
a                 0.144748
albedo            0.106077
ad                0.093440
condition_code    0.073546
i                 0.052540
e                 0.049107
per_y             0.048955
w                 0.002980
om                0.001155
Name: diameter, dtype: float64

From the correlation matrix, H, data_arc, n_obs_used, moid, q, a and albedo have the highest absolute correlation with diameter.

Select the features with higher absolute correlation value.

In [None]:
# Keep the strongest numeric correlates of diameter, the two categorical
# flags (neo, pha) and the target column itself.
selected_columns = [
    'H', 'data_arc', 'n_obs_used', 'moid', 'q', 'a', 'albedo',
    'neo', 'pha', 'diameter',
]
df = df.loc[:, selected_columns]

Unnamed: 0,H,data_arc,n_obs_used,moid,q,a,albedo,neo,pha,diameter
0,3.34,8822.0,1002,1.59478,2.558684,2.769165,0.09,0,0,939.4
1,4.13,72318.0,8490,1.23324,2.133865,2.772466,0.101,0,0,545.0
2,5.33,72684.0,7104,1.03454,1.983332,2.66915,0.214,0,0,246.596
3,3.2,24288.0,9325,1.13948,2.151909,2.361418,0.4228,0,0,525.4
4,6.85,63431.0,2861,1.09589,2.082324,2.574249,0.274,0,0,106.699


## Splitting the dataset

* Since `diameter` has missing values, only rows with a known diameter can be used to fit and score the model, so rows with a missing diameter are dropped before the data is split into train and test sets.

* We can also perform Validation Split for better convenience

In [None]:
# Keep only the rows in which every selected column (including diameter) is
# present; the original frame `df` is left untouched.
df1 = df.dropna()

# Separate the regression target from the feature columns.
y = df1["diameter"]
x = df1.drop(columns="diameter")

# Shapes of the feature matrix and target before any splitting.
print(x.shape,y.shape)

(137681, 9) (137681,)


In [None]:
def get_gen_grp(diam):
    """Map a diameter value to a coarse size class.

    Returns 'Small' for values up to 10, 'Medium' up to 100, 'Large' up to
    500 and 'Very Large' above 500. A value that satisfies none of the
    comparisons (e.g. NaN) yields 'Missing'.
    """
    # Inclusive upper bounds, checked from the smallest class upwards.
    for upper_bound, label in ((10, 'Small'), (100, 'Medium'), (500, 'Large')):
        if diam <= upper_bound:
            return label

    # NaN compares False against everything, so it falls through to 'Missing'.
    return 'Very Large' if diam > 500 else 'Missing'

In [None]:
# Size class of every target value; used only to stratify the splits.
y_binned = y.map(get_gen_grp)

In [None]:
# Stratify on the diameter size class so that each split keeps the same mix
# of size groups. First hold out 33% of the rows as the test set.
xtr, xte, ytr, yte = train_test_split(
    x, y,
    test_size=0.33,
    stratify=y_binned,
    random_state=42,
)

# Re-bin the remaining targets, then carve 20% of them off for validation.
y_binned = ytr.apply(get_gen_grp)
xtr, xcv, ytr, ycv = train_test_split(
    xtr, ytr,
    test_size=0.2,
    stratify=y_binned,
    random_state=42,
)

In [None]:
# Report the size of each split (rows, feature columns).
for split_label, split_frame in (
    ("Train data : ", xtr),
    ("Test data : ", xte),
    ("Validation Data : ", xcv),
):
    print(split_label, split_frame.shape)

Train data :  (73796, 9)
Test data :  (45435, 9)
Validation Data :  (18450, 9)


#### Train Data

In [None]:
# NEO
# One-hot encode the neo flag. The category set is taken from the training
# split and reused for the test and validation splits, so every split yields
# the same single indicator column. Encoding each split independently with
# drop_first=True would produce zero columns for any split that happens to
# contain only one class, which breaks the later column assignment.
neo_dtype = pd.CategoricalDtype(sorted(xtr['neo'].unique()))
xtr_neo_encode = pd.get_dummies(xtr['neo'].astype(neo_dtype), drop_first=True)
xte_neo_encode = pd.get_dummies(xte['neo'].astype(neo_dtype), drop_first=True)
xcv_neo_encode = pd.get_dummies(xcv['neo'].astype(neo_dtype), drop_first=True)

In [None]:
# PHA
# One-hot encode the pha flag. The category set is taken from the training
# split and reused for the test and validation splits, so every split yields
# the same single indicator column. Encoding each split independently with
# drop_first=True would produce zero columns for any split that happens to
# contain only one class, which breaks the later column assignment.
pha_dtype = pd.CategoricalDtype(sorted(xtr['pha'].unique()))
xtr_pha_encode = pd.get_dummies(xtr['pha'].astype(pha_dtype), drop_first=True)
xte_pha_encode = pd.get_dummies(xte['pha'].astype(pha_dtype), drop_first=True)
xcv_pha_encode = pd.get_dummies(xcv['pha'].astype(pha_dtype), drop_first=True)

In [None]:
# Scaler used to standardize the numeric features; it is fitted below on the
# training split only and then reused for the test and validation splits.
sd=StandardScaler()

In [None]:
# Numeric columns to standardize; this is also the column order the fitted
# scaler expects from now on.
scaled_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']

# Learn the scaling statistics from the training split and scale it in one step.
xtr_norm = pd.DataFrame(data=sd.fit_transform(xtr[scaled_cols]), columns=scaled_cols)
xtr_norm.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid
0,0.194575,-0.228851,-0.20396,-0.188852,-0.486813,-0.86697,-0.264413
1,0.192109,0.452161,2.454209,1.508338,-0.980177,-0.500787,0.426278
2,-0.092617,-0.217243,-0.143081,-0.963731,1.345683,-0.665569,-0.243129
3,0.230832,0.08436,-0.105456,-0.152851,-0.980177,-0.491632,0.240638
4,0.115235,0.386575,-0.47073,-0.279712,-0.204891,0.021025,0.413315


#### Test Data

In [None]:
# Scale the test split with the statistics learned from the training split.
scaled_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']
xte_norm = pd.DataFrame(data=sd.transform(xte[scaled_cols]), columns=scaled_cols)
xte_norm.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid
0,-0.33798,-1.015029,-0.275336,-0.608864,0.9228,1.678005,-1.07786
1,0.129509,0.16805,-0.270976,-0.245425,0.006551,-0.52825,0.176767
2,0.025601,-0.267443,-1.440111,-1.106021,1.134242,-0.64726,-0.251257
3,0.107837,1.011119,-0.256604,0.627171,-0.909697,0.387208,0.987456
4,0.185676,-0.195878,-0.396771,0.210587,-0.557294,-0.537405,-0.166623


#### Validation Data

In [None]:
# Scale the validation split with the statistics learned from the training split.
scaled_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']
xcv_norm = pd.DataFrame(data=sd.transform(xcv[scaled_cols]), columns=scaled_cols)
xcv_norm.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid
0,-0.330551,-1.131165,-0.276305,-0.535148,0.781838,-0.857815,-1.129211
1,-0.209151,-0.150559,-0.403715,-0.680866,0.852319,-0.619796,-0.174578
2,-0.340125,-0.872112,2.637492,2.593512,-0.627774,1.760396,-0.918899
3,-0.387755,-0.462265,0.953549,1.023182,-0.13441,3.609622,-0.469835
4,-0.163669,-0.489291,-0.10917,0.226016,0.147513,1.320976,-0.484627


In [None]:
# Final training matrix: the scaled numeric columns plus the encoded flags.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when columns are added to a df[[...]] slice. The encoded values are attached
# by position (the scaled frame has a fresh 0..n-1 index), so they are
# flattened to 1-D arrays first.
xtr = xtr_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=xtr_neo_encode.values.ravel(),
    pha=xtr_pha_encode.values.ravel(),
)
xtr.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,0.194575,-0.228851,-0.20396,-0.188852,-0.486813,-0.86697,-0.264413,0,0
1,0.192109,0.452161,2.454209,1.508338,-0.980177,-0.500787,0.426278,0,0
2,-0.092617,-0.217243,-0.143081,-0.963731,1.345683,-0.665569,-0.243129,0,0
3,0.230832,0.08436,-0.105456,-0.152851,-0.980177,-0.491632,0.240638,0,0
4,0.115235,0.386575,-0.47073,-0.279712,-0.204891,0.021025,0.413315,0,0


In [None]:
# Final test matrix: the scaled numeric columns plus the encoded flags.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when columns are added to a df[[...]] slice. The encoded values are attached
# by position (the scaled frame has a fresh 0..n-1 index), so they are
# flattened to 1-D arrays first.
xte = xte_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=xte_neo_encode.values.ravel(),
    pha=xte_pha_encode.values.ravel(),
)
xte.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,-0.33798,-1.015029,-0.275336,-0.608864,0.9228,1.678005,-1.07786,0,0
1,0.129509,0.16805,-0.270976,-0.245425,0.006551,-0.52825,0.176767,0,0
2,0.025601,-0.267443,-1.440111,-1.106021,1.134242,-0.64726,-0.251257,0,0
3,0.107837,1.011119,-0.256604,0.627171,-0.909697,0.387208,0.987456,0,0
4,0.185676,-0.195878,-0.396771,0.210587,-0.557294,-0.537405,-0.166623,0,0


In [None]:
# Final validation matrix: the scaled numeric columns plus the encoded flags.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when columns are added to a df[[...]] slice. The encoded values are attached
# by position (the scaled frame has a fresh 0..n-1 index), so they are
# flattened to 1-D arrays first.
xcv = xcv_norm[['a','q','data_arc','n_obs_used','H','albedo','moid']].assign(
    neo=xcv_neo_encode.values.ravel(),
    pha=xcv_pha_encode.values.ravel(),
)
xcv.head(5)

Unnamed: 0,a,q,data_arc,n_obs_used,H,albedo,moid,neo,pha
0,-0.330551,-1.131165,-0.276305,-0.535148,0.781838,-0.857815,-1.129211,0,0
1,-0.209151,-0.150559,-0.403715,-0.680866,0.852319,-0.619796,-0.174578,0,0
2,-0.340125,-0.872112,2.637492,2.593512,-0.627774,1.760396,-0.918899,0,0
3,-0.387755,-0.462265,0.953549,1.023182,-0.13441,3.609622,-0.469835,0,0
4,-0.163669,-0.489291,-0.10917,0.226016,0.147513,1.320976,-0.484627,0,0


In [None]:
# Sanity check: row count and number of feature columns of the training matrix.
xtr.shape

(73796, 9)

In [None]:
# First regressor: depth-3 trees, 500 boosting rounds, trained on the full
# feature matrix (scaled numeric columns plus the neo and pha indicators).
model = XGBRegressor(n_estimators=500, max_depth=3)
model.fit(xtr, ytr)

# Score the fitted model on each split, in train / test / validation order.
for score_label, features, target in (
    ("R^2 of train model is : ", xtr, ytr),
    ("R^2 of test model is : ", xte, yte),
    ("R^2 of validation model is : ", xcv, ycv),
):
    print(score_label, model.score(features, target))

R^2 of train model is :  0.9962450160010675
R^2 of test model is :  0.9428911813153186
R^2 of validation model is :  0.9832383641009411


In [None]:
import pickle

In [None]:
# Rows whose diameter exceeds 900.
df.loc[df['diameter'] > 900]

Unnamed: 0,H,data_arc,n_obs_used,moid,q,a,albedo,neo,pha,diameter
0,3.34,8822.0,1002,1.59478,2.558684,2.769165,0.09,0,0,939.4


In [None]:
# Persist the fitted model and the fitted scaler. Context managers make sure
# each file is flushed and closed; a bare open() inside pickle.dump leaves the
# handle open until it is garbage-collected.
with open("xgboost_r2.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("scaler.sav", "wb") as scaler_file:
    pickle.dump(sd, scaler_file)

In [None]:
# Second regressor: depth-5 trees, 500 boosting rounds, trained on the scaled
# numeric columns only (the *_norm frames carry no neo/pha indicators).
# Note that this rebinds `model`, replacing the regressor fitted earlier.
model = XGBRegressor(n_estimators=500, max_depth=5)
model.fit(xtr_norm, ytr)

# Score the fitted model on each split, in train / test / validation order.
for score_label, features, target in (
    ("R^2 of train model is : ", xtr_norm, ytr),
    ("R^2 of test model is : ", xte_norm, yte),
    ("R^2 of validation model is : ", xcv_norm, ycv),
):
    print(score_label, model.score(features, target))

R^2 of train model is :  0.9981450996183379
R^2 of test model is :  0.9358642808097781
R^2 of validation model is :  0.9830381885187537


In [None]:
# Persist the most recently fitted model; the context manager guarantees the
# file is flushed and closed.
with open("xgboost_nmae.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist the fitted scaler; the context manager guarantees the file is
# flushed and closed.
with open("std.sav", "wb") as scaler_file:
    pickle.dump(sd, scaler_file)

In [None]:
# Hand-made sample used to try out the scaler.
H=21      # H
data=22   # data_arc
n=34      # n_obs_used
mo=45     # moid
q=65      # q
a=67      # a
alb=27    # albedo

# The scaler works by column position on a plain array, so the values must be
# laid out in the order it was fitted on:
#   a, q, data_arc, n_obs_used, H, albedo, moid
# Building the row as [H, data, n, mo, q, a, alb] scales every value with
# another feature's mean and standard deviation.
trans=np.array([[a,q,data,n,H,alb,mo]])
print(trans)

[[21 22 34 45 65 67 27]]


In [None]:
# Force a single-row 2-D layout (1, n_features). The array was already built
# with one row, so this leaves its shape unchanged.
trans=trans.reshape(1,-1)

In [None]:
# Scale the sample with the scaler that was fitted on the training split.
# `trans` is a plain array, so the scaler applies its per-column statistics by
# position: the values must be in the order the scaler was fitted on
# (a, q, data_arc, n_obs_used, H, albedo, moid).
# NOTE(review): `trans` is overwritten with its scaled version, so re-running
# this cell scales the values a second time.
trans=sd.transform(trans).reshape(1,-1)
print(trans)

[[ 11.73326183  37.45412165  -1.43478252  -1.0545912   35.10590766
  612.16980082  49.27138478]]


  "X does not have valid feature names, but"


In [None]:
# Sanity check: the sample should be one row with seven features.
trans.shape

(1, 7)

In [None]:
# Inspect the last (seventh) value of the single sample row.
trans[0][6]

27

In [None]:
# Rows whose diameter is below 1.5.
df.loc[df['diameter'] < 1.5]

Unnamed: 0,H,data_arc,n_obs_used,moid,q,a,albedo,neo,pha,diameter
1220,17.70,31628.0,467,0.107716,1.083854,1.919416,0.078,1,0,1.000
1565,16.90,25593.0,1008,0.034275,0.186588,1.078095,0.510,1,1,1.000
1864,16.84,17538.0,1730,0.156505,0.575811,1.080048,0.220,1,0,1.200
1914,18.97,18842.0,51,0.108620,1.092382,2.543337,0.210,1,0,0.500
2061,16.80,21601.0,818,0.113146,0.790141,0.966790,0.260,1,0,1.100
...,...,...,...,...,...,...,...,...,...,...
762416,17.30,3370.0,145,0.876709,1.841139,2.741434,0.046,0,0,1.290
771469,17.50,6512.0,86,0.869589,1.801623,2.770673,0.058,0,0,1.049
786782,17.20,2214.0,69,1.149760,2.118738,3.329413,0.110,0,0,1.327
792377,17.90,5760.0,104,0.823652,1.839905,2.219953,0.096,0,0,1.419
