In [1]:
!pip install xgboost==1.5.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost==1.5.2
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
[K     |████████████████████████████████| 173.6 MB 6.6 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.5.2


In [2]:
import pickle
import pandas as pd
import numpy as np 
import xgboost
from sklearn.metrics import r2_score
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
# Already have asteroid dataset in drive. Downloading it from drive
!gdown --id 1-afEh_SyLdQh1tFH3mlcCgXLPj4FWSKm

Downloading...
From: https://drive.google.com/uc?id=1-afEh_SyLdQh1tFH3mlcCgXLPj4FWSKm
To: /content/Cleaned_Asteroid.csv
100% 174M/174M [00:02<00:00, 80.3MB/s]


In [4]:
# Read the downloaded CSV and make sure the target column `diameter` is stored
# as float, then preview the first five rows.
df = pd.read_csv("Cleaned_Asteroid.csv").astype({'diameter': float})
df.head(5)

Unnamed: 0,full_name,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,diameter,albedo,neo,pha,moid,diam_bin
0,1 Ceres,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0.0,1002,3.34,939.4,0.09,N,N,1.59478,Very Large
1,2 Pallas,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,0.0,8490,4.13,545.0,0.101,N,N,1.23324,Very Large
2,3 Juno,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,72684.0,0.0,7104,5.33,246.596,0.214,N,N,1.03454,Large
3,4 Vesta,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,0.0,9325,3.2,525.4,0.4228,N,N,1.13948,Very Large
4,5 Astraea,2.574249,0.191095,5.366988,141.576604,358.687608,2.082324,3.066174,4.130323,63431.0,0.0,2861,6.85,106.699,0.274,N,N,1.09589,Large


In [8]:
def final_pred(data, random_state=None,
               scaler_path="Scalar.sav",
               encoder_path="One-Hot-Encoder.sav",
               model_path="/content/XGB_Updated_with_137681_pts.pkl"):
  """Predict the diameter of one randomly drawn row of ``data``.

  Steps: draw a single row, scale its numeric features with the pickled
  scaler, encode the ``neo`` and ``pha`` flags with the pickled encoder, and
  feed the resulting nine-column frame to the pickled model.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain the columns ``a, q, data_arc, n_obs_used, H, albedo, moid,
    neo, pha``. A ``diameter`` column is not needed for prediction.
  random_state : optional
    Forwarded to ``DataFrame.sample``; pass a fixed value to draw the same
    row on every call. ``None`` (default) keeps the draw random.
  scaler_path, encoder_path, model_path : str
    Locations of the pickled scaler, encoder and model. The defaults are the
    paths the notebook originally hard-coded.

  Returns
  -------
  (ypr, samp)
    ``ypr`` is whatever ``model.predict`` returns for the encoded row and
    ``samp`` is the sampled one-row DataFrame, untouched.

  Notes
  -----
  * SECURITY: ``pickle.load`` can execute arbitrary code; only load
    artefacts you created or trust.
  * Per the original notebook comments, the scaler was saved with
    scikit-learn 1.0.2 and the model with xgboost 1.5.2.
  * NOTE(review): the same encoder object is applied to both ``neo`` and
    ``pha`` - confirm it was fitted for that use.
  """
  # Column order handed to the scaler; kept exactly as in the original code.
  numeric_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']

  def load_pickle(path):
    # Context manager so the file handle is closed as soon as loading ends.
    with open(path, "rb") as fh:
      return pickle.load(fh)

  def load_sample(df):
    return df.sample(n=1, axis=0, random_state=random_state)

  def encode_sample(df):
    scaler = load_pickle(scaler_path)
    scaled = scaler.transform(df[numeric_cols])
    # A fresh frame (default 0..n-1 index) receives the scaled values.
    features = pd.DataFrame(data=scaled, columns=numeric_cols)
    encoder = load_pickle(encoder_path)
    # to_numpy() yields a NumPy array whatever the column's storage dtype,
    # so the reshape into a single-column 2-D input always works.
    features['neo'] = encoder.transform(df['neo'].to_numpy().reshape(-1, 1))
    features['pha'] = encoder.transform(df['pha'].to_numpy().reshape(-1, 1))
    return features

  def model_prediction(features):
    model = load_pickle(model_path)
    return model.predict(features)

  samp = load_sample(data)
  enc_samp = encode_sample(samp)
  ypr = model_prediction(enc_samp)
  return ypr, samp

In [10]:
# Predict the diameter of one randomly chosen asteroid and print that row
# with the prediction attached as an extra column.
ypr, samp = final_pred(df)
samp.loc[:, "Predicted Diameter"] = ypr
print(samp)

                  full_name        a         e          i          om  \
737819         (2015 AL170)  2.62431  0.177854  16.547029  116.484555   

                 w         q        ad     per_y  data_arc  condition_code  \
737819  227.133879  2.157566  3.091053  4.251389      10.0             7.0   

        n_obs_used       H  diameter  albedo neo pha     moid diam_bin  \
737819          11  17.223       NaN   0.078   N   N  1.17826  Missing   

        Predicted Diameter  
737819            2.291601  


In [25]:
def final_metrics(data, random_state=42, test_size=0.2,
                  scaler_path="Scalar.sav",
                  encoder_path="One-Hot-Encoder.sav",
                  model_path="/content/XGB_Updated_with_137681_pts.pkl"):
  """Score the pickled model on a stratified hold-out split of ``data``.

  Steps: drop incomplete rows, carve out a test split stratified on
  ``diam_bin``, scale/encode the test features with the pickled scaler and
  encoder, predict with the pickled model, and compute the metrics.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain ``a, q, data_arc, n_obs_used, H, albedo, neo, pha, moid,
    diam_bin`` and the target ``diameter``.
  random_state : int, default 42
    Seed passed to ``train_test_split``. The original author noted that
    switching it to 45 increases both the R-squared and the NMAE
    significantly, so results are sensitive to the split.
  test_size : float, default 0.2
    Fraction of the complete rows held out for testing.
  scaler_path, encoder_path, model_path : str
    Locations of the pickled scaler, encoder and model. The defaults are the
    paths the notebook originally hard-coded.

  Returns
  -------
  (rsq, nmae)
    ``rsq`` is ``r2_score`` on the test split; ``nmae`` is the *negated*
    ``mean_absolute_error`` on the same predictions.

  Notes
  -----
  * SECURITY: ``pickle.load`` can execute arbitrary code; only load
    artefacts you created or trust.
  * NOTE(review): the same encoder object is applied to both ``neo`` and
    ``pha`` - confirm it was fitted for that use.
  """
  # Column order handed to the scaler; kept exactly as in the original code.
  numeric_cols = ['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo', 'moid']

  def load_pickle(path):
    # Context manager so the file handle is closed as soon as loading ends.
    with open(path, "rb") as fh:
      return pickle.load(fh)

  def select_data(frame):
    # dropna() with no arguments discards a row if ANY column of `frame` is
    # missing, not only the columns selected below.
    complete = frame.dropna()
    x = complete[['a', 'q', 'data_arc', 'n_obs_used', 'H', 'albedo',
                  'neo', 'pha', 'moid', 'diam_bin']]
    y = complete['diameter']
    return x, y

  def split_data(x, y):
    # Only the test half is used; the training half is discarded.
    _, xte, _, yte = train_test_split(
        x, y, test_size=test_size, stratify=x['diam_bin'],
        random_state=random_state)
    # diam_bin was needed for stratification only. A non-inplace drop
    # returns a new frame instead of mutating the split output.
    return xte.drop(columns=['diam_bin']), yte

  def encode_test(test):
    scaler = load_pickle(scaler_path)
    scaled = scaler.transform(test[numeric_cols])
    # A fresh frame (default 0..n-1 index) receives the scaled values; row
    # order is unchanged, so it still lines up positionally with yte.
    features = pd.DataFrame(data=scaled, columns=numeric_cols)
    encoder = load_pickle(encoder_path)
    # to_numpy() yields a NumPy array whatever the column's storage dtype,
    # so the reshape into a single-column 2-D input always works.
    features['neo'] = encoder.transform(test['neo'].to_numpy().reshape(-1, 1))
    features['pha'] = encoder.transform(test['pha'].to_numpy().reshape(-1, 1))
    return features

  def model_metrics(xte, yte):
    model = load_pickle(model_path)
    # Predict once and reuse the result for both metrics.
    ypred = model.predict(xte)
    rsq = r2_score(yte, ypred)
    nmae = -(mean_absolute_error(yte, ypred))
    return rsq, nmae

  x, y = select_data(data)
  xte, yte = split_data(x, y)
  xtest = encode_test(xte)
  rsq, nm = model_metrics(xtest, yte)
  return rsq, nm

In [26]:
# Evaluate on the hold-out split produced with random state 42 and report
# both metrics.
r, nm = final_metrics(df)
for label, value in (
    ("R Squared for Test Data is : ", r),
    ("Negative Mean Absolute Error for Test Data is : ", nm),
):
  print(label, value)

R Squared for Test Data is :  0.9946893989353829
Negative Mean Absolute Error for Test Data is :  -0.38226829813287627
