In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.impute import SimpleImputer
import plotly.express as px
import warnings
warnings.filterwarnings(action = 'ignore')
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
import xgboost as xgb
%matplotlib inline

In [11]:
# Load the raw observations and normalise the column names to lower case.
data = pd.read_csv('weatherAUS.csv')
data.columns = data.columns.str.lower()

# Text columns are the ones pandas reports with the 'object' dtype.
string_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']

# Lower-case every text value and replace spaces with underscores.
data = data.assign(**{
    name: data[name].str.lower().str.replace(' ', '_')
    for name in string_columns
})

# Rows without a label are dropped; the label becomes 1 for 'yes' and 0 otherwise.
data = (
    data
    .dropna(subset=['raintomorrow'])
    .assign(raintomorrow=lambda frame: (frame['raintomorrow'] == 'yes').astype(int))
)

# Re-derive the dtype groups: the label is an integer now, so it is part of
# `numerical`, while `categorical` holds the remaining 'object' columns.
categorical = [name for name, dtype in data.dtypes.items() if dtype == 'object']
numerical = [name for name, dtype in data.dtypes.items() if dtype != 'object']
# Work on a copy so `data` keeps the cleaned observations untouched.
# NOTE: `df` now holds the date-derived columns but is NOT imputed; imputation
# is applied per split below so that no statistics leak across the split.
df = data.copy()

# Parse the date column once and derive the calendar features from it.
observation_date = pd.to_datetime(df['date'])
df['year'] = observation_date.dt.year
df['month'] = observation_date.dt.month
df['day'] = observation_date.dt.day

# Chronological split: years before 2015 train the model, 2015 validates it,
# later years are held out for testing.
train_df = df[df['year'] < 2015].reset_index(drop=True)
val_df = df[df['year'] == 2015].reset_index(drop=True)
test_df = df[df['year'] > 2015].reset_index(drop=True)

# Fit the imputers on the training rows only. Fitting on the full data set
# (as was done before the split) lets validation/test values influence the
# fill-in mean / most frequent value, which is data leakage.
imputer = SimpleImputer(strategy='mean')
imputer2 = SimpleImputer(strategy='most_frequent')
train_df[numerical] = imputer.fit_transform(train_df[numerical])
train_df[categorical] = imputer2.fit_transform(train_df[categorical])

# Apply the statistics learned from the training rows to the other splits.
for split_df in (val_df, test_df):
    if split_df.empty:
        # transform() rejects zero-row input; an empty split needs no imputation.
        continue
    split_df[numerical] = imputer.transform(split_df[numerical])
    split_df[categorical] = imputer2.transform(split_df[categorical])

# Labels for each split.
y_train = train_df['raintomorrow']
y_val = val_df['raintomorrow']
y_test = test_df['raintomorrow']

# Feature frames: drop the label, the raw date string, and `year`
# (only `month` and `day` are kept as calendar features).
non_feature_columns = ['raintomorrow', 'year', 'date']
train = train_df.drop(columns=non_feature_columns)
val = val_df.drop(columns=non_feature_columns)
test = test_df.drop(columns=non_feature_columns)


# Turn every split into a list of per-row dictionaries for the vectorizer.
dict_train, dict_val, dict_test = (
    frame.to_dict(orient='records') for frame in (train, val, test)
)

# The vectorizer is fitted on the training records only; validation and test
# records are encoded with the feature mapping it produced.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val, X_test = (dv.transform(records) for records in (dict_val, dict_test))

# Wrap each feature matrix together with its labels for xgboost.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)


# Booster configuration for a binary classifier; `seed` is fixed so repeated
# runs use the same random state.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=30,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
)

# Train for a fixed 430 boosting rounds on the training matrix.
# No `evals` watchlist is passed to the call.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=430)

In [12]:


# Predicted probabilities for the held-out test rows.
y_pred_xgb = model.predict(dtest)

# Score the predictions so the model's quality is visible before it is saved;
# the value is shown as this cell's output.
roc_auc_score(y_test, y_pred_xgb)




In [13]:
import bentoml

# Signature for the booster's `predict` method as registered with BentoML.
predict_signature = {"batchable": True, "batch_dim": 0}

# Store the trained booster under the name "rain_tomorrow_model" and attach the
# fitted DictVectorizer as a custom object under the key "dictVectorizer".
# The call is left as the last expression so the returned Model is displayed.
bentoml.xgboost.save_model(
    "rain_tomorrow_model",
    model,
    custom_objects={"dictVectorizer": dv},
    signatures={"predict": predict_signature},
)

Model(tag="rain_tomorrow_model:egqcgtsv72rzv6fm", path="C:\Users\Odiaka\bentoml\models\rain_tomorrow_model\egqcgtsv72rzv6fm\")