In [None]:
# imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# Get The Data

In [None]:
# Read the train and test CSV files into DataFrames.
train_path, test_path = '/content/train.csv', '/content/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the raw training frame.
train_df.head(n=5)

Unnamed: 0,ID,Date,y,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


# Data Exploration And EDA

In [None]:
# Replace the raw training headers (which carry units and mis-encoded symbols)
# with short, clean names. The order must match the file's column order.
col_names = [
    'ID', 'Date', 'y', 'Hour',
    'Temperature', 'Humidity', 'Wind speed', 'Visibility',
    'Dew point temperature', 'Solar Radiation', 'Rainfall', 'Snowfall',
    'Seasons', 'Holiday', 'Functioning Day',
]
train_df.columns = col_names

In [None]:
# Clean names for the test frame: the same list as for training, without the target 'y'.
col_names = (
    ['ID', 'Date', 'Hour']
    + ['Temperature', 'Humidity', 'Wind speed', 'Visibility', 'Dew point temperature']
    + ['Solar Radiation', 'Rainfall', 'Snowfall']
    + ['Seasons', 'Holiday', 'Functioning Day']
)
test_df.columns = col_names

In [None]:
# Parse 'Date' and derive calendar features on both frames (in place).
for data in [train_df, test_df]:
    # The raw strings look like '01/12/2017' and are treated here as day-first
    # (DD/MM/YYYY) -- NOTE(review): confirm against the data source.
    # Without an explicit format pandas reads such ambiguous values month-first,
    # which would corrupt the Month/Weekday features derived below.
    data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
    # Month and weekday names; these string columns are one-hot encoded later
    # by pd.get_dummies, and 'Weekday' also drives the 'weekend' flag.
    data['Month'] = data['Date'].dt.strftime('%B')
    data['Weekday'] = data['Date'].dt.strftime('%A')

- there is a strong correlation between temperature and dew point temperature
- temperature and hour are the features most correlated with y

# Data Preparation

In [None]:
# Feature engineering, applied in place to both the train and test frames.
# NOTE: this cell is not idempotent -- a second run would log-transform
# 'Visibility' again -- so reload the data before re-running it.
for data in [train_df, test_df]:
    # Binarise rainfall: any non-zero amount becomes 1.
    data.loc[data['Rainfall'] != 0, 'Rainfall'] = 1
    # Natural-log transform of visibility.
    data['Visibility'] = np.log(data['Visibility'])

    # Integer category codes for the holiday label.
    data['Holiday'] = data['Holiday'].astype('category').cat.codes

    # Cyclical encoding of the hour of day. Assuming 'Hour' takes the values
    # 0-23, the period is 24; dividing by 23 would give hour 0 and hour 23
    # exactly the same encoded value.
    data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)

    # Map the 'Yes'/'No' labels to 1/0 and store them as integers.
    data.loc[data['Functioning Day'] == 'Yes', 'Functioning Day'] = 1
    data.loc[data['Functioning Day'] == 'No', 'Functioning Day'] = 0
    data['Functioning Day'] = data['Functioning Day'].astype(int)

    # 0/1 indicator flags.
    data['Hot'] = (data['Temperature'] >= 20).astype(int)            # temperature of 20 or more
    data['isOfficeHour'] = data['Hour'].between(17, 20).astype(int)  # hours 17-20 inclusive
    data['weekend'] = (data['Weekday'] == 'Sunday').astype(int)      # Sundays only


NameError: ignored

In [None]:
# Remove the columns that are not fed to the models. The training frame is
# modified in place; the test frame is copied into X_test so that test_df
# keeps its 'ID' column for the submission file.
dropped_cols = ['ID', 'Date', 'Snowfall']
train_df.drop(columns=dropped_cols, inplace=True)
X_test = test_df.drop(columns=dropped_cols)

In [None]:
# Show up to 100 columns so the widened one-hot frames display in full.
pd.options.display.max_columns = 100

# One-hot encode the remaining string columns of the training frame.
train_df = pd.get_dummies(train_df)
# Encoding the test frame on its own can produce a different column set when a
# category is missing from one of the splits. Align it to the training feature
# columns (everything except the target 'y'): same names, same order, dummies
# absent from the test data filled with 0, extra test-only dummies dropped.
X_test = pd.get_dummies(X_test).reindex(columns=train_df.columns.drop('y'), fill_value=0)

In [None]:
# Inspect the first five rows of the encoded training frame.
train_df.head(n=5)

In [None]:
# Inspect the first five rows of the encoded test features.
X_test.head(n=5)

In [None]:
## removing some outliers
#print(f'Shape of training set before removing outliers is : {train_df.shape}')
#train_df = train_df[train_df['Wind speed'] < 5.5]
#train_df = train_df[train_df['y'] < 2500]
#print(f'Shape of training set after removing outliers is : {train_df.shape}')

# Splitting Data

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# The models are fitted on log1p(y); print_metrics maps values back with expm1.
target = np.log1p(train_df['y'])
features = train_df.drop(columns=['y'])

# Shuffled 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

NameError: ignored

# Training Some Models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_log_error, make_scorer

# Random forest fitted on standardised, degree-2 polynomial features.
# random_state is pinned (same seed as the train/validation split) so the
# forest's randomness -- and therefore the reported scores -- is reproducible
# across kernel restarts.
RN_model = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=42)
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', RN_model),
]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scalar',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('poly',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('model',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=5,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=200, n_jobs=None,
                                       oob_score=False, rand

In [None]:
def print_metrics(y_train_pred, y_val_pred, model_name, y_train_true=None, y_val_true=None):
    """Print R2 and root-mean-squared-log-error for the train and validation splits.

    All predictions and targets are expected on the log1p scale; they are mapped
    back with ``np.expm1`` before scoring.

    Parameters
    ----------
    y_train_pred, y_val_pred : array-like
        Model predictions (log1p scale) for the training / validation features.
    model_name : str
        Label printed in the header line.
    y_train_true, y_val_true : array-like, optional
        True targets (log1p scale). When omitted, the notebook-level ``y_train``
        and ``y_val`` are used, which is the original behaviour.
    """
    if y_train_true is None:
        y_train_true = y_train
    if y_val_true is None:
        y_val_true = y_val

    def _counts(log_values):
        # Back-transform, round to whole numbers and clamp at zero:
        # mean_squared_log_error raises ValueError on negative inputs, which a
        # sufficiently negative log-scale prediction would otherwise produce.
        return np.clip(np.round(np.expm1(log_values)), 0, None)

    def _r2(true_log, pred_log):
        # R2 on the original (back-transformed) scale.
        return r2_score(np.expm1(true_log), np.expm1(pred_log))

    def _rmsle(true_log, pred_log):
        # Square root of the mean squared log error on the rounded values.
        return np.sqrt(mean_squared_log_error(_counts(true_log), _counts(pred_log)))

    print('============' + model_name + '============' )
    print('R2_Score for Training Data Is : ', _r2(y_train_true, y_train_pred))
    print('R2_Score For Validation Data Is : ', _r2(y_val_true, y_val_pred))
    print('Square Root Of Mean Squared Log Error For Training Data : ', _rmsle(y_train_true, y_train_pred))
    print('Square Root Of Mean Squared Log Error For Validation Data : ', _rmsle(y_val_true, y_val_pred))
    print('\n')

In [None]:
# Predict with the fitted random-forest pipeline on both splits and report its scores.
RN_train_pred, RN_val_pred = (pipe.predict(split) for split in (X_train, X_val))
print_metrics(RN_train_pred, RN_val_pred, 'Random Forest Regressor')

R2_Score for Training Data Is :  0.7185019746370751
R2_Score For Validation Data Is :  0.7104682446551704
Square Root Of Mean Squared Log Error For Training Data :  0.4968252510225262
Square Root Of Mean Squared Log Error For Validation Data :  0.5152342380150058




In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees fitted on the same training split (log1p target).
xg_params = dict(base_score=0.5, booster='gbtree', learning_rate=0.1)
xg_model = XGBRegressor(**xg_params)
xg_model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [None]:
# Predict with the fitted XGBoost model on both splits and report its scores.
xg_train_pred, xg_val_pred = (xg_model.predict(split) for split in (X_train, X_val))
print_metrics(xg_train_pred, xg_val_pred, 'xgboosting Regressor')

R2_Score for Training Data Is :  0.8489342254671565
R2_Score For Validation Data Is :  0.8337242455884731
Square Root Of Mean Squared Log Error For Training Data :  0.42842488796357237
Square Root Of Mean Squared Log Error For Validation Data :  0.4570728984584059




In [None]:
# Write the submission file (ID + predicted y).
# No earlier cell in this notebook attaches predictions to test_df, so selecting
# ['ID', 'y'] would raise KeyError on a fresh kernel. When 'y' is missing, fill it
# from the XGBoost model, which has the better validation scores of the two models.
if 'y' not in test_df.columns:
    # Give the model exactly the columns (and order) it was fitted on.
    X_submit = X_test.reindex(columns=X_train.columns, fill_value=0)
    # The models were trained on log1p(y): invert with expm1 and clamp at zero.
    test_df['y'] = np.clip(np.expm1(xg_model.predict(X_submit)), 0, None)
test_df[['ID', 'y']].to_csv('submission.csv', index = False)