In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1.25)
import optuna
from IPython.core.display import HTML,display
optuna.logging.set_verbosity(optuna.logging.WARNING)
from optuna.visualization import plot_param_importances
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
# Read the two CSV files: `df` is the training frame, `test_df` the frame that
# is scored at the end of the notebook.
train_csv = './dataset/train_E1GspfA.csv'
test_csv = './dataset/test_6QvDdzb.csv'
df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
df

## Checking the number of rows and columns

In [None]:
# Report the dimensions of both frames with one shared message template.
for frame, label in ((df, "Training"), (test_df, "Test")):
    n_rows, n_cols = frame.shape
    print(f"There are {n_rows} rows and {n_cols} columns in the {label} set")

## Datatype of the data

In [None]:
# Column dtypes of the training frame as parsed by read_csv.
df.dtypes

In [None]:
# Convert the `date` column of both frames to pandas datetimes so that the
# `.dt` accessor can be used on it.
for frame in (df, test_df):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Re-inspect the dtypes to confirm that `date` is now a datetime column.
df.dtypes

In [None]:
# Same dtype listing as the previous cell (duplicate check).
df.dtypes

## Visualizing the Training Data

In [None]:
# Demand over the complete training frame
fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(data=df, x='date', y='demand', color='red', lw=2, ax=ax)
plt.show()

In [None]:
# Demand during calendar year 2019 (start inclusive, end exclusive)
window_start, window_end = pd.Timestamp('2019-01-01'), pd.Timestamp('2020-01-01')
in_window = (df.date >= window_start) & (df.date < window_end)
one_year_df = df.loc[in_window]

fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=one_year_df, x='date', y='demand', color='red', lw=2, ax=ax)
plt.show()

In [None]:
# Demand during January 2019 (start inclusive, end exclusive)
window_start, window_end = pd.Timestamp('2019-01-01'), pd.Timestamp('2019-02-01')
in_window = (df.date >= window_start) & (df.date < window_end)
one_month_df = df.loc[in_window]

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(data=one_month_df, x='date', y='demand', lw=2, color='red', ax=ax)
plt.show()

In [None]:
# Demand during January and February 2019 (start inclusive, end exclusive)
window_start, window_end = pd.Timestamp('2019-01-01'), pd.Timestamp('2019-03-01')
in_window = (df.date >= window_start) & (df.date < window_end)
two_months_df = df.loc[in_window]

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(data=two_months_df, x='date', y='demand', lw=2, color='red', ax=ax)
plt.show()

In [None]:
# Demand by hour on a single day (2019-01-01); note the x axis is `hour`, not `date`
window_start, window_end = pd.Timestamp('2019-01-01'), pd.Timestamp('2019-01-02')
in_window = (df.date >= window_start) & (df.date < window_end)
one_day_df = df.loc[in_window]

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(data=one_day_df, x='hour', y='demand', lw=2, color='red', ax=ax)
plt.show()

## Decomposing Date Features

In [None]:
def _add_date_parts(frame):
    """Add calendar columns derived from ``frame['date']`` and return ``frame``.

    The columns are assigned in place and always in the same order (year,
    month, day, week_of_year, day_of_week, quarter). Running the same helper on
    the training and the test frame keeps their feature names, order and
    dtypes aligned, which matters because the model is later fed positional
    ``.values`` arrays.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a datetime ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with the six columns added or overwritten.
    """
    dates = frame['date'].dt
    frame['year'] = dates.year
    frame['month'] = dates.month
    frame['day'] = dates.day
    # isocalendar() returns a nullable UInt32 week column; cast it to a plain
    # integer so it has the same dtype as the other calendar features.
    frame['week_of_year'] = dates.isocalendar().week.astype(int)
    frame['day_of_week'] = dates.weekday
    frame['quarter'] = dates.quarter
    return frame


# Training dataset
_add_date_parts(df)

# Test dataset (same helper, so the column is `week_of_year` here as well)
_add_date_parts(test_df)
df

## Grouping Demands by month

In [None]:
# Mean and median demand per calendar month, drawn side by side.
demand_by_month = df.groupby('month')['demand']
fig, (ax_mean, ax_median) = plt.subplots(1, 2, figsize=(20, 7))

sns.barplot(data=demand_by_month.mean().reset_index(), x='month', y='demand', palette='Set1', ax=ax_mean)
ax_mean.set_title('Mean')

sns.barplot(data=demand_by_month.median().reset_index(), x='month', y='demand', palette='Set1', ax=ax_median)
ax_median.set_title('Median')

plt.show()


#### Inference: November has higher demand than the other months of the year.

In [None]:
# Flag November rows (1 = November, 0 = any other month).
# The flag is created with the same column name in both frames so that the
# training and test features stay aligned.
for frame in (df, test_df):
    frame['is_november'] = (frame['month'] == 11).astype(int)
test_df

## Grouping Demands by hour


In [None]:
# Mean and median demand per hour of the day, drawn side by side.
demand_by_hour = df.groupby('hour')['demand']
fig, (ax_mean, ax_median) = plt.subplots(1, 2, figsize=(20, 7))

sns.barplot(data=demand_by_hour.mean().reset_index(), x='hour', y='demand', palette='Set1', ax=ax_mean)
ax_mean.set_title('Mean')

sns.barplot(data=demand_by_hour.median().reset_index(), x='hour', y='demand', palette='Set1', ax=ax_median)
ax_median.set_title('Median')

plt.tight_layout()
plt.show()

#### Inference: We can observe that hours from 7 to 22 have higher demand as compared to other hours of the day. 

## Adding Peak hours features

In [None]:
# Flag hours 7 through 22 (both ends inclusive) as peak hours: 1 = peak, 0 = off-peak.
for frame in (df, test_df):
    frame['peak_hours'] = frame['hour'].between(7, 22).astype(int)
test_df

## Grouping demands by year

In [None]:
# Mean and median demand per year, drawn side by side.
demand_by_year = df.groupby('year')['demand']
fig, (ax_mean, ax_median) = plt.subplots(1, 2, figsize=(20, 7))

sns.barplot(data=demand_by_year.mean().reset_index(), x='year', y='demand', palette='Set1', ax=ax_mean)
ax_mean.set_title('mean')

sns.barplot(data=demand_by_year.median().reset_index(), x='year', y='demand', palette='Set2', ax=ax_median)
ax_median.set_title('median')

plt.show()


### Inference: We can observe that demand increases every year


In [None]:
# Encode the year as an offset from 2017 so the feature is a small integer.
base_year = 2017
for frame in (df, test_df):
    frame['year_code'] = frame['year'] - base_year

## Grouping demand by quarters

In [None]:
# Mean and median demand per quarter, drawn side by side.
demand_by_quarter = df.groupby('quarter')['demand']
fig, (ax_mean, ax_median) = plt.subplots(1, 2, figsize=(20, 7))

sns.barplot(data=demand_by_quarter.mean().reset_index(), x='quarter', y='demand', palette='Set1', ax=ax_mean)
ax_mean.set_title('mean')

sns.barplot(data=demand_by_quarter.median().reset_index(), x='quarter', y='demand', palette='Set2', ax=ax_median)
ax_median.set_title('median')

plt.show()


### Inference: Quarter 3 has lower demand as compared to other quarters

In [None]:
# Flag rows that fall in the third quarter (1 = Q3, 0 = any other quarter).
for frame in (df, test_df):
    frame['is_quarter_three'] = frame['quarter'].eq(3).astype(int)
test_df

## Grouping demands by day of the week

In [None]:
# Mean and median demand per day of the week, drawn side by side.
demand_by_weekday = df.groupby('day_of_week')['demand']
fig, (ax_mean, ax_median) = plt.subplots(1, 2, figsize=(20, 7))

sns.barplot(data=demand_by_weekday.mean().reset_index(), x='day_of_week', y='demand', palette='Set1', ax=ax_mean)
ax_mean.set_title('mean')

sns.barplot(data=demand_by_weekday.median().reset_index(), x='day_of_week', y='demand', palette='Set2', ax=ax_median)
ax_median.set_title('median')

plt.show()

### Inference: Friday, Saturday and Sunday have high demand

# Validation Set

In [None]:
# Compute 20% of samples
df.shape[0]*0.2

In [None]:
# Positional split: the first 3650 rows become the validation set and the
# remaining rows the training set.
# NOTE(review): if `df` is sorted chronologically, this validates on the oldest
# rows and trains on newer ones; confirm that this is intended.
n_val_rows = 3650
val, train = df.iloc[:n_val_rows], df.iloc[n_val_rows:]

In [None]:
# Report the dimensions of the two splits with one shared message template.
for split, label in ((train, "Training"), (val, "validation")):
    n_rows, n_cols = split.shape
    print(f"There are {n_rows} rows and {n_cols} columns in the {label} set")

In [None]:
# Columns that are not fed to the model:
#   - `date`    : the raw datetime column
#   - `year`    : represented by `year_code`
#   - `quarter` : represented by `is_quarter_three`
non_feature_cols = ['date', 'year', 'quarter']
target_col = 'demand'

xtrain = train.drop(columns=[target_col] + non_feature_cols)
ytrain = train[target_col]

xval = val.drop(columns=[target_col] + non_feature_cols)
yval = val[target_col]

# Rebind instead of dropping in place, and tolerate already-removed columns, so
# that re-running this cell does not raise a KeyError.
test_df = test_df.drop(columns=non_feature_cols, errors='ignore')

# Modelling

In [None]:
# Baseline LightGBM regressor: up to 3000 trees at a 0.01 learning rate, with
# early stopping (100 rounds) monitored on the validation set.
model = lgb.LGBMRegressor(n_estimators=3000, learning_rate=0.01)
stop_early = lgb.early_stopping(100)
validation_sets = [(xval.values, yval)]
model.fit(xtrain.values, ytrain, eval_set=validation_sets, eval_metric='rmse', callbacks=[stop_early])

In [None]:
# Show the RMSE that the fitted model stored for its first eval set ('valid_0').
baseline_rmse = model.best_score_['valid_0']['rmse']
display(HTML(f"<h3> <b style='color:#673AB7;font-size:22px;'>This model gave rmse: <b style='color:red;'>{baseline_rmse:0.4F}</b></h3>"))

In [None]:
def objective(trial):
    """Optuna objective: validation RMSE of a LightGBM regressor.

    Samples one hyper-parameter set from ``trial``, fits an ``LGBMRegressor``
    on ``xtrain``/``ytrain`` with early stopping monitored on ``xval``/``yval``,
    and scores the fitted model on the validation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``xval``.
    """
    params = {
        # Fixed upper bound on the number of trees; early stopping ends training sooner.
        'n_estimators': 4000,
        'num_leaves': trial.suggest_int('num_leaves', 35, 80),
        # Held constant, not tuned.
        'subsample': 1,
        'min_child_samples': trial.suggest_int("min_child_samples", 30, 100),
        'learning_rate': trial.suggest_categorical("learning_rate", [0.001, 0.01, 0.03, 0.05, 0.07]),
        'max_depth': trial.suggest_int("max_depth", 4, 12),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 50),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 50),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 20),
        # Single-choice categorical: the value is always 1.
        'subsample_freq': trial.suggest_categorical("subsample_freq", [1]),
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(
        xtrain.values, ytrain,
        eval_set=[(xval.values, yval)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(100, verbose=0)],
    )

    # Predict on the same kind of input the model was fitted on (a plain array).
    pred = model.predict(xval.values)

    # Take the square root explicitly instead of passing `squared=False`, which
    # newer scikit-learn releases no longer accept.
    return float(np.sqrt(mean_squared_error(yval, pred)))

In [None]:
# Minimise the validation RMSE returned by `objective` over 300 trials.
# The TPE sampler is seeded so that the sequence of sampled hyper-parameters
# is repeatable from one run of the notebook to the next.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=300, show_progress_bar=True)

In [None]:
# Fetch the best trial and keep its hyper-parameters for the final fit.
trial = study.best_trial
best_params_lgbm = trial.params

# Show the best objective value together with the parameters; a bare
# `study.best_value` in the middle of a cell is evaluated but never displayed.
display(HTML(
    f"<h3 style='color:#673AB7'>Best RMSE : {study.best_value:0.4F}<br><br>"
    f"Best Params :<br><br><pre>{best_params_lgbm}</pre></h3>"
))

In [None]:
# Refit with the best hyper-parameters from the study: up to 4000 trees, with
# early stopping (100 rounds) monitored on the validation set.
model = lgb.LGBMRegressor(n_estimators=4000, **best_params_lgbm)
stop_early = lgb.early_stopping(100)
validation_sets = [(xval.values, yval)]
model.fit(xtrain.values, ytrain, eval_set=validation_sets, eval_metric='rmse', callbacks=[stop_early])

In [None]:
# Score the test frame with the tuned model. The features are passed as a
# positional array, so the column order of `test_df` must match `xtrain`.
test_features = test_df.values
preds = model.predict(test_features)
preds