In [0]:
# LTFS Data Science FinHack 2
## https://datahack.analyticsvidhya.com/contest/ltfs-data-science-finhack-2-an-online-hackathon/

In [0]:
# Simple approach using Ada Boost
## Public Leaderboard score - 13.4600818316021
## Private Leaderboard score - 23.0178940307931

In [0]:
%reload_ext autoreload
%autoreload 2

In [0]:
# importing Libraries
import os
import datetime
from tqdm import tqdm
## importing pandas and numpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
### importing Fastai library for add_datepart function
from fastai import *
from fastai.tabular import *
#### importing SKlearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [0]:
# Fixing Seed=2 for NumPy's global RNG and for Python's built-in `random` module.
# `random` is imported explicitly here: previously it was only in scope as a
# side effect of the wildcard `from fastai import *` imports.
import random

manualSeed = 2
np.random.seed(manualSeed)
random.seed(manualSeed)

In [12]:
# Mounting Google Drive
# Makes the Drive contents reachable under /content/gdrive; the `google.colab`
# package is used here, so this cell is expected to run inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [13]:
# Root project folder on the mounted Drive; dataset and output paths below are built from it
path="/content/gdrive/My Drive/AIML/Data Science FinHack 2/"
# List the folder contents (quick check that the path resolves)
print(os.listdir(path))

['My_Notebooks', 'My_Dataset', 'My_Models', 'My_Outputs', 'Bin']


In [0]:
# Function to convert date to pandas datetime datatype
def cnv_2_datetime(df,col_name='application_date'):
  """Parse a day-month-year text column into pandas datetimes, in place.

  Args:
    df: DataFrame holding the column; it is modified in place.
    col_name: name of the column whose values look like 'dd-mm-YYYY'.

  Returns:
    The same DataFrame object, with `col_name` replaced by its parsed values.
  """
  parsed_dates = pd.to_datetime(df[col_name], format='%d-%m-%Y')
  df[col_name] = parsed_dates
  return df

In [0]:
# Function to convert categorical columns as pandas category type
def cnv_2_category(df,col_names):
  """Cast each listed column to the pandas 'category' dtype, in place.

  Args:
    df: DataFrame to modify in place.
    col_names: iterable of column names to convert.

  Returns:
    The same DataFrame object.
  """
  for name in col_names:
    as_category = df[name].astype('category')
    df[name] = as_category
  return df

In [0]:
# To drop columns
def drop_col(df,col_names):
  """Remove the given columns from `df` in place and return the same frame.

  Args:
    df: DataFrame to modify in place.
    col_names: column label or list of labels to remove; a missing label
      raises KeyError (pandas' default behaviour for `drop`).

  Returns:
    The same DataFrame object, without the dropped columns.
  """
  df.drop(columns=col_names, inplace=True)
  return df

In [0]:
# Combined pre-processing: parse the date column, cast categorical columns to str, drop unused columns
def process_df(df,cat_names,drop_cols,col_name='application_date'):
  """Pre-process a raw frame in place: parse dates, stringify categoricals, drop columns.

  Steps, in order:
    1. `col_name` is parsed from 'dd-mm-YYYY' text into datetimes.
    2. Every column in `cat_names` is cast to `str` (not to 'category').
    3. The columns in `drop_cols` are removed.

  Args:
    df: DataFrame to modify in place.
    cat_names: iterable of categorical column names to cast to str.
    drop_cols: column label(s) to remove; a missing label raises KeyError.
    col_name: name of the date column.

  Returns:
    The same DataFrame object.
  """
  df[col_name] = pd.to_datetime(df[col_name], format='%d-%m-%Y')
  for name in cat_names:
    df[name] = df[name].astype(str)
  df.drop(columns=drop_cols, inplace=True)
  return df

In [18]:
# Loading Datasets
# The original .csv files were manually split into train and validation files,
# and test files were created from the test .csv to match the train/validation format.
dataset_dir = path+'My_Dataset/'

# ---- Segment 1
train_df_1 = pd.read_csv(dataset_dir+'train_Seg_1.csv')
valid_df_1 = pd.read_csv(dataset_dir+'valid_Seg_1.csv')
test_df_1 = pd.read_csv(dataset_dir+'test_Seg_1.csv')

# ---- Segment 2
# For segment 2 some states have no initial data, i.e. the case counts are 0 up to
# some initial time frame. Those leading zero rows were deleted manually and the
# results saved as the *_v1 files loaded below (the untrimmed versions are
# train_Seg_2.csv, valid_Seg_2.csv and test_Seg_2.csv).
train_df_2 = pd.read_csv(dataset_dir+'train_Seg_2_v1.csv')
valid_df_2 = pd.read_csv(dataset_dir+'valid_Seg_2_v1.csv')
test_df_2 = pd.read_csv(dataset_dir+'test_Seg_2_v1.csv')

sub_df = pd.read_csv(dataset_dir+'sample_submission_IIzFVsf.csv')

# Independent copies of the raw test frames, taken before any feature engineering
pred_test_df_1=test_df_1.copy()
pred_test_df_2=test_df_2.copy()
train_df_1.head(2)

Unnamed: 0,application_date,segment,branch_id,state,zone,case_count
0,01-04-2017,1,1,WEST BENGAL,EAST,40
1,03-04-2017,1,1,WEST BENGAL,EAST,5


In [0]:
# Group the train/valid/test frames of each segment so one step can be applied to all of them
dfs_1=[train_df_1,valid_df_1,test_df_1]
dfs_2=[train_df_2,valid_df_2,test_df_2]

In [0]:
# Parse application_date in every train/valid/test frame of both segments.
# cnv_2_datetime modifies its argument in place, so the frames referenced by
# dfs_1 / dfs_2 are updated directly and the return value is not needed.
for df in dfs_1+dfs_2:
  cnv_2_datetime(df,col_name='application_date')

In [21]:
# (rows, columns) of the segment-1 training frame
train_df_1.shape

(65321, 6)

In [0]:
# Adding holidays data
# https://www.calendarlabs.com/holidays/india/2019
# `x` receives the raw lines of the holiday file, trailing newlines included
with open(path+'My_Dataset/'+'Holidays.txt', 'r') as holidays_file:
    x = list(holidays_file)

In [0]:
# One holiday entry per line of the file, with the trailing newline removed
Holiday_list=[raw_line.rstrip('\n') for raw_line in x]

In [0]:
# Getting columns for pre-processing
# Segment 1 keeps branch_id/state/zone as categorical features and only drops `segment`;
# segment 2 keeps `state` only and additionally drops branch_id and zone.
cat_col_1=['branch_id','state','zone']
cat_col_2=['state']

drop_cols_1=['segment']
drop_cols_2=drop_cols_1+['branch_id','zone']

# The raw test copies are trimmed with the same column lists (kept as separate list objects)
pred_drop_cols_1=list(drop_cols_1)
pred_drop_cols_2=list(drop_cols_2)

In [0]:
# Pre-process the segment-1 frames: parse dates, cast categoricals to str, drop `segment`
train_df_1, valid_df_1, test_df_1 = [
  process_df(frame,cat_col_1,drop_cols_1)
  for frame in (train_df_1, valid_df_1, test_df_1)
]

# Same treatment for the copy of the raw segment-1 test frame
pred_test_df_1 = process_df(pred_test_df_1,cat_col_1,pred_drop_cols_1)


In [0]:
# Pre-process the segment-2 frames: parse dates, cast `state` to str,
# drop `segment`, `branch_id` and `zone`
train_df_2, valid_df_2, test_df_2 = [
  process_df(frame,cat_col_2,drop_cols_2)
  for frame in (train_df_2, valid_df_2, test_df_2)
]

# Same treatment for the copy of the raw segment-2 test frame
pred_test_df_2 = process_df(pred_test_df_2,cat_col_2,pred_drop_cols_2)

In [0]:
def add_holidays(df,Holiday_list=Holiday_list):
  """Add an integer `is_holiday` column (1/0) to `df`, in place.

  A row is flagged when the string form of its `application_date` appears in
  `Holiday_list`.

  Args:
    df: DataFrame with an `application_date` column; modified in place.
    Holiday_list: collection of date strings. The default is the module-level
      `Holiday_list` as it exists when this function is defined.
      NOTE(review): entries presumably use the same text format that
      `astype("str")` produces for the date column — confirm against the file.

  Returns:
    The same DataFrame object.
  """
  date_text = df.application_date.astype("str")
  on_holiday = date_text.isin(Holiday_list)
  df["is_holiday"] = on_holiday.astype(int)
  return df

In [0]:
# Flag holidays in every modelling frame of both segments
# (add_holidays writes `is_holiday` in place and hands the same frame back)
train_df_1, valid_df_1, test_df_1 = [
  add_holidays(frame) for frame in (train_df_1, valid_df_1, test_df_1)
]

train_df_2, valid_df_2, test_df_2 = [
  add_holidays(frame) for frame in (train_df_2, valid_df_2, test_df_2)
]

In [29]:
# Column dtypes of the segment-1 training frame after pre-processing and the holiday flag
train_df_1.dtypes

application_date    datetime64[ns]
branch_id                   object
state                       object
zone                        object
case_count                   int64
is_holiday                   int64
dtype: object

In [30]:
# Peek at the first rows before the date features are expanded
train_df_1.head(2)

Unnamed: 0,application_date,branch_id,state,zone,case_count,is_holiday
0,2017-04-01,1,WEST BENGAL,EAST,40,0
1,2017-04-03,1,WEST BENGAL,EAST,5,0


In [31]:
# Adding date time features
# add_datepart expands `application_date` into calendar features and drops the
# original column (drop=True); time-of-day parts are skipped (time=False).
for frame in (train_df_1, valid_df_1, test_df_1, train_df_2, valid_df_2):
  add_datepart(frame,'application_date',drop=True,time=False)

# The final call is left as a bare expression so the notebook renders its result
add_datepart(test_df_2,'application_date',drop=True,time=False)

Unnamed: 0,state,is_holiday,application_Year,application_Month,application_Week,application_Day,application_Dayofweek,application_Dayofyear,application_Is_month_end,application_Is_month_start,application_Is_quarter_end,application_Is_quarter_start,application_Is_year_end,application_Is_year_start,application_Elapsed
0,ASSAM,0,2019,7,30,24,2,205,False,False,False,False,False,False,1563926400
1,ASSAM,0,2019,7,30,25,3,206,False,False,False,False,False,False,1564012800
2,ASSAM,0,2019,7,30,26,4,207,False,False,False,False,False,False,1564099200
3,ASSAM,0,2019,7,30,27,5,208,False,False,False,False,False,False,1564185600
4,ASSAM,0,2019,7,30,28,6,209,False,False,False,False,False,False,1564272000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,WEST BENGAL,0,2019,10,42,20,6,293,False,False,False,False,False,False,1571529600
1298,WEST BENGAL,0,2019,10,43,21,0,294,False,False,False,False,False,False,1571616000
1299,WEST BENGAL,0,2019,10,43,22,1,295,False,False,False,False,False,False,1571702400
1300,WEST BENGAL,0,2019,10,43,23,2,296,False,False,False,False,False,False,1571788800


In [32]:
# Peek at the first rows after the date features were added
train_df_1.head(2)

Unnamed: 0,branch_id,state,zone,case_count,is_holiday,application_Year,application_Month,application_Week,application_Day,application_Dayofweek,application_Dayofyear,application_Is_month_end,application_Is_month_start,application_Is_quarter_end,application_Is_quarter_start,application_Is_year_end,application_Is_year_start,application_Elapsed
0,1,WEST BENGAL,EAST,40,0,2017,4,13,1,5,91,False,True,False,True,False,False,1491004800
1,1,WEST BENGAL,EAST,5,0,2017,4,14,3,0,93,False,False,False,False,False,False,1491177600


In [0]:
# Distinct category values seen in the segment-1 training frame; these define
# the label vocabulary used to encode every other frame
states=train_df_1['state'].unique()
zones=train_df_1['zone'].unique()
bid=train_df_1['branch_id'].unique()

In [0]:
# Encoding labels for categorical variables
# fit_transform returns one integer label per entry, aligned with the input order.
# The single encoder is refit for each column, so only the returned label arrays
# are kept; after this cell `le` itself is fitted on `bid`.
le=LabelEncoder()
state_labels=le.fit_transform(states)
zone_labels=le.fit_transform(zones)
bid_labels=le.fit_transform(bid)

In [0]:
# Forward (value -> label) and reverse (label -> value) lookup tables for each
# categorical column; values and labels are paired position by position.
state_dict=dict(zip(states,state_labels))
state_dict_rev=dict(zip(state_labels,states))

zone_dict=dict(zip(zones,zone_labels))
zone_dict_rev=dict(zip(zone_labels,zones))

bid_dict=dict(zip(bid,bid_labels))
bid_dict_rev=dict(zip(bid_labels,bid))

In [36]:
# Mapping from state name to integer label
state_dict

{'ANDHRA PRADESH': 0,
 'ASSAM': 1,
 'BIHAR': 2,
 'CHHATTISGARH': 3,
 'DELHI': 4,
 'GUJARAT': 5,
 'HARYANA': 6,
 'JHARKHAND': 7,
 'KARNATAKA': 8,
 'KERALA': 9,
 'MADHYA PRADESH': 10,
 'MAHARASHTRA': 11,
 'ORISSA': 12,
 'PUNJAB': 13,
 'TAMIL NADU': 14,
 'TELANGANA': 15,
 'TRIPURA': 16,
 'UTTAR PRADESH': 17,
 'UTTARAKHAND': 18,
 'WEST BENGAL': 19}

In [37]:
# Mapping from zone name to integer label
zone_dict

{'CENTRAL': 0, 'EAST': 1, 'NORTH': 2, 'SOUTH': 3, 'WEST': 4}

In [0]:
# Encoding labels for categorical variables
# A value that was never seen in the segment-1 training frame raises KeyError here.

# Segment-1 frames carry state, zone and branch_id
for frame in (train_df_1, valid_df_1, test_df_1):
  frame['state'] = frame['state'].apply(lambda name: state_dict[name])
  frame['zone'] = frame['zone'].apply(lambda name: zone_dict[name])
  frame['branch_id'] = frame['branch_id'].apply(lambda name: bid_dict[name])

# Segment-2 frames only kept the state column
for frame in (train_df_2, valid_df_2, test_df_2):
  frame['state'] = frame['state'].apply(lambda name: state_dict[name])

In [39]:
# Peek at the first rows after label encoding
train_df_1.head()

Unnamed: 0,branch_id,state,zone,case_count,is_holiday,application_Year,application_Month,application_Week,application_Day,application_Dayofweek,application_Dayofyear,application_Is_month_end,application_Is_month_start,application_Is_quarter_end,application_Is_quarter_start,application_Is_year_end,application_Is_year_start,application_Elapsed
0,0,19,1,40,0,2017,4,13,1,5,91,False,True,False,True,False,False,1491004800
1,0,19,1,5,0,2017,4,14,3,0,93,False,False,False,False,False,False,1491177600
2,0,19,1,4,0,2017,4,14,4,1,94,False,False,False,False,False,False,1491264000
3,0,19,1,113,1,2017,4,14,5,2,95,False,False,False,False,False,False,1491350400
4,0,19,1,76,0,2017,4,14,7,4,97,False,False,False,False,False,False,1491523200


In [40]:
# Encoded state labels present in the segment-1 training frame (one model is trained per label)
states_1=train_df_1['state'].unique()
states_1

array([19,  4,  8, 14, 17, 13, 15,  0, 11, 12,  5,  1,  7, 18,  9,  3,  2, 16, 10,  6])

In [41]:
# Encoded state labels present in the segment-2 training frame (one model is trained per label)
states_2=train_df_2['state'].unique()
states_2

array([ 1,  2,  3,  5,  7,  9,  8, 11, 10, 12, 14, 16, 17, 19])

In [0]:
# Accumulators for segment 1: validation MAE per state, and the matching state names
Ada_seg_1=[]
states_seg_1=[]

In [43]:
# Using Adaboost Regressor for segment 1
# One independent model is fitted per state: it is scored on that state's
# validation rows (MAE) and then used to predict that state's test rows.

# Start from empty score accumulators so re-running this cell does not append duplicates
Ada_seg_1=[]
states_seg_1=[]

# Per-state results are collected in lists and joined once after the loop.
# The leading empty array keeps the result an empty array when there are no states.
pred_chunks_1=[np.array([])]
date_chunks_1=[np.array([])]
for i in states_1:
  state_name=state_dict_rev[i]

  # Filter each frame once per state and derive features/target from the filtered rows
  train_rows_1=train_df_1[train_df_1['state']==i]
  valid_rows_1=valid_df_1[valid_df_1['state']==i]
  y1=train_rows_1['case_count']
  yv1=valid_rows_1['case_count']
  X1=train_rows_1.drop(columns=['case_count'])
  Xv1=valid_rows_1.drop(columns=['case_count'])

  ada_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=17), n_estimators=300, loss='exponential')
  ada_1.fit(X1, y1)
  yp1=ada_1.predict(Xv1)
  metric=mean_absolute_error(yv1, yp1)
  Ada_seg_1.append(round(metric,3))
  states_seg_1.append(state_name)
  print("Metric for State", state_name,"=",round(metric,3))

  # Test predictions, paired with the dates taken from the raw test copy
  # (pred_test_df_1 still holds state names, hence the lookup by name)
  ada_preds1 = ada_1.predict(test_df_1[test_df_1['state']==i])
  dates1=pred_test_df_1[pred_test_df_1['state']==state_name]['application_date']
  dates1=dates1.astype('str').to_numpy()

  pred_chunks_1.append(ada_preds1)
  date_chunks_1.append(dates1)

final_preds_1=np.concatenate(pred_chunks_1, axis=0)
final_dates_1=np.concatenate(date_chunks_1, axis=0)

Metric for State WEST BENGAL = 17.805
Metric for State DELHI = 36.316
Metric for State KARNATAKA = 24.034
Metric for State TAMIL NADU = 10.372
Metric for State UTTAR PRADESH = 5.496
Metric for State PUNJAB = 6.852
Metric for State TELANGANA = 14.413
Metric for State ANDHRA PRADESH = 3.143
Metric for State MAHARASHTRA = 11.668
Metric for State ORISSA = 29.368
Metric for State GUJARAT = 17.181
Metric for State ASSAM = 17.418
Metric for State JHARKHAND = 7.994
Metric for State UTTARAKHAND = 10.364
Metric for State KERALA = 6.684
Metric for State CHHATTISGARH = 3.453
Metric for State BIHAR = 11.975
Metric for State TRIPURA = 12.37
Metric for State MADHYA PRADESH = 11.987
Metric for State HARYANA = 4.84


In [44]:
# Sanity check: every segment-1 prediction has a matching date
len(final_dates_1)==len(final_preds_1)

True

In [0]:
# Accumulators for segment 2: validation MAE per state, and the matching state names
Ada_seg_2=[]
states_seg_2=[]

In [46]:
# Using Adaboost Regressor for segment 2
# One independent model is fitted per state: it is scored on that state's
# validation rows (MAE) and then used to predict that state's test rows.

# Start from empty score accumulators so re-running this cell does not append duplicates
Ada_seg_2=[]
states_seg_2=[]

# Per-state results are collected in lists and joined once after the loop.
# The leading empty array keeps the result an empty array when there are no states.
pred_chunks_2=[np.array([])]
date_chunks_2=[np.array([])]
for i in states_2:
  state_name=state_dict_rev[i]

  # Filter each frame once per state and derive features/target from the filtered rows
  train_rows_2=train_df_2[train_df_2['state']==i]
  valid_rows_2=valid_df_2[valid_df_2['state']==i]
  y2=train_rows_2['case_count']
  yv2=valid_rows_2['case_count']
  X2=train_rows_2.drop(columns=['case_count'])
  Xv2=valid_rows_2.drop(columns=['case_count'])

  ada_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=22), n_estimators=300, loss='exponential')
  ada_2.fit(X2, y2)
  yp2=ada_2.predict(Xv2)
  metric=mean_absolute_error(yv2, yp2)
  Ada_seg_2.append(round(metric,3))
  states_seg_2.append(state_name)
  print("Metric for State", state_name,"=",round(metric,3))

  # Test predictions, paired with the dates taken from the raw test copy
  # (pred_test_df_2 still holds state names, hence the lookup by name)
  ada_preds2 = ada_2.predict(test_df_2[test_df_2['state']==i])
  dates2 = pred_test_df_2[pred_test_df_2['state']==state_name]['application_date']
  dates2=dates2.astype('str').to_numpy()

  pred_chunks_2.append(ada_preds2)
  date_chunks_2.append(dates2)

final_preds_2=np.concatenate(pred_chunks_2, axis=0)
final_dates_2=np.concatenate(date_chunks_2, axis=0)

Metric for State ASSAM = 76.435
Metric for State BIHAR = 553.0
Metric for State CHHATTISGARH = 39.174
Metric for State GUJARAT = 60.957
Metric for State JHARKHAND = 81.217
Metric for State KERALA = 121.217
Metric for State KARNATAKA = 193.913
Metric for State MAHARASHTRA = 51.174
Metric for State MADHYA PRADESH = 145.913
Metric for State ORISSA = 222.739
Metric for State TAMIL NADU = 570.435
Metric for State TRIPURA = 39.087
Metric for State UTTAR PRADESH = 53.478
Metric for State WEST BENGAL = 127.522


In [47]:
# Sanity check: every segment-2 prediction has a matching date
len(final_dates_2)==len(final_preds_2)

True

In [0]:
# One row per test-set prediction, keyed by its date string, for each segment.
# NOTE(review): astype(int) discards the fractional part instead of rounding — confirm this is intended.
pd_df_1=pd.DataFrame({'application_date':final_dates_1, 'preds':final_preds_1.astype(int)})
pd_df_2=pd.DataFrame({'application_date':final_dates_2, 'preds':final_preds_2.astype(int)})

In [0]:
# Summing up predictions datewise
# groupby orders the groups by the date string; the date index is then discarded,
# leaving only the ordered per-date totals in a `preds` column.
seg_1_preds=pd_df_1.groupby(['application_date']).sum().reset_index(drop=True)
seg_2_preds=pd_df_2.groupby(['application_date']).sum().reset_index(drop=True)

In [0]:
# Predictions to submission file
# Segment-1 totals are stacked above segment-2 totals and written positionally into
# the sample submission — assumes sub_df lists its rows in that same order (TODO confirm).
sub_df_pred = pd.concat([seg_1_preds, seg_2_preds])
sub_df['case_count']=sub_df_pred['preds'].to_numpy()
submission_file=path+'My_Outputs'+'/'+'Submission_Individual_Ada_Boost.csv'
sub_df.to_csv(submission_file,index=False)