# Step 1: Imports

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile, Path
import re
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, RepeatedKFold
import torch
from sklearn.ensemble import RandomForestRegressor
import xgboost
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.linear_model import Lasso, LinearRegression, Ridge, RidgeCV


In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [119]:
class config:
 
    batch_size = 64           
    learning_rate= 0.00005  
    weight_decay = 1e-6
    architecture= " "     
    epochs= 10
    seed = 15
    
    if device == "cuda":
        num_workers = 1
        pin_memory = True
    else:
        num_workers = 0
        pin_memory = False

In [47]:
seed = config.seed
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
random_state= 15
# os.environ['PYTHONHASHSEED'] = str(seed)

In [2]:
with ZipFile('godaddy-microbusiness-density-forecasting.zip') as myzip:
      print(myzip.namelist())

['census_starter.csv', 'revealed_test.csv', 'sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
%%time
with ZipFile('godaddy-microbusiness-density-forecasting.zip') as myzip:
    data_train = myzip.open('train.csv')
    data_test = myzip.open('test.csv')
    data_census = myzip.open('census_starter.csv')
    
df_train = pd.read_csv(data_train)
df_test = pd.read_csv(data_test)
df_cen = pd.read_csv(data_census)

CPU times: total: 188 ms
Wall time: 210 ms


In [4]:
with ZipFile('godaddy-microbusiness-density-forecasting.zip') as myzip:
    data_revealed_test = myzip.open('revealed_test.csv')
df_revealed_test = pd.read_csv(data_revealed_test)    

In [5]:
df_tr = df_train.copy()
df_te = df_test.copy()
df_ce = df_cen.copy()

# Step 2: Explore the training data

In [7]:
df_train.head(3)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269


In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122265 entries, 0 to 122264
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   row_id                 122265 non-null  object 
 1   cfips                  122265 non-null  int64  
 2   county                 122265 non-null  object 
 3   state                  122265 non-null  object 
 4   first_day_of_month     122265 non-null  object 
 5   microbusiness_density  122265 non-null  float64
 6   active                 122265 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 6.5+ MB


In [12]:
print(df_train['first_day_of_month'].min())
print(df_train['first_day_of_month'].max())
print(df_test['first_day_of_month'].min())

2019-08-01
2022-10-01
2022-11-01


In [10]:
df_cen.head(3)

Unnamed: 0,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,cfips,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,76.6,78.9,80.6,82.7,85.5,1001,14.5,15.9,16.1,16.7,...,1.3,1.1,0.7,0.6,1.1,55317,58786.0,58731,57982.0,62660.0
1,74.5,78.1,81.8,85.1,87.9,1003,20.4,20.7,21.0,20.2,...,1.4,1.3,1.4,1.0,1.3,52562,55962.0,58320,61756.0,64346.0
2,57.2,60.4,60.5,64.6,64.6,1005,7.6,7.8,7.6,7.3,...,0.5,0.3,0.8,1.1,0.8,33368,34186.0,32525,34990.0,36422.0


In [11]:
df_cen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pct_bb_2017            3142 non-null   float64
 1   pct_bb_2018            3142 non-null   float64
 2   pct_bb_2019            3142 non-null   float64
 3   pct_bb_2020            3141 non-null   float64
 4   pct_bb_2021            3141 non-null   float64
 5   cfips                  3142 non-null   int64  
 6   pct_college_2017       3142 non-null   float64
 7   pct_college_2018       3142 non-null   float64
 8   pct_college_2019       3142 non-null   float64
 9   pct_college_2020       3141 non-null   float64
 10  pct_college_2021       3141 non-null   float64
 11  pct_foreign_born_2017  3142 non-null   float64
 12  pct_foreign_born_2018  3142 non-null   float64
 13  pct_foreign_born_2019  3142 non-null   float64
 14  pct_foreign_born_2020  3141 non-null   float64
 15  pct_

In [13]:
df_ce.isnull().sum().loc[(df_ce.isna().sum()!=0)].keys().tolist()
df_cen.isna().sum().loc[(df_ce.isna().sum()!=0)]

pct_bb_2020              1
pct_bb_2021              1
pct_college_2020         1
pct_college_2021         1
pct_foreign_born_2020    1
pct_foreign_born_2021    1
pct_it_workers_2018      1
pct_it_workers_2020      1
pct_it_workers_2021      1
median_hh_inc_2018       1
median_hh_inc_2020       2
median_hh_inc_2021       2
dtype: int64

In [18]:
df_ce["pct_bb_2020"] = df_ce["pct_bb_2020"].fillna(df_ce["pct_bb_2019"])
df_ce["pct_bb_2021"] = df_ce["pct_bb_2021"].fillna(df_ce["pct_bb_2019"])
df_ce["pct_college_2020"] = df_ce["pct_college_2020"].fillna(df_ce["pct_college_2019"])
df_ce["pct_college_2021"] = df_ce["pct_college_2021"].fillna(df_ce["pct_college_2019"])
df_ce["pct_foreign_born_2020"] = df_ce["pct_foreign_born_2020"].fillna(df_ce["pct_foreign_born_2019"])
df_ce["pct_foreign_born_2021"] = df_ce["pct_foreign_born_2021"].fillna(df_ce["pct_foreign_born_2019"])
df_ce["pct_it_workers_2018"] = df_ce["pct_it_workers_2018"].fillna(df_ce["pct_it_workers_2017"])
df_ce["pct_it_workers_2020"] = df_ce["pct_it_workers_2020"].fillna(df_ce["pct_it_workers_2019"])
df_ce["pct_it_workers_2021"] = df_ce["pct_it_workers_2021"].fillna(df_ce["pct_it_workers_2019"])
df_ce["median_hh_inc_2018"] = df_ce["median_hh_inc_2018"].fillna(df_ce["median_hh_inc_2017"])
df_ce["median_hh_inc_2020"] = df_ce["median_hh_inc_2020"].fillna(df_ce["median_hh_inc_2019"])
df_ce["median_hh_inc_2021"] = df_ce["median_hh_inc_2021"].fillna(df_ce["median_hh_inc_2019"])

In [15]:
df_ce.isnull().sum().loc[(df_ce.isna().sum()!=0)]

Series([], dtype: int64)

## 1. using 2-year shift betw. stat data and current factor

In [5]:
#train 2year shift между стат данными и текущим коэффициентом
df_pct_bb = df_ce[['cfips','pct_bb_2018', 'pct_bb_2019', 'pct_bb_2020', 'pct_bb_2017']]
df_pct_college = df_ce[['cfips', 'pct_college_2018', 'pct_college_2019', 'pct_college_2020', 'pct_college_2017']]
df_pct_foreign_born = df_ce[['cfips', 'pct_foreign_born_2018', 'pct_foreign_born_2019', 'pct_foreign_born_2020', 'pct_foreign_born_2017']]
df_pct_it_workers = df_ce[['cfips','pct_it_workers_2018', 'pct_it_workers_2019', 'pct_it_workers_2020', 'pct_it_workers_2017']]
df_median_hh_inc = df_ce[['cfips','median_hh_inc_2018', 'median_hh_inc_2019', 'median_hh_inc_2020', 'median_hh_inc_2017']]

In [6]:
#test 2year shift между стат данными и текущим коэффициентом
df_pct_bb_test = df_ce[['cfips', 'pct_bb_2020', 'pct_bb_2021']]
df_pct_college_test = df_ce[['cfips', 'pct_college_2020', 'pct_college_2021']]
df_pct_foreign_born_test = df_ce[['cfips', 'pct_foreign_born_2020', 'pct_foreign_born_2021']]
df_pct_it_workers_test = df_ce[['cfips', 'pct_it_workers_2020', 'pct_it_workers_2021']]
df_median_hh_inc_test = df_ce[['cfips','median_hh_inc_2020', 'median_hh_inc_2021']]

In [7]:
df_pct_bb_unpivoted = df_pct_bb.melt(id_vars=['cfips'], var_name='year', value_name='pct_bb')
df_pct_college_unpivoted = df_pct_college.melt(id_vars=['cfips'], var_name='year', value_name='pct_college')
df_pct_foreign_born_unpivoted = df_pct_foreign_born.melt(id_vars=['cfips'], var_name='year', value_name='pct_foreign_born')
df_pct_it_workers_unpivoted = df_pct_it_workers.melt(id_vars=['cfips'], var_name='year', value_name='pct_it_workers')
df_median_hh_inc_unpivoted = df_median_hh_inc.melt(id_vars=['cfips'], var_name='year', value_name='median_hh_inc')

In [8]:
#change reaserch year to +2 year(train)
df_pct_bb_unpivoted["year"]= df_pct_bb_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_pct_college_unpivoted["year"]= df_pct_college_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_pct_foreign_born_unpivoted["year"]= df_pct_foreign_born_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_pct_it_workers_unpivoted["year"]= df_pct_it_workers_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_median_hh_inc_unpivoted["year"]= df_median_hh_inc_unpivoted["year"].str.extract('(\d+)').astype('int')+2

In [9]:
df_tr['year'] = df_tr["first_day_of_month"].str.extract('(\d+)').astype('int')
df_tr.head(3)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,year
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249,2019
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198,2019
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269,2019


In [10]:
df_tr_1 = df_tr.merge(df_pct_bb_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_2 = df_tr_1.merge(df_pct_college_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_3 = df_tr_2.merge(df_pct_foreign_born_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_4 = df_tr_3.merge(df_pct_it_workers_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_5 = df_tr_4.merge(df_median_hh_inc_unpivoted, on=['cfips', 'year'], how = 'left')

In [11]:
df_tr_5.isnull().sum().loc[(df_tr_5.isna().sum()!=0)]

Series([], dtype: int64)

In [12]:
df_pct_bb_test_unpivoted = df_pct_bb_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_bb')
df_pct_college_test_unpivoted = df_pct_college_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_college')
df_pct_foreign_born_test_unpivoted = df_pct_foreign_born_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_foreign_born')
df_pct_it_workers_test_unpivoted = df_pct_it_workers_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_it_workers')
df_median_hh_inc_test_unpivoted = df_median_hh_inc_test.melt(id_vars=['cfips'], var_name='year', value_name='median_hh_inc')

In [13]:
#change reaserch year to +2 year(test)
df_pct_bb_test_unpivoted["year"]= df_pct_bb_test_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_pct_college_test_unpivoted["year"]= df_pct_college_test_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_pct_foreign_born_test_unpivoted["year"]= df_pct_foreign_born_test_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_pct_it_workers_test_unpivoted["year"]= df_pct_it_workers_test_unpivoted["year"].str.extract('(\d+)').astype('int')+2
df_median_hh_inc_test_unpivoted["year"]= df_median_hh_inc_test_unpivoted["year"].str.extract('(\d+)').astype('int')+2

In [14]:
df_te['year'] = df_te["first_day_of_month"].str.extract('(\d+)').astype('int')
df_te_1 = df_te.merge(df_pct_bb_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_2 = df_te_1.merge(df_pct_college_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_3 = df_te_2.merge(df_pct_foreign_born_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_4 = df_te_3.merge(df_pct_it_workers_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_5 = df_te_4.merge(df_median_hh_inc_test_unpivoted, on=['cfips', 'year'], how = 'left')

In [241]:
df_te_5.head(3)

Unnamed: 0,row_id,cfips,first_day_of_month,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc
0,1001_2022-11-01,1001,2022-11-01,2022,82.7,16.7,2.3,0.6,57982.0
1,1003_2022-11-01,1003,2022-11-01,2022,85.1,20.2,3.4,1.0,61756.0
2,1005_2022-11-01,1005,2022-11-01,2022,64.6,7.3,2.6,1.1,34990.0


## 2. LinearRegression for prediction current stat data

In [19]:
pct_bb = df_ce.loc[:,['pct_bb_2017','pct_bb_2018', 'pct_bb_2019', 'pct_bb_2020', 'pct_bb_2021' ]]
pct_college = df_ce.loc[:,['pct_college_2017','pct_college_2018', 'pct_college_2019', 'pct_college_2020','pct_college_2021' ]]
pct_foreign_born = df_ce.loc[:,[ 'pct_foreign_born_2017','pct_foreign_born_2018', 'pct_foreign_born_2019', 'pct_foreign_born_2020', 'pct_foreign_born_2021' ]]
pct_it_workers = df_ce.loc[:,['pct_it_workers_2017','pct_it_workers_2018', 'pct_it_workers_2019', 'pct_it_workers_2020', 'pct_it_workers_2021' ]]
median_hh_inc = df_ce.loc[:,['median_hh_inc_2017','median_hh_inc_2018', 'median_hh_inc_2019', 'median_hh_inc_2020', 'median_hh_inc_2021']]

In [20]:
model = LinearRegression()
X = pct_bb.loc[:,['pct_bb_2017','pct_bb_2018','pct_bb_2019','pct_bb_2020']]
y = pct_bb.loc[:,['pct_bb_2021']]
model.fit(X, y)
X1 = pct_bb.loc[:,['pct_bb_2018','pct_bb_2019','pct_bb_2020','pct_bb_2021']]

X.columns.tolist()
X1.columns.tolist()
dict(zip(X1.columns.tolist(),X.columns.tolist()))
X1.rename(columns = dict(zip(X1.columns.tolist(),X.columns.tolist())), inplace = True)

pct_bb['pct_bb_2022'] = np.round(model.predict(X1),1)
X2 = pct_bb.loc[:,['pct_bb_2019','pct_bb_2020','pct_bb_2021', 'pct_bb_2022']]

X2.columns.tolist()
dict(zip(X2.columns.tolist(),X.columns.tolist()))
X2.rename(columns = dict(zip(X2.columns.tolist(),X.columns.tolist())), inplace = True)

pct_bb['pct_bb_2023'] = np.round(model.predict(X2),1)

In [21]:
X = pct_college.loc[:,['pct_college_2017','pct_college_2018','pct_college_2019','pct_college_2020']]
y = pct_college.loc[:,['pct_college_2021']]
model.fit(X, y)
X1 = pct_college.loc[:,['pct_college_2018','pct_college_2019','pct_college_2020','pct_college_2021']]

X.columns.tolist()
X1.columns.tolist()
dict(zip(X1.columns.tolist(),X.columns.tolist()))
X1.rename(columns = dict(zip(X1.columns.tolist(),X.columns.tolist())), inplace = True)

pct_college['pct_college_2022'] = np.round(model.predict(X1),1)
X2 = pct_college.loc[:,['pct_college_2019','pct_college_2020','pct_college_2021', 'pct_college_2022']]

X2.columns.tolist()
dict(zip(X2.columns.tolist(),X.columns.tolist()))
X2.rename(columns = dict(zip(X2.columns.tolist(),X.columns.tolist())), inplace = True)

pct_college['pct_college_2023'] = np.round(model.predict(X2),1)

In [22]:
X = pct_foreign_born.loc[:,['pct_foreign_born_2017','pct_foreign_born_2018','pct_foreign_born_2019','pct_foreign_born_2020']]
y = pct_foreign_born.loc[:,['pct_foreign_born_2021']]
model.fit(X, y)
X1 = pct_foreign_born.loc[:,['pct_foreign_born_2018','pct_foreign_born_2019','pct_foreign_born_2020','pct_foreign_born_2021']]

X.columns.tolist()
X1.columns.tolist()
dict(zip(X1.columns.tolist(),X.columns.tolist()))
X1.rename(columns = dict(zip(X1.columns.tolist(),X.columns.tolist())), inplace = True)

pct_foreign_born['pct_foreign_born_2022'] = np.round(model.predict(X1),1)
X2 = pct_foreign_born.loc[:,['pct_foreign_born_2019','pct_foreign_born_2020','pct_foreign_born_2021', 'pct_foreign_born_2022']]

X2.columns.tolist()
dict(zip(X2.columns.tolist(),X.columns.tolist()))
X2.rename(columns = dict(zip(X2.columns.tolist(),X.columns.tolist())), inplace = True)

pct_foreign_born['pct_foreign_born_2023'] = np.round(model.predict(X2),1)

In [23]:
X = pct_it_workers.loc[:,['pct_it_workers_2017','pct_it_workers_2018','pct_it_workers_2019','pct_it_workers_2020']]
y = pct_it_workers.loc[:,['pct_it_workers_2021']]
model.fit(X, y)
X1 = pct_it_workers.loc[:,['pct_it_workers_2018','pct_it_workers_2019','pct_it_workers_2020','pct_it_workers_2021']]

X.columns.tolist()
X1.columns.tolist()
dict(zip(X1.columns.tolist(),X.columns.tolist()))
X1.rename(columns = dict(zip(X1.columns.tolist(),X.columns.tolist())), inplace = True)

pct_it_workers['pct_it_workers_2022'] = np.round(model.predict(X1),1)
X2 = pct_it_workers.loc[:,['pct_it_workers_2019','pct_it_workers_2020','pct_it_workers_2021', 'pct_it_workers_2022']]

X2.columns.tolist()
dict(zip(X2.columns.tolist(),X.columns.tolist()))
X2.rename(columns = dict(zip(X2.columns.tolist(),X.columns.tolist())), inplace = True)

pct_it_workers['pct_it_workers_2023'] = np.round(model.predict(X2),1)

In [24]:
X = median_hh_inc.loc[:,['median_hh_inc_2017','median_hh_inc_2018','median_hh_inc_2019','median_hh_inc_2020']]
y = median_hh_inc.loc[:,['median_hh_inc_2021']]
model.fit(X, y)
X1 = median_hh_inc.loc[:,['median_hh_inc_2018','median_hh_inc_2019','median_hh_inc_2020','median_hh_inc_2021']]

X.columns.tolist()
X1.columns.tolist()
dict(zip(X1.columns.tolist(),X.columns.tolist()))
X1.rename(columns = dict(zip(X1.columns.tolist(),X.columns.tolist())), inplace = True)

median_hh_inc['median_hh_inc_2022'] = np.round(model.predict(X1),1)
X2 = median_hh_inc.loc[:,['median_hh_inc_2019','median_hh_inc_2020','median_hh_inc_2021', 'median_hh_inc_2022']]

X2.columns.tolist()
dict(zip(X2.columns.tolist(),X.columns.tolist()))
X2.rename(columns = dict(zip(X2.columns.tolist(),X.columns.tolist())), inplace = True)

median_hh_inc['median_hh_inc_2023'] = np.round(model.predict(X2),1)

In [25]:
df_pct_bb =pd.concat([pct_bb.loc[:,['pct_bb_2019', 'pct_bb_2020', 'pct_bb_2021', 'pct_bb_2022' ]], df_ce[['cfips']]], axis=1)
df_pct_bb_test = pd.concat([pct_bb.loc[:,['pct_bb_2022', 'pct_bb_2023' ]] , df_ce[['cfips']]], axis=1)  
df_pct_college = pd.concat([pct_college.loc[:,['pct_college_2019','pct_college_2020', 'pct_college_2021','pct_college_2022' ]],
                             df_ce[['cfips']]], axis=1)
df_pct_college_test = pd.concat([pct_college.loc[:,['pct_college_2022', 'pct_college_2023' ]], df_ce[['cfips']]], axis=1)     
df_pct_foreign_born = pd.concat([pct_foreign_born.loc[:,['pct_foreign_born_2019','pct_foreign_born_2020', 'pct_foreign_born_2021',
                                             'pct_foreign_born_2022' ]], df_ce[['cfips']]], axis=1)
df_pct_foreign_born_test = pd.concat([pct_foreign_born.loc[:,['pct_foreign_born_2022', 'pct_foreign_born_2023' ]] ,
                                       df_ce[['cfips']]], axis=1)
df_pct_it_workers = pd.concat([pct_it_workers.loc[:,['pct_it_workers_2019','pct_it_workers_2020', 'pct_it_workers_2021',
                                         'pct_it_workers_2022' ]],df_ce[['cfips']]], axis=1)
df_pct_it_workers_test = pd.concat([pct_it_workers.loc[:,['pct_it_workers_2022', 'pct_it_workers_2023' ]] ,
                                     df_ce[['cfips']]], axis=1)
df_median_hh_inc = pd.concat([median_hh_inc.loc[:,['median_hh_inc_2019','median_hh_inc_2020', 'median_hh_inc_2021',
                                                   'median_hh_inc_2022' ]], df_ce[['cfips']]], axis=1)
df_median_hh_inc_test = pd.concat([median_hh_inc.loc[:,['median_hh_inc_2022', 'median_hh_inc_2023' ]] , df_ce[['cfips']]], axis=1)

In [26]:
df_pct_bb_unpivoted = df_pct_bb.melt(id_vars=['cfips'], var_name='year', value_name='pct_bb')
df_pct_college_unpivoted = df_pct_college.melt(id_vars=['cfips'], var_name='year', value_name='pct_college')
df_pct_foreign_born_unpivoted = df_pct_foreign_born.melt(id_vars=['cfips'], var_name='year', value_name='pct_foreign_born')
df_pct_it_workers_unpivoted = df_pct_it_workers.melt(id_vars=['cfips'], var_name='year', value_name='pct_it_workers')
df_median_hh_inc_unpivoted = df_median_hh_inc.melt(id_vars=['cfips'], var_name='year', value_name='median_hh_inc')

In [27]:
df_pct_bb_unpivoted["year"]= df_pct_bb_unpivoted["year"].str.extract('(\d+)').astype('int')
df_pct_college_unpivoted["year"]= df_pct_college_unpivoted["year"].str.extract('(\d+)').astype('int')
df_pct_foreign_born_unpivoted["year"]= df_pct_foreign_born_unpivoted["year"].str.extract('(\d+)').astype('int')
df_pct_it_workers_unpivoted["year"]= df_pct_it_workers_unpivoted["year"].str.extract('(\d+)').astype('int')
df_median_hh_inc_unpivoted["year"]= df_median_hh_inc_unpivoted["year"].str.extract('(\d+)').astype('int')

In [28]:
df_tr['year'] = pd.to_datetime(df_tr["first_day_of_month"]).dt.year

In [29]:
df_tr_1 = df_tr.merge(df_pct_bb_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_2 = df_tr_1.merge(df_pct_college_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_3 = df_tr_2.merge(df_pct_foreign_born_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_4 = df_tr_3.merge(df_pct_it_workers_unpivoted, on=['cfips', 'year'], how = 'left')
df_tr_5 = df_tr_4.merge(df_median_hh_inc_unpivoted, on=['cfips', 'year'], how = 'left')

In [30]:
df_pct_bb_test_unpivoted = df_pct_bb_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_bb')
df_pct_college_test_unpivoted = df_pct_college_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_college')
df_pct_foreign_born_test_unpivoted = df_pct_foreign_born_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_foreign_born')
df_pct_it_workers_test_unpivoted = df_pct_it_workers_test.melt(id_vars=['cfips'], var_name='year', value_name='pct_it_workers')
df_median_hh_inc_test_unpivoted = df_median_hh_inc_test.melt(id_vars=['cfips'], var_name='year', value_name='median_hh_inc')
df_pct_bb_test_unpivoted["year"]= df_pct_bb_test_unpivoted["year"].str.extract('(\d+)').astype('int')
df_pct_college_test_unpivoted["year"]= df_pct_college_test_unpivoted["year"].str.extract('(\d+)').astype('int')
df_pct_foreign_born_test_unpivoted["year"]= df_pct_foreign_born_test_unpivoted["year"].str.extract('(\d+)').astype('int')
df_pct_it_workers_test_unpivoted["year"]= df_pct_it_workers_test_unpivoted["year"].str.extract('(\d+)').astype('int')
df_median_hh_inc_test_unpivoted["year"]= df_median_hh_inc_test_unpivoted["year"].str.extract('(\d+)').astype('int')
df_te['year'] = pd.to_datetime(df_te["first_day_of_month"]).dt.year
df_te_1 = df_te.merge(df_pct_bb_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_2 = df_te_1.merge(df_pct_college_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_3 = df_te_2.merge(df_pct_foreign_born_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_4 = df_te_3.merge(df_pct_it_workers_test_unpivoted, on=['cfips', 'year'], how = 'left')
df_te_5 = df_te_4.merge(df_median_hh_inc_test_unpivoted, on=['cfips', 'year'], how = 'left')

In [31]:
def get_month_sin(month):
    theta = month * (2*np.pi / 12)
    return np.sin(theta)
def get_month_cos(month) :
    theta = month * (2*np.pi / 12)
    return np.cos(theta)

"""добавлены син-кос по месяцам, поскольку только они здесь цикличны"""

In [32]:
df0 = df_tr_5.copy()
df0test = df_te_5.copy()
df0["first_day_of_month"] = pd.to_datetime(df0["first_day_of_month"])
df0test["first_day_of_month"] = pd.to_datetime(df0test["first_day_of_month"])
df0['month'] = df0["first_day_of_month"].dt.month
df0test['month'] = df0test["first_day_of_month"].dt.month
data_train0 = df0.loc[:,['cfips','year','month','pct_bb','pct_college','pct_foreign_born','pct_it_workers','median_hh_inc']]
data_test0= df0test.loc[:,['cfips','year', 'month','pct_bb','pct_college','pct_foreign_born','pct_it_workers','median_hh_inc']]
data_train0['sin_month'] = data_train0.month.map(get_month_sin(data_train0['month']))
data_train0['cos_month'] = data_train0.month.map(get_month_cos(data_train0['month']))
data_test0['sin_month'] = data_test0.month.map(get_month_sin(data_test0['month']))
data_test0['cos_month'] = data_test0.month.map(get_month_cos(data_test0['month']))

In [37]:
data_train = data_train0.copy()
data_test = data_test0.copy()
y_train= df0['microbusiness_density']

In [38]:
data_test.duplicated().sum()

0

In [39]:
data_train.corrwith(y_train).abs().sort_values(ascending=False)

pct_college         0.481797
median_hh_inc       0.388506
pct_bb              0.344377
pct_foreign_born    0.281323
pct_it_workers      0.242714
year                0.017025
cfips               0.011767
sin_month           0.003987
month               0.002121
cos_month           0.000910
dtype: float64

In [40]:
data_train.corr().style.background_gradient(sns.light_palette('blue', as_cmap=True))

Unnamed: 0,cfips,year,month,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc,sin_month,cos_month
cfips,1.0,-0.0,-0.0,0.03863,0.047669,-0.018601,-0.023915,0.059222,0.0,-0.0
year,-0.0,1.0,-0.325776,0.262968,0.046045,-0.00137,-0.034109,0.180103,0.066297,0.293768
month,-0.0,-0.325776,1.0,-0.09347,-0.016481,0.000601,0.010756,-0.052976,0.220083,-0.775172
pct_bb,0.03863,0.262968,-0.09347,1.0,0.625913,0.286659,0.264236,0.708522,0.016072,0.085904
pct_college,0.047669,0.046045,-0.016481,0.625913,1.0,0.333935,0.350307,0.706099,0.002894,0.015114
pct_foreign_born,-0.018601,-0.00137,0.000601,0.286659,0.333935,1.0,0.200818,0.38678,-6.9e-05,-0.000571
pct_it_workers,-0.023915,-0.034109,0.010756,0.264236,0.350307,0.200818,1.0,0.29449,-0.002242,-0.00967
median_hh_inc,0.059222,0.180103,-0.052976,0.708522,0.706099,0.38678,0.29449,1.0,0.012732,0.046701
sin_month,0.0,0.066297,0.220083,0.016072,0.002894,-6.9e-05,-0.002242,0.012732,1.0,-0.040371
cos_month,-0.0,0.293768,-0.775172,0.085904,0.015114,-0.000571,-0.00967,0.046701,-0.040371,1.0


In [42]:
from statsmodels.tsa.stattools import adfuller
for name, values in data_train.items():
    st = adfuller(values[:10000])[1]
    print(f'{name}: {st}')

cfips: 0.9410602924974806
year: 0.0
month: 0.9585320860220025
pct_bb: 3.5574912153603106e-13
pct_college: 3.7198405795953476e-17
pct_foreign_born: 4.3624307029862637e-13
pct_it_workers: 1.5127871029105457e-23
median_hh_inc: 8.408018136174087e-13
sin_month: 0.0
cos_month: 0.0


# Step 3. modelling

In [36]:
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [48]:
ssc = StandardScaler()
train_ssc = ssc.fit_transform(data_train)
target = y_train

### LGBM 

In [49]:
%%time
features = data_train

RS_lgb = RandomizedSearchCV(
            estimator=lgb.LGBMRegressor(random_state=random_state) ,   #n_estimators=5000 next add      
            param_distributions = {                
              'num_leaves':[20,40,60,80,100], 
              'min_child_samples':[5,10,15],
              'max_depth':[3,  6, 8, 10, 12, 15, 20],
              'learning_rate':[0.05,0.10,0.15,0.20,0.25,0.30],
              'reg_alpha':[0.0, 0.1, 0.2 , 0.3, 0.4]
            },
            scoring = 'neg_mean_absolute_error',    
            cv = 10,                                
            n_jobs = -1,                           
            random_state=random_state ,
            return_train_score=True,
            n_iter=20,                             
            verbose =0     )

RS_lgb.fit(features, target)

CPU times: total: 6.61 s
Wall time: 1min 39s


In [58]:
print("The best_params:\n{}\n".format(RS_lgb.best_params_), 
        "The best MAE_cv score: {:.9f}".format(-RS_lgb.best_score_))

smape(target, RS_lgb.predict(features))

The best_params:
{'reg_alpha': 0.3, 'num_leaves': 80, 'min_child_samples': 15, 'max_depth': 6, 'learning_rate': 0.05}
 The best MAE_cv score: 1.771358570


30.356112352313755

### XGB 

In [51]:
%%time
features = data_train

RS_xgb = RandomizedSearchCV(
            estimator=XGBRegressor( verbosity=0, random_state=random_state) ,         
            param_distributions = {                
            'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
             'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
             'min_child_weight' : [ 1, 3, 5, 7 ],
             'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
             'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7,1 ]
            },
            scoring = 'neg_mean_absolute_error',    
            cv = 5,                                 
            n_jobs = -1,                           
            random_state=random_state ,
            return_train_score=True,
            n_iter=5,                          
            verbose =5     )

RS_xgb.fit(features, target)
print("The best_params:\n{}\n".format(RS_xgb.best_params_), 
        "The best MAE_cv score: {:.9f}".format(-RS_xgb.best_score_))

smape(target, RS_xgb.predict(features))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
The best_params:
{'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.5}
 The best MSE_cv score: 1.648596263
CPU times: total: 14.8 s
Wall time: 1min 5s


33.713199375673106

N_estimators here, MSE:

In [57]:
%%time
pipe_lgb = Pipeline([
    ('scaler', None),
    ('regressor', lgb.LGBMRegressor( reg_alpha= 0.3, num_leaves= 80, min_child_samples= 15, max_depth= 6,
                                    learning_rate= 0.05,random_state=random_state, n_estimators=5000))])
features = data_train
pipe_lgb.fit(features, target)
predict_lgb = pipe_lgb.predict(features)

scores = cross_val_score(pipe_lgb, features, target.values.ravel(), scoring= 'neg_mean_squared_error', cv=5, n_jobs=-1)

print('Test CV_score for LGBM baseline: MSE_test', f": {-np.mean(scores):.9f}")
forecast= predict_lgb
actual = target
smape(actual, forecast)

Test CV_score for LGBM baseline: MSE_test : 31.212644169
CPU times: total: 4min 43s
Wall time: 2min 41s


7.322658543385707

In [56]:
%%time
pipe_xgb = Pipeline([  
    ('scaler', None),
    ('regressor', XGBRegressor(gamma= 0.4, colsample_bytree= 0.5, min_child_weight = 3, max_depth= 4,
                                learning_rate= 0.05,random_state=random_state, n_estimators=5000,
                                verbosity = 0))])
features = data_train

pipe_xgb.fit(features, target)
predict_xgb = pipe_xgb.predict(features)

scores = cross_val_score(pipe_xgb, features, target.values.ravel(), scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

print('Test CV_score for XGB baseline: MSE_test', f": {-np.mean(scores):.9f}")
forecast= predict_xgb
actual = target
smape(actual, forecast)

Test CV_score for XGB baseline: MSE_test : 22.652678887
CPU times: total: 11min 32s
Wall time: 5min 53s


15.275507430742735

## Linear methods:

In [59]:
features = train_ssc
LR = LinearRegression()
LR.fit(features, target)
scores = cross_val_score(LR, features, target.values.ravel(), scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
print('Test CV_score for LR baseline: MAE_test', f": {-np.mean(scores):.5f}")
print('SMAPE_score for LR baseline', f":{smape(target, LR.predict(features)):.5f}")

Test CV_score for LR baseline: MAE_test : 1.72217
SMAPE_score for LR baseline :44.90011


In [60]:
%%time
poly = PolynomialFeatures(degree=2, include_bias=False)
features = train_ssc
pipe_poly_LR = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', LinearRegression())])
pipe_poly_LR.fit(features, target)

scores = cross_val_score(pipe_poly_LR, features, target.values.ravel(), scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
print('Test CV_score for poly_LR baseline: MAE_test', f": {-np.mean(scores):.9f}")
print('SMAPE_score for poly_LR baseline', f":{smape(target, pipe_poly_LR.predict(features)):.5f}")

Test CV_score for poly_LR baseline: MAE_test : 1.574352782
SMAPE_score for poly_LR baseline :37.40494
CPU times: total: 1.92 s
Wall time: 2.8 s


In [62]:
%%time
features = train_ssc
cv = RepeatedKFold(n_splits= 10 , n_repeats= 3 , random_state= random_state )

pipe_poly_RidgeCV = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', RidgeCV(alphas= (0.05, 50.0), cv=cv, scoring='neg_mean_absolute_error'))])
pipe_poly_RidgeCV.fit(features, target)

print('Best CV_score for poly_ridgeCV baseline: MAE', f": {-pipe_poly_RidgeCV.named_steps['regressor'].best_score_:.5f}")
print('SMAPE_score for poly_ridgeCV baseline', f":{smape(target, pipe_poly_RidgeCV.predict(features)):.5f}")

Best CV_score for poly_ridgeCV baseline: MAE : 1.44810
SMAPE_score for poly_ridgeCV baseline :37.40221
CPU times: total: 8.05 s
Wall time: 6.54 s


1submission

In [63]:
with ZipFile('godaddy-microbusiness-density-forecasting.zip') as myzip:
    data_sub = myzip.open('sample_submission.csv')
     
df_sub = pd.read_csv(data_sub)

In [64]:
df_sub.columns

Index(['row_id', 'microbusiness_density'], dtype='object')

In [65]:
df_sub.head(3)

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-11-01,3.817671
1,1003_2022-11-01,3.817671
2,1005_2022-11-01,3.817671


In [67]:
"""pipe_lgb"""
y_test_predict_lgb = pipe_lgb.predict(data_test)

In [70]:
test_Id = df_test['row_id']
submission = pd.DataFrame({'row_id': test_Id,'microbusiness_density': y_test_predict_lgb})
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-11-01,3.385132
1,1003_2022-11-01,8.466618
2,1005_2022-11-01,1.277283
3,1007_2022-11-01,1.279167
4,1009_2022-11-01,1.798717


In [71]:
submission.describe()

Unnamed: 0,microbusiness_density
count,25080.0
mean,4.024496
std,4.64043
min,-6.896722
25%,1.897635
50%,2.828457
75%,4.895719
max,145.443796


In [72]:
newArr = y_test_predict_lgb[y_test_predict_lgb < 0]
len(newArr)

76

In [73]:
my_array = y_test_predict_lgb
my_array[my_array < 0] = 0.000000
len(my_array)

25080

In [75]:
test_Id = df_test['row_id']
submission = pd.DataFrame({'row_id': test_Id,'microbusiness_density': my_array})
submission.to_csv("submission.csv", index=False)
submission.describe()

Unnamed: 0,microbusiness_density
count,25080.0
mean,4.028762
std,4.635234
min,0.0
25%,1.897635
50%,2.828457
75%,4.895719
max,145.443796


In [76]:
from sklearn.metrics import fbeta_score, make_scorer
score = make_scorer(smape, greater_is_better=False)

Повтор с данной оценкой даcт submission 2

In [78]:
import torch.nn as nn
from torch.utils.data.dataset import random_split
import torch.optim as optim
from torchsummary import summary
from torch.utils.data import TensorDataset, DataLoader

preparing data:

In [103]:
X_tr, X_val, y_tr, y_val = train_test_split(data_train, y_train, test_size=0.2, random_state=random_state)

In [101]:
y_train= df0['microbusiness_density']

In [104]:
X_train_ssc = ssc.fit_transform(X_tr)
X_val_ssc = ssc.transform(X_val)
X_test_ssc = ssc.transform(data_test)

In [114]:
msc = MinMaxScaler()
y_train_msc = msc.fit_transform(y_tr.to_numpy().reshape(-1, 1)) #mm for DF not pd.series [y_train]
y_val_msc = msc.transform(y_val.to_numpy().reshape(-1, 1))

In [118]:
train_features = torch.Tensor(X_train_ssc)
train_targets = torch.Tensor(y_train_msc)

val_features = torch.Tensor(X_val_ssc)
val_targets = torch.Tensor(y_val_msc)

test_features = torch.Tensor(X_test_ssc)

train_ds = TensorDataset(train_features, train_targets)
val_ds = TensorDataset(val_features, val_targets)
train_loader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=config.batch_size, shuffle=False, drop_last=True)

## GRU

Образец https://towardsdatascience.com/building-rnn-lstm-and-gru-for-time-series-using-pytorch-a46e5b094e7b

In [79]:
class GRU(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout_prob):
        super(GRU, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout_prob)   
        self.fc = nn.Linear(hidden_dim, output_dim)                                                      

    def forward(self, x):                                                                              
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().to(device)       
        out, (hn) = self.gru(x, (h0.detach()))                                                          
        out = self.fc(out[:, -1, :])                                                                   
       
        return out

In [324]:
input_dim = len(data_train.columns)
output_dim = 1
hidden_dim = 5
num_layers = 3
dropout = 0.2
model = GRU(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers, dropout_prob = dropout).to(device)
print(model)

GRU(
  (gru): GRU(10, 5, num_layers=3, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)


In [123]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)

In [259]:
def train(dataloader, model, criterion, optimizer):
    model.train()
    size = len(dataloader.dataset)
    train_loss = []

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.view([config.batch_size, -1, input_dim]).to(device), y.to(device)
        
        pred = model(X)
        loss = criterion(pred, y.unsqueeze(1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 1000 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
        train_loss.append(loss)   
       
        
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():                                                    
        for X, y in dataloader:
            X, y = X.view([config.batch_size, -1, input_dim]).to(device), y.to(device)
            pred = model(X)
            
            test_loss += criterion(pred, y.unsqueeze(1)).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
            

    test_loss /= size
    correct /= size
    print(f"val Error: \n Accuracy: {(100*correct):>0.5f}%, Avg loss: {test_loss:>8f} \n")  


In [162]:
for epoch in range(1, config.epochs + 1):
    print(f"Epoch {epoch}\n-------------------------------")
    train(train_loader, model, criterion, optimizer)
    test(val_loader, model)
print("Done!")        

loss: 0.000140  [    0/97812]
loss: 0.000123  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000140  [    0/97812]
loss: 0.000122  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000122  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000121  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000120  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000120  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000120  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000120  [64000/97812]
val Error: 
 Accuracy: 2.09381%, Avg loss: 0.000004 

loss: 0.000139  [    0/97812]
loss: 0.000120  [64000/97812]
val Error: 
 Accuracy: 2.093

prediction test:

In [221]:
test_loader = DataLoader(torch.Tensor(X_test_ssc), shuffle=False, drop_last=True)
with torch.no_grad():
    predictions = []
    for x in test_loader:
        x = x.view([x.shape[0], 1,x.shape[1]]).to(device)
        model.eval()
        yhat = model(x)
        predictions.append(msc.inverse_transform(yhat.cpu().detach().numpy()).reshape(-1))
len(predictions)        

25080

In [304]:
outputs = []
targets = []
with torch.no_grad(): 
    for X, y in val_loader:
        X, y = X.view([config.batch_size, -1, input_dim]).to(device), y.to(device)
        pred = model(X)
        outputs.append(msc.inverse_transform(pred.cpu().detach().numpy()).reshape(-1))
        targets.append(msc.inverse_transform(y.cpu().numpy()).reshape(-1))
    sMAPE = 0
    for i in range(len(outputs)):
        smape(targets[i], outputs[i])
        sMAPE +=np.mean(smape(targets[i], outputs[i]))
    print(sMAPE/len(outputs))
      

59.601616999865826
