# Short Notebook

#### Emil B. Berglund - 529222 & Louis H. H. Linnerud - 539305, Team: ML er Bingo
Public leaderboard rmsle = 0.67866

#### Table of contents:
0. Setup

1. Exploratory data analysis

2. Feature Engineering
    - Build features
        - Pure grunnkrets features
        - Income features
        - Age / Population features
        - Household features
        - Bus features
        - store based features
    - Combine features into one frame
    - inspect features 
<br>
<br>
3. Models/Predictors
    - LightGBM
    - CatBoost
    - XGBoost
    - Random Forest Regressor
    - H2O AutoML
<br>
<br>
4. Model Interpretations
    - Parameter importance
    - feature importance



___

# ___________ _0. Setup_ ___________

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import seaborn as sns
import sklearn.metrics as metrics
import sklearn.ensemble as ensemble
import optuna
import lightgbm as lgb
import catboost as cb
import xgboost as xg
import featuretools as ft
import category_encoders as ce
import shap
import h2o
from h2o.automl import H2OAutoML
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from verstack import LGBMTuner, MeanTargetEncoder, OneHotEncoder
from sklearn.neighbors import BallTree, KDTree




#from pandas_profiling import ProfileReport

In [20]:
def writeResultToFile(test_data, pred_data, nameOfFile='namelessSubmission'):
    submission = pd.DataFrame()
    submission['id'] = test_data['store_id']
    submission['predicted'] = np.asarray(pred_data)
    submission.to_csv('submissionFiles/'+ nameOfFile+'.csv', index=False)
    

In [21]:
def rmsle(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred)**0.5

In [22]:
def remove_columns(dataSet, columns):
    for column in columns:
        dataSet.drop(column, axis=1, inplace=True)


In [23]:
def remove_retailers_with_0_revenue(dataSet):
    dataSet.drop(dataSet[dataSet['revenue']==0.0].index, inplace=True)

___

# ___________ _2. Feature Engineering_ ___________
Feature engingeering organized in the following order:
- Build features
    - Pure grunnkrets features
    - Income features
    - Age / Population features
    - Household features
    - Bus features
    - store based features
- Combine features into one frame
- inspect features

### Build features

#### Import all the data sets

In [24]:
stores_train = pd.read_csv('data/stores_train.csv')
grunnkrets = pd.read_csv('data/grunnkrets_norway_stripped.csv')
gk_incomes = pd.read_csv('data/grunnkrets_income_households.csv')
gk_households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
gk_ages = pd.read_csv('data/grunnkrets_age_distribution.csv')
buss_stopps = pd.read_csv('data/busstops_norway.csv')
hierarchy = pd.read_csv('data/plaace_hierarchy.csv')

grunnkrets.drop_duplicates(subset=['grunnkrets_id'], inplace=True)
gk_incomes.drop_duplicates(subset=['grunnkrets_id'], inplace=True)
gk_households.drop_duplicates(subset=['grunnkrets_id'], inplace=True)
gk_ages.drop_duplicates(subset=['grunnkrets_id'], inplace=True)
buss_stopps.drop_duplicates(subset=['busstop_id'], inplace=True)

##### Construct the features

In [25]:

municipality_area = grunnkrets.groupby('municipality_name').area_km2.transform('sum')
grunnkrets['municipality_area'] = municipality_area
gk_incomes.rename(columns = {'all_households':'income_gk'}, inplace = True)

# income in district
gk_districts = grunnkrets[['grunnkrets_id','district_name']]
gk_incomes = pd.merge(gk_incomes, gk_districts, how='left', on='grunnkrets_id')

income_district = gk_incomes.groupby(['district_name']).income_gk.transform('mean')
gk_incomes['income_district'] = income_district
gk_incomes['income_district_log'] = np.log1p(income_district)

# Mean revenue in district
gk_revenues = pd.DataFrame(stores_train[['grunnkrets_id','revenue']])
gk_revenues.rename(columns = {'revenue':'revenue_gk'}, inplace = True)
gk_revenues = gk_revenues.groupby('grunnkrets_id').revenue_gk.mean()
gk_incomes = pd.merge(gk_incomes, gk_revenues, how='left', on='grunnkrets_id')
#gk_incomes['revenue_gk'].fillna(0,inplace=True)
gk_incomes['mean_rev_district'] = gk_incomes.groupby(['district_name']).revenue_gk.transform('mean')
gk_incomes['mean_rev_district_log'] = np.log1p(gk_incomes['mean_rev_district'])

# label income in district as low medium high
upper = gk_incomes['income_district'].quantile(0.80)
lower = gk_incomes['income_district'].quantile(0.20)
maximum = gk_incomes['income_district'].max()
gk_incomes['income_classification'] = pd.cut(gk_incomes.income_district, bins=[0,lower, upper, maximum], labels=[1,2,3]) 

# label income in gk as low medium high
upper = gk_incomes['income_gk'].quantile(0.80)
lower = gk_incomes['income_gk'].quantile(0.20)
maximum = gk_incomes['income_gk'].max()
gk_incomes['income_classification'] = pd.cut(gk_incomes.income_gk, bins=[0,lower, upper, maximum], labels=['low','medium','high'])


# Merge in selected columns
gk_area = grunnkrets[['grunnkrets_id','area_km2','municipality_area','municipality_name','district_name']]
gk_ages = pd.merge(gk_ages, gk_area, how='left', on='grunnkrets_id')

# Number of people features in grunnkrets
tot_people_in_gk = np.sum(gk_ages.iloc[:,np.arange(2,93,1)], axis=1)
gk_ages['tot_people_gk'] = tot_people_in_gk

# Number of people features in municipality
tot_people_in_municipality = gk_ages.groupby('municipality_name').tot_people_gk.transform('sum')
gk_ages['tot_people_municipality'] = tot_people_in_municipality

# Number of people features in district
tot_people_in_district = gk_ages.groupby('district_name').tot_people_gk.transform('sum')
gk_ages['tot_people_district'] = tot_people_in_district

# People density gk
gk_ages['people_density_gk'] = (gk_ages['tot_people_gk'] / gk_ages['area_km2'])
gk_ages['people_density_gk_log'] = np.log1p(gk_ages['tot_people_gk'] / gk_ages['area_km2'])

# People density municipality
gk_ages['people_density_municipality'] = (gk_ages['tot_people_municipality'] / gk_ages['municipality_area'])
gk_ages['people_density_municipality_log'] = np.log1p(gk_ages['tot_people_municipality'] / gk_ages['municipality_area'])

# People density district
district_area_km2 = gk_ages.groupby('municipality_name').area_km2.transform('sum')
gk_ages['district_area_km2'] = district_area_km2

gk_ages['people_density_district'] = (gk_ages['tot_people_district'] / gk_ages['district_area_km2'])
gk_ages['people_density_district_log'] = np.log1p(gk_ages['people_density_district'])

# City name
gk_ages['city'] = "none"
col_idx = gk_ages.columns.get_loc('city')
for mun in gk_ages['municipality_name'].unique():
        gk_ages.iloc[gk_ages[(gk_ages['municipality_name']==mun) & (gk_ages['tot_people_municipality']>100000)].index,[col_idx]] = mun

gk_municipalities = grunnkrets[['grunnkrets_id','municipality_name']]

# Number of house holds grunnkrets level
gk_households['nb_households_gk']  = np.sum(gk_households.iloc[:,np.arange(2,10,1)], axis=1)
gk_households['nb_households_gk_log']  = np.log1p(gk_households['nb_households_gk'])

# Number of house holds municipality level
gk_households = pd.merge(gk_households, gk_municipalities, how='left', on='grunnkrets_id')
nb_housholds_municipality = gk_households.groupby('municipality_name').nb_households_gk.transform('sum')
gk_households['nb_households_municipality'] = nb_housholds_municipality
gk_households['nb_households_municipality_log'] = np.log1p(nb_housholds_municipality)




In [26]:
# Helper function for dividing up the plaace hierarchy into separate categories
def add_hierarchy(stores):
    chosen_hiercs = hierarchy[['plaace_hierarchy_id','lv1_desc','lv2_desc','lv3_desc']]
    stores_with_hierarchy = pd.merge(stores,chosen_hiercs, how='inner', on='plaace_hierarchy_id')
    stores['lv1_desc'] = stores_with_hierarchy['lv1_desc']
    stores['lv2_desc'] = stores_with_hierarchy['lv2_desc']
    stores['lv3_desc'] = stores_with_hierarchy['lv3_desc']

# Helper function for adding lat and lon to busses
def add_latlong(buss):
    buss['lat'] = 0.0
    buss['lon'] = 0.0
    lonList = []
    latList = [] 
    for index, row in buss.iterrows():
        lon = row['geometry']
        lon = lon[6:]
        lon = lon.replace(')','')
        
        lat = lon.split()[1]
        lon = lon.split()[0]
        lon = float(lon)
        lat = float(lat) 

        lonList.append(lon)
        latList.append(lat)
        
    buss['lon'] = np.array(lonList)
    buss['lat'] = np.array(latList)

# Add lat lon to bus_stops
add_latlong(buss_stopps) 

# Add bus features to data frame
def add_buss_feats(stores,r1=0.3,r2=5,r3=10):
    k_neighbors =1200
    minutes = 60 #minutes
    nautic = 1.852 #km
    
    buss_stopps[["lat_rad", "lon_rad"]] = np.deg2rad(buss_stopps[["lat", "lon"]])
    stores[["lat_rad", "lon_rad"]] = np.deg2rad(stores[["lat", "lon"]])
    
    # All bus stops
    tree = BallTree(buss_stopps[["lat_rad", "lon_rad"]].values, metric='haversine')

    distances, indices = tree.query(stores[["lat_rad", "lon_rad"]], k = k_neighbors)
    distances = np.rad2deg(distances) # convert back to radians
    distances_km = distances*minutes*nautic
    nb_stops_r1 = np.count_nonzero(distances_km < r1, axis=1)
    nb_stops_r2 = np.count_nonzero(distances_km < r2, axis=1)
    nb_stops_r3 = np.count_nonzero(distances_km < r3, axis=1)
    dist_closest_bus = np.min(distances_km,axis=1)
    
    stores['dist_closest_bus'] = dist_closest_bus
    stores['dist_closest_bus_transformed'] = np.log1p(dist_closest_bus*10000)
    stores['nb_stops_r1'] = nb_stops_r1
    stores['nb_stops_r2'] = nb_stops_r2
    stores['nb_stops_r3'] = nb_stops_r3
    
    # Important bus stops
    important_buss_stops = buss_stopps[buss_stopps['importance_level'] == 'Nasjonalt knutepunkt']
    
    tree = BallTree(important_buss_stops[["lat_rad", "lon_rad"]].values, metric='haversine')

    distances, indices = tree.query(stores[["lat_rad", "lon_rad"]], k = 2) #2 because we eperienced some unstability with 1
    distances = np.rad2deg(distances) # convert back to radians
    distances_km = distances*minutes*nautic
    dist_closest_important_stop = np.min(distances_km,axis=1)
    
    stores['dist_closest_important_stop'] = dist_closest_important_stop
    stores[stores['dist_closest_important_stop'] < 0.00001] = np.max(dist_closest_important_stop,axis=0)#.replace(to_replace = 0,  value= np.max(dist_closest_important_stop), inplace=True)
    
    stores['dist_closest_important_stop_transformed'] = np.log1p(dist_closest_important_stop)


Store set

In [27]:
# Add mean rev within each chain
stores_train['chain_name'].fillna('0', inplace=True)

#trying to avoid overfitting small chains
for chain in stores_train['chain_name'].unique():
    if stores_train[stores_train['chain_name']==chain].shape[0] < 3:
        stores_train[stores_train['chain_name']==chain].chain_name = 'smallChain'

chain_mean_rev =  stores_train.groupby(['chain_name']).revenue.transform('mean')
stores_train['mean_chain_rev'] = chain_mean_rev

chain_rev_df = stores_train[['chain_name','mean_chain_rev']]
chain_rev_df.drop_duplicates(inplace=True)

In [28]:

chain_mean_rev =  stores_train.groupby(['chain_name']).revenue.transform('mean')
stores_train['mean_chain_rev'] = chain_mean_rev

chain_rev_df = stores_train[['chain_name','mean_chain_rev']]
chain_rev_df.drop_duplicates(inplace=True)



In [29]:
def self_aggregate_columns (stores):
    # Load in all datasets containg stores to make the most accurate features
    stores_ex = pd.read_csv('data/stores_extra.csv')
    stores_tr = pd.read_csv('data/stores_train.csv')
    stores_te = pd.read_csv('data/stores_test.csv')
    all_stores = stores_ex.copy()
    all_stores = all_stores.append(stores_tr)
    all_stores = all_stores.append(stores_te)
    
    minutes=60
    nautic=1.852 #km  
    #______________________________________________________________________________________________________________________________________________________
    
    # has chain
    stores['chain_name'] = stores['chain_name'].fillna('0')
    stores['has_chain'] = stores['chain_name'].apply(lambda x: 1 if x !='0' else 0)
    #______________________________________________________________________________________________________________________________________________________
    # Number of similar stores in municipality
    gk_municipalities = grunnkrets[['grunnkrets_id','municipality_name']]
    stores = pd.merge(stores, gk_municipalities, how='left', on='grunnkrets_id')
    
    nb_similar_stores_municipality = stores.groupby(['municipality_name','plaace_hierarchy_id']).plaace_hierarchy_id.transform('count')
    stores['nb_similar_stores_municipality'] = nb_similar_stores_municipality
    #______________________________________________________________________________________________________________________________________________________
    
    # Closest competitors
    all_stores[["lat_rad", "lon_rad"]] = np.deg2rad(all_stores[["lat", "lon"]])
    stores[["lat_rad", "lon_rad"]] = np.deg2rad(stores[["lat", "lon"]])
    
    stores['dist_closest_comp_km'] = 0
    
    for store_type in stores['plaace_hierarchy_id'].unique():
        
        temp_all_df = all_stores.loc[all_stores['plaace_hierarchy_id']==store_type]
        temp_target_df = stores.loc[stores['plaace_hierarchy_id']==store_type, ['store_id','lat_rad','lon_rad']]
        
        if temp_target_df.shape[0] > 1:
            tree = BallTree(temp_all_df[["lat_rad", "lon_rad"]].values, metric='haversine')
            
            distances, indices = tree.query(temp_target_df[["lat_rad", "lon_rad"]], k = 2)
            
            #distances = np.rad2deg(distances) # convert back to radians #TODO this has been forgotten, check if it improves result here and at the above one 

            distances_km = distances*minutes*nautic
            
            temp_target_df["dist_closest_comp_km_temp"] =np.max(distances_km, axis=1)
            
            temp_target_df.drop(['lat_rad','lon_rad'],axis=1, inplace=True)
            stores = pd.merge(stores, temp_target_df, how='left', on='store_id')
            stores['dist_closest_comp_km_temp'].fillna(0, inplace=True)
            
            stores['dist_closest_comp_km'] = stores['dist_closest_comp_km'] + stores['dist_closest_comp_km_temp']
            stores.drop('dist_closest_comp_km_temp', axis=1, inplace=True)
    
    # closest competitor transformed
    stores['dist_closest_comp_km_transform'] = stores['dist_closest_comp_km']
    stores['dist_closest_comp_km_transform'].replace(to_replace = 0,  method='ffill', inplace=True)
    stores['dist_closest_comp_km_transform'] = np.log1p(stores['dist_closest_comp_km_transform'])#*10000)
    #______________________________________________________________________________________________________________________________________________________
    
    # Closest competitor in chain
    stores['dist_closest_chain_km'] = 0

    for chain in stores['chain_name'].unique():
        
        temp_all_df = all_stores.loc[all_stores['chain_name']==chain]
        temp_target_df = stores.loc[stores['chain_name']==chain, ['store_id','lat_rad','lon_rad']]
        
        if temp_target_df.shape[0] > 1 and temp_all_df.shape[0] > 1 :
            tree = BallTree(temp_all_df[["lat_rad", "lon_rad"]].values, metric='haversine')
            
            distances, indices = tree.query(temp_target_df[["lat_rad", "lon_rad"]], k = 2)

            distances_km = distances*minutes*nautic
            
            temp_target_df["dist_closest_chain_km_temp"] =np.max(distances_km, axis=1)
            
            temp_target_df.drop(['lat_rad','lon_rad'],axis=1, inplace=True)
            stores = pd.merge(stores, temp_target_df, how='left', on='store_id')
            stores['dist_closest_chain_km_temp'].fillna(0, inplace=True)
            
            stores['dist_closest_chain_km'] = stores['dist_closest_chain_km'] + stores['dist_closest_chain_km_temp']
            stores.drop('dist_closest_chain_km_temp', axis=1, inplace=True)

    # closest chain transformed
    stores['dist_closest_chain_km_transform'] = stores['dist_closest_chain_km']
    stores['dist_closest_chain_km_transform'].replace(to_replace = 0,  method='ffill', inplace=True)
    stores['dist_closest_chain_km_transform'] = np.log1p(stores['dist_closest_chain_km_transform'])#*10000)
    #______________________________________________________________________________________________________________________________________________________
    
    # Number of stores within range
    tree = BallTree(all_stores[["lat_rad", "lon_rad"]].values, metric='haversine')
    
    distances , indices = tree.query(stores[["lat_rad", "lon_rad"]], k = 1200)
    distances = np.rad2deg(distances) # convert back to radians
    distances_km = distances*minutes*nautic
    
    r_1 = 0.1 #km
    r_2 = 0.5 #km
    r_3 = 1 #km
    r_4 = 5 #km
    r_5 = 10 #km
    
    stores['nb_stores_r1']  = np.count_nonzero(distances_km < r_1, axis=1)
    stores['nb_stores_r2']  = np.count_nonzero(distances_km < r_2, axis=1)
    stores['nb_stores_r3']  = np.count_nonzero(distances_km < r_3, axis=1)
    stores['nb_stores_r4']  = np.count_nonzero(distances_km < r_4, axis=1)
    stores['nb_stores_r5']  = np.count_nonzero(distances_km < r_5, axis=1)
    #______________________________________________________________________________________________________________________________________________________
    
    # Number of similar stores within radius
    radiuses = [10,5,1,0.5,0.1]
    for r in radiuses:
        stores[f"nb_of_close_competitors_{r}"] = 0
        
        for store_type in stores['plaace_hierarchy_id'].unique():
            
            temp_all_df = all_stores.loc[all_stores['plaace_hierarchy_id']==store_type]
            temp_target_df = stores.loc[stores['plaace_hierarchy_id']==store_type, ['store_id','lat_rad','lon_rad']]
            
            if temp_target_df.shape[0] > 1:
                k_neighs = temp_target_df.shape[0]
                
                tree = BallTree(temp_all_df[["lat_rad", "lon_rad"]].values, metric='haversine')
                
                distances, indices = tree.query(temp_target_df[["lat_rad", "lon_rad"]], k = (k_neighs-1))
                distances = np.rad2deg(distances)
                distances_km = distances*minutes*nautic
                distances_km[distances_km > r] = 0
                
                temp_target_df[f"nb_of_close_competitors_temp_{r}"] =np.count_nonzero(distances_km, axis=1)
                
                temp_target_df.drop(['lat_rad','lon_rad'],axis=1, inplace=True)
                stores = pd.merge(stores, temp_target_df, how='left', on='store_id')
                stores[f"nb_of_close_competitors_temp_{r}"].fillna(0, inplace=True)
                
                stores[f"nb_of_close_competitors_{r}"] = stores[f"nb_of_close_competitors_{r}"] + stores[f"nb_of_close_competitors_temp_{r}"]
                stores.drop(f"nb_of_close_competitors_temp_{r}", axis=1, inplace=True)
    #______________________________________________________________________________________________________________________________________________________
    
    
    
    remove_columns(stores,['municipality_name'])# gets added later
    return stores



### Combine all features into a single dataframe

In [30]:
def add_selected_columns(df, include_bad_columns=False):
    # Add all columns
    df = self_aggregate_columns(df)
    add_buss_feats(df)
    add_hierarchy(df)
    gk = grunnkrets[['grunnkrets_id','municipality_name', 'district_name']]
    gk_i = gk_incomes[['grunnkrets_id','income_gk','income_classification','income_district','mean_rev_district','mean_rev_district_log','income_district_log','single_parent_with_children']]
    gk_h = gk_households[['grunnkrets_id','nb_households_gk','nb_households_municipality','nb_households_gk_log','nb_households_municipality_log']]
    gk_a = gk_ages[['grunnkrets_id','tot_people_gk','tot_people_district','tot_people_municipality','people_density_gk_log','people_density_municipality_log','people_density_gk','people_density_municipality','city','people_density_district','people_density_district_log']]
    chain_rev = chain_rev_df[['chain_name','mean_chain_rev']]
    
    concat = pd.merge(df, gk, how='left', on='grunnkrets_id')
    concat = pd.merge(concat, gk_i, how='left', on='grunnkrets_id')
    concat = pd.merge(concat, gk_h, how='left', on='grunnkrets_id')
    concat = pd.merge(concat, gk_a, how='left', on='grunnkrets_id')
    concat = pd.merge(concat, chain_rev, how='left', on='chain_name')
    #______________________________________________________________________________________________________________________________________________________
    
    # NaN handling
    concat['income_gk'].fillna(450000, inplace=True)



    concat['municipality_name'].fillna('0', inplace=True)
    concat['income_classification'].fillna('medium', inplace=True)
    concat['nb_similar_stores_municipality'].fillna(0, inplace=True)
    
    concat['city'].fillna('none', inplace=True)
    concat['people_density_district'].fillna(concat['people_density_district'].dropna().mean(), inplace=True)#the mean value
    concat['people_density_district_log'].fillna(concat['people_density_district_log'].dropna().mean(), inplace=True)        
    concat['district_name'].fillna('none', inplace=True)
    concat['mall_name'].fillna('none', inplace=True)
    concat['income_district'].fillna(466285.26671004744, inplace=True) #replace with mean value
    concat['mean_rev_district'].fillna(8.20564984134999, inplace=True) #replace with mean value
    concat['tot_people_district'].fillna(concat['tot_people_district'].dropna().mean(), inplace=True)
    concat['tot_people_municipality'].fillna(concat['tot_people_municipality'].dropna().mean(), inplace=True)        
    concat['people_density_gk_log'].fillna(concat['people_density_gk_log'].dropna().mean(), inplace=True)
    concat['people_density_municipality_log'].fillna(concat['people_density_municipality_log'].dropna().mean(), inplace=True)
    concat['people_density_gk'].fillna(concat['people_density_gk'].dropna().mean(), inplace=True)
    concat['people_density_municipality'].fillna(concat['people_density_municipality'].dropna().mean(), inplace=True)
    concat['mean_chain_rev'].fillna(4.29, inplace=True)
    concat['single_parent_with_children'].fillna(concat['single_parent_with_children'].mean(), inplace=True)
    concat['single_parent_with_children'].mean()
    concat['grunnkrets_id'] = concat['grunnkrets_id'].astype(np.int0)
    #______________________________________________________________________________________________________________________________________________________
    
    # Select columns we want to return
    if not include_bad_columns:
        remove_columns(concat, [
                                'plaace_hierarchy_id',
                                'store_id',
                                'store_name',
                                'year',
                                'address',
                                'lat',
                                'lon',
                                'lat_rad',
                                'lon_rad',
                                'has_chain',
                                'municipality_name',
                                'district_name',
                                'income_district',
                                'income_district_log',# use unlogged column instead due to weird distribution somehow
                                'single_parent_with_children',
                                'mean_rev_district_log', #make things alot worse on kaggle
                                'mean_rev_district',
                                'income_classification',
                                'nb_similar_stores_municipality', # radius > Munc
                                'nb_households_gk', # radius > grunnkrets
                                'nb_households_gk_log',
                                'nb_households_municipality', #very ugly distributed
                                'nb_households_municipality_log',
                                'tot_people_gk',
                                'tot_people_district',
                                'tot_people_municipality', # should be removed, very poor distribution
                                'people_density_gk', # this distributibution is suuuuperweird compared in the different sets..
                                'people_density_gk_log',
                                'people_density_municipality',
                                'people_density_municipality_log',
                                'dist_closest_bus_transformed',
                                'dist_closest_important_stop_transformed',
                                'dist_closest_comp_km_transform',
                                'dist_closest_chain_km_transform',
                                'people_density_district',
                                'people_density_district_log'
                                ])
    #______________________________________________________________________________________________________________________________________________________
    
    return concat
#Chosen columns are below
#'chain_name',
#'mall_name',
#'income_gk',
#'sales_channel_name',
#'lv1_desc',
#'lv2_desc',
#'lv3_desc',
#'grunnkrets_id',
#'dist_closest_bus',
#'nb_stops_r1',
#'nb_stops_r2',
#'nb_stops_r3',
#'dist_closest_important_stop',
#'nb_stores_r1',
#'nb_stores_r2',
#'nb_stores_r3',
#'nb_stores_r4',
#'nb_stores_r5',
#'mean_chain_rev',
#'dist_closest_comp_km',
#'dist_closest_chain_km',
#'city',
#Bokmerke

In [31]:
""" le = LabelEncoder()
sns.set(font_scale=0.9)
selected_cols = [
                 # change "in stores_test" below with selected_cols to only show the ones you wanna plot
                ]
for col_name in stores_test:
    
    if col_name =='income_classification':
        stores_test[col_name] = le.fit_transform(stores_test[col_name])
        stores_train[col_name] = le.fit_transform(stores_train[col_name])
            
    elif stores_test[col_name].dtypes != 'object':
        stores_test[col_name] = stores_test[col_name]
        stores_train[col_name] = stores_train[col_name]
        
    else:
        stores_test[col_name] = le.fit_transform(stores_test[col_name])
        stores_train[col_name] = le.fit_transform(stores_train[col_name])
        
    
    fig, (ax1, ax2) = plt.subplots(figsize=(15, 5), ncols=2, dpi=100)
    sns.distplot(stores_test[col_name], ax=ax1);
    ax1.set_title(f"Distribution {col_name} test set");
    sns.distplot(stores_train[col_name], ax=ax2);
    ax2.set_title(f"Distribution {col_name} train set");
    plt.show()
        
    break # comment out to show all plots """


' le = LabelEncoder()\nsns.set(font_scale=0.9)\nselected_cols = [\n                 # change "in stores_test" below with selected_cols to only show the ones you wanna plot\n                ]\nfor col_name in stores_test:\n    \n    if col_name ==\'income_classification\':\n        stores_test[col_name] = le.fit_transform(stores_test[col_name])\n        stores_train[col_name] = le.fit_transform(stores_train[col_name])\n            \n    elif stores_test[col_name].dtypes != \'object\':\n        stores_test[col_name] = stores_test[col_name]\n        stores_train[col_name] = stores_train[col_name]\n        \n    else:\n        stores_test[col_name] = le.fit_transform(stores_test[col_name])\n        stores_train[col_name] = le.fit_transform(stores_train[col_name])\n        \n    \n    fig, (ax1, ax2) = plt.subplots(figsize=(15, 5), ncols=2, dpi=100)\n    sns.distplot(stores_test[col_name], ax=ax1);\n    ax1.set_title(f"Distribution {col_name} test set");\n    sns.distplot(stores_train[col_n

___

# ___________ _3. Machine Learning Models and Predictions_ ___________


### Helper functions

In [32]:
# Encode LGBM
def convert_DType_LGBM(dFrame):
    le = LabelEncoder()
    X = pd.DataFrame()
    
    for col_name in dFrame:
        if dFrame[col_name].dtypes == 'object':
            X[col_name] = dFrame[col_name].astype('category')
            
        elif col_name == 'has_chain':
            X[col_name] = dFrame[col_name].astype('category')
        
        else:
            X[col_name] = dFrame[col_name]
    return X

In [33]:
# Encode Catboost
def convert_DType_CatBoost_2(train_frame,test_frame=None):
    X = pd.DataFrame()
    test = pd.DataFrame()
    
    for col_name in train_frame:
        if col_name == 'grunnkrets_id':
            X[col_name] = train_frame[col_name].astype(np.int0)
            X[col_name] = train_frame[col_name].astype('category')
        
        elif train_frame[col_name].dtypes == 'object':
            X[col_name] = train_frame[col_name].astype('category')
        
        else:
            X[col_name] = train_frame[col_name]
        
    for col_name in test_frame:
        if col_name == 'grunnkrets_id':
            test[col_name] = test_frame[col_name].astype(np.int0)
            test[col_name] = test_frame[col_name].astype('category')
        
        elif test_frame[col_name].dtypes == 'object':
            test[col_name] = test_frame[col_name].astype('category')
        
        else:
            test[col_name] = test_frame[col_name]
    
    return X, test



In [34]:
def get_data():
    stores_train = pd.read_csv('data/stores_train.csv')
    stores_test = pd.read_csv('data/stores_test.csv')

    # Add features
    stores_train = add_selected_columns(stores_train, include_bad_columns=False)
    #if include_submission_set:
    stores_test = add_selected_columns(stores_test, include_bad_columns=False)

    # Divide data into train and test set
    
    x_train = stores_train.drop('revenue', axis=1)
    
    y_train = stores_train['revenue']
    y_train=np.log1p(y_train) #log transform revenue
    
    return x_train, y_train, stores_test
    

### Predict test and submit

In [35]:
# Load data
#x_train, x_test, y_train, y_test, test = get_data(test_size=0.0001)
x_train, y_train, test = get_data()
print(x_train.columns)

# Convert to approperiate dtypes
LGBM_x_train_1 = convert_DType_LGBM(x_train)
LGBM_test_1 = convert_DType_LGBM(test)
categorical_features = [f for f in LGBM_x_train_1.columns if LGBM_x_train_1[f].dtype == 'category']

#CB_x_train,CB_test = convert_DType_CatBoost(x_train, y_train, test)
CB_x_train, CB_test = convert_DType_CatBoost_2(x_train, test)
categorical_features_indices = np.where((CB_x_train.dtypes == 'object') | (CB_x_train.dtypes == str) | (CB_x_train.dtypes == 'category'))[0]

# LGBM_1
# parameters found through verstack, but learning rate and n_iterations have been modified through testing of our own
LGBM_parameters = {'learning_rate': 0.005, 'num_leaves': 253, 'colsample_bytree': 0.8609727402803514, 'subsample': 0.8100744538922787, 'verbosity': -1, 'random_state': 42, 'objective': 'regression', 'metric': 'l2', 'num_threads': 6, 'reg_alpha': 1.0025757876059077e-06, 'min_sum_hessian_in_leaf': 4.139004254818685, 'reg_lambda': 0.0018151441142073164, 'n_estimators': 500}
#LGBM_parameters = {}


LGBM_model_1 = LGBMRegressor(**LGBM_parameters)
LGBM_model_1.fit(LGBM_x_train_1, y_train, categorical_feature=categorical_features)
LGBM_pred_1 = LGBM_model_1.predict(LGBM_test_1)
#LGBM_pred=np.expm1(LGBM_pred) #invert log transform

# Catboost
CB_parameters = {'depth': 12, 'iterations': 3000, 'learning_rate': 0.005}
#CB_parameters = {}
CB_model = cb.CatBoostRegressor(loss_function='RMSE', **CB_parameters, silent=True)
CB_model.fit(CB_x_train,y_train, cat_features=categorical_features_indices)
CB_pred = CB_model.predict(CB_test)
#CB_pred = np.expm1(CB_pred)

# Aggregate result
PREDICTION= np.expm1(((CB_pred*0.7)+(LGBM_pred_1*0.3)))

Index(['sales_channel_name', 'grunnkrets_id', 'chain_name', 'mall_name',
       'dist_closest_comp_km', 'dist_closest_chain_km', 'nb_stores_r1',
       'nb_stores_r2', 'nb_stores_r3', 'nb_stores_r4', 'nb_stores_r5',
       'nb_of_close_competitors_10', 'nb_of_close_competitors_5',
       'nb_of_close_competitors_1', 'nb_of_close_competitors_0.5',
       'nb_of_close_competitors_0.1', 'dist_closest_bus', 'nb_stops_r1',
       'nb_stops_r2', 'nb_stops_r3', 'dist_closest_important_stop', 'lv1_desc',
       'lv2_desc', 'lv3_desc', 'income_gk', 'city', 'mean_chain_rev'],
      dtype='object')


In [36]:
#write the predicition to file
stores_test = pd.read_csv('data/stores_test.csv')
writeResultToFile(stores_test, PREDICTION, "CB_LGBM_FINAL_2_1")

# Verify format of submission file
submissionVery = pd.read_csv('submissionFiles/CB_LGBM_FINAL_2_1.csv')
submissionVery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8577 entries, 0 to 8576
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         8577 non-null   object 
 1   predicted  8577 non-null   float64
dtypes: float64(1), object(1)
memory usage: 134.1+ KB


# End of the Short Notebook