# Introduction

First submission :

<a href='#1'>1. Loading Data</a>

<a href='#2'>2. Combining Datasets</a>

<a href='#3'>3. Memory Limitation</a>

<a href='#4'>4. Missing Values</a>

<a href='#5'>5.EDA</a>

<a href='#6'>6. Model Training</a>

<a href='#7'>7. Model Predictions and Kaggle Submission</a>

# <a id='1'>1. Loading Data</a>

In [2]:
#import libraries
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sn
import statistics
import gc

In [3]:
#load data
train = pd.read_csv('kaggle/input/ashrae-energy-prediction/train.csv', memory_map=True)
train['timestamp'] = pd.to_datetime(train['timestamp']) #the train dataset contains a 'timestamp' column we convert to a datetime object for ease of use
test = pd.read_csv('kaggle/input/ashrae-energy-prediction/test.csv', memory_map=True)
test['timestamp'] = pd.to_datetime(test['timestamp'])
weather_train = pd.read_csv('kaggle/input/ashrae-energy-prediction/weather_train.csv', memory_map=True)
weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp']) 
weather_test = pd.read_csv('kaggle/input/ashrae-energy-prediction/weather_test.csv', memory_map=True)
weather_test['timestamp'] = pd.to_datetime(weather_test['timestamp']) 
build_meta = pd.read_csv('kaggle/input/ashrae-energy-prediction/building_metadata.csv', memory_map=True)

In [4]:
len(train), len(test)

(20216100, 41697600)

In [None]:
train = train.sample(frac = 0.009)
test = test.sample(frac = 0.009)

For a simple first model, we are going to merge the training sets to gather all covariables and make predictions on it with a linear regressor.

# <a id='2'>2. Combining Datasets</a>
We will merge everything into train and test dataframes.

We need to convert the timestamp to the right type:

In [5]:
weather_test.timestamp = pd.to_datetime(weather_test.timestamp) 
weather_train.timestamp = pd.to_datetime(weather_train.timestamp) 

In [6]:
#merge the building meta data and weather data into the train data
train_m = train.merge(build_meta, how='left', on = ['building_id'], validate='many_to_one') #merge the building meta data into the train data
test_m = test.merge(build_meta, how='left', on = ['building_id'], validate='many_to_one') #merge the building meta data into the train data
train_m = train_m.merge(weather_train, how='left', on = ['site_id', 'timestamp'], validate='many_to_one')#add weather data to each time entry for each site ID
test_m = test_m.merge(weather_test, how='left', on = ['site_id', 'timestamp'], validate='many_to_one')#add weather data to each time entry for each site ID
del build_meta, weather_train, weather_test
import gc
gc.collect() #mandatory since we don't have unlimited space (16go and the databases are quite large)
train_m.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


In [7]:
del train, test

# <a id='3'>3. Memory Limitation</a>

In [8]:
def reduce_mem_usage(df, verbose=True): #we need to drastically reduce memory usage to work with those data basis
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

train_m = reduce_mem_usage(train_m) #this code changes the variable formats so it takes less space
test_m = reduce_mem_usage(test_m) #source for this code : https://www.kaggle.com/alexandersylvester/ashrae-energy-predictions-with-lightgbm

Mem. usage decreased to 1041.10 Mb (60.3% reduction)
Mem. usage decreased to 2147.36 Mb (60.3% reduction)


# <a id='4'>4. Missing Values</a>

/!\ ne pas executer la partie 4, la partie 5 traite ce sujet de manière plus complète

In [None]:
train_m.isna().describe()

In [None]:
test_m.isna().describe()

The easy way out in this scenario is to drop the columns containing any NA. Which we're going to do for the time being. We have no missing value for the target variable so we will not be dropping any training row

In [None]:
train_m = train_m.dropna(axis='columns')

In [None]:
gc.collect(generation=0)
gc.collect(generation=1)
gc.collect() #supposedly cleans the memory but not that efficient // we need a better way

In [None]:
test_m = test_m.dropna(axis='columns')

In [None]:
train_m.describe()

In [None]:
test_m.describe()

In [None]:
train_m.groupby("primary_use").sum()

In [None]:
train_m.primary_use = train_m.primary_use.astype('category')
test_m.primary_use = test_m.primary_use.astype('category')

In [None]:
# dr = test_m.primary_use
# test_m = test_m.drop('primary_use',axis=1)

In [None]:
# dr2 = train_m.primary_use
# train_m = train_m.drop('primary_use',axis=1)
## train_m = train_m.drop('meter_reading',axis=1)

In [None]:
test_m.dtypes

In [None]:
train_m.dtypes

In [None]:
train_df = train_m
test_df = test_m

In [None]:
import math

train_df['month_datetime'] = train_df['timestamp'].dt.month.astype(np.int8)
train_df['weekofyear_datetime'] = train_df['timestamp'].dt.weekofyear.astype(np.int8)
train_df['dayofyear_datetime'] = train_df['timestamp'].dt.dayofyear.astype(np.int16)
    
train_df['hour_datetime'] = train_df['timestamp'].dt.hour.astype(np.int8)  
train_df['day_week'] = train_df['timestamp'].dt.dayofweek.astype(np.int8)
train_df['day_month_datetime'] = train_df['timestamp'].dt.day.astype(np.int8)
train_df['week_month_datetime'] = train_df['timestamp'].dt.day/7
train_df['week_month_datetime'] = train_df['week_month_datetime'].apply(lambda x: math.ceil(x)).astype(np.int8)
    
# train_df['year_built'] = train_df['year_built']-1900
train_df['square_feet'] = np.log(train_df['square_feet'])
    
test_df['month_datetime'] = test_df['timestamp'].dt.month.astype(np.int8)
test_df['weekofyear_datetime'] = test_df['timestamp'].dt.weekofyear.astype(np.int8)
test_df['dayofyear_datetime'] = test_df['timestamp'].dt.dayofyear.astype(np.int16)
    
test_df['hour_datetime'] = test_df['timestamp'].dt.hour.astype(np.int8)
test_df['day_week'] = test_df['timestamp'].dt.dayofweek.astype(np.int8)
test_df['day_month_datetime'] = test_df['timestamp'].dt.day.astype(np.int8)
test_df['week_month_datetime'] = test_df['timestamp'].dt.day/7
test_df['week_month_datetime'] = test_df['week_month_datetime'].apply(lambda x: math.ceil(x)).astype(np.int8)

# <a id='5'>5. EDA</a>

In [None]:
mat_coorr=train_m.corr()
sn.heatmap(mat_coorr)
plt.show()

Pas de corrélation particulière entre `meter_reading` et les autres variables. Il y a cependant quelques corrélations entre certaines variables comme `floor_count` et `square_feet` ou `air_temperature` et `dew_temperature`. nous les traiterons plus tard pour voir si ces corrélations ont une influence sur nos modèles. 

In [None]:
gc.collect()

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [None]:
is_na=train_m.isnull().sum()*100/train_m.shape[0]
is_na=is_na.sort_values()
ind = np.arange(len(is_na))
gc.collect()

In [None]:
from bokeh.models import HoverTool, Legend, LegendItem, Range1d, ColumnDataSource,  RangeTool,  Select, MultiSelect
from bokeh.transform import linear_cmap
from bokeh.palettes import RdYlGn11 as palette
from bokeh.models import LinearAxis
from bokeh.models import HoverTool, Legend, LegendItem, Range1d, ColumnDataSource,  RangeTool,  Select, MultiSelect

p = figure(width = 600, height = 300)
source = ColumnDataSource(data=dict(x=ind, y=is_na.values, xname = is_na.index.to_list()))
gc.collect()

In [None]:
color_mapper = linear_cmap(palette=palette, low=0, high=100, field_name='y')

hist1 = p.vbar(x='x', top = 'y' , width=1,  source=source, line_color=color_mapper, fill_color = color_mapper, name = "Histogramme",alpha=0.6)

h = HoverTool( # code pour l'outil qui affiche les informations de la courbe en passant la souris dessus
    tooltips=[
        ('Variable', '@xname'),
        ('Pourcentage de NA', '$y'),#pour obtenir le nom de la courbe sur laquelle se trouve le pointeur
    ],
    renderers = [hist1]#, j]
)
p.add_tools(h)


show(p)
gc.collect()

In [None]:
plt.bar(ind, is_na.values)
plt.xticks(ind,is_na.index,rotation=90)
plt.ylabel("Pourcentage de NA")
plt.show()
gc.collect()

Les labels `floor_count` et `year_built` sont ceux pour lesquels il y a le plus de NA. C'est dommage car ce sont des indexs qui intuitivement auront beaucoup d'influence sur l'energie consommée.

In [None]:
train_m['hour_datetime'] = train_m['timestamp'].dt.hour.astype(np.int8)  
train_m['day_week'] = train_m['timestamp'].dt.dayofweek.astype(np.int8)
gc.collect()

In [None]:
p2 = figure(width = 600, height = 300)
p3 = figure(width = 600, height = 300)#,x_axis_type = 'datetime')
# source2 = ColumnDataSource(data=dict(x=ind, y=is_na.values, xname = is_na.index.to_list()))
# p.line(df['Date'], df.vol.mul(df.vol), line_width = 3, name= 'données réelles', color = 'purple')
# p2.line(train_m.hour_datetime, train_m.meter_reading, line_width = 2, color='black', name = 'Par heure')
p2.line(train_m.groupby('hour_datetime').hour_datetime.mean(), train_m[['hour_datetime', 'meter_reading']].groupby('hour_datetime').meter_reading.mean(), line_width = 2, color='red', name = 'Meter reading par heure')

legend_list = []
for i in range(len(p2.renderers)):
        leg_tmp = LegendItem(label=p2.renderers[i].name, renderers = [p2.renderers[i]])
        legend_list.append(leg_tmp)
legend = Legend(items=legend_list, location='top_right')
p2.add_layout(legend)

show(p2)

p3.line(train_m.groupby('day_week').day_week.mean(), train_m[['day_week', 'meter_reading']].groupby('day_week').meter_reading.mean(), line_width = 2, color='red', name = 'Meter reading par jour')


legend_list = []
for i in range(len(p3.renderers)):
        leg_tmp = LegendItem(label=p3.renderers[i].name, renderers = [p3.renderers[i]])
        legend_list.append(leg_tmp)
legend = Legend(items=legend_list, location='top_right')
p3.add_layout(legend)




show(p3)
gc.collect()

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(14, 6),dpi=100)
train_m[['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes, label='Par heure').set_ylabel('Meter reading');
train_m[['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes, label='Par jour').set_ylabel('Meter reading');
axes.set_title('Metre relevé moyen par jour et par heure');
axes.legend();
plt.show()
gc.collect()

La forme de la distribution de la valeur cible dans le temps est plutôt étrange. On observe des pics irréguliers. Regardons de plus près.

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
list_pu=list(train_m['primary_use'].value_counts().index);
for i in range(len(list_pu)):
    train_m[train_m['primary_use'] == list_pu[i]][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i%8][i//8], label='Par heure').set_ylabel('Metre relevé moyen');
    train_m[train_m['primary_use'] == list_pu[i]][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i%8][i//8],  label='Par jour').set_xlabel('');
    axes[i%8][i//8].legend();
    axes[i%8][i//8].set_title(list_pu[i]);
    plt.subplots_adjust(hspace=0.45)
plt.show()
gc.collect()

Notre problème se situe dans Education.

In [None]:
fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
list_sid=list(train_m['site_id'].value_counts().index);
train_hist=train_m[train_m['primary_use'] == 'Education']
for i in range(len(list_sid)):
    try :
        train_hist[train_hist['site_id'] == i][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i%8][i//8], label='Par heure').set_ylabel('Metre relevé moyen');
        train_hist[train_hist['site_id'] == i][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i%8][i//8],  label='Par jour').set_xlabel('');
        axes[i%8][i//8].legend();
    except :
        pass
    axes[i%8][i//8].set_title(i);
    plt.subplots_adjust(hspace=0.45)
plt.show()
del train_hist;
gc.collect()

Notre problème se situe dans le 13.

In [None]:
fig, axes = plt.subplots(3,1,figsize=(14, 18), dpi=100)
train_hist=train_m[train_m['primary_use'] == 'Education']
train_hist2=train_hist[train_hist['site_id'] == 13]
list_met=list(train_hist2['meter'].value_counts().index);
for i in range(len(list_met)):
    try :
        train_hist2[train_hist2['meter'] == list_met[i]][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i], label='Par heure').set_ylabel('Metre relevé moyen');
        train_hist2[train_hist2['meter'] == list_met[i]][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i],  label='Par jour').set_xlabel('');
        axes[i].legend();
    except TypeError:
        pass
    axes[i].set_title(list_met[i]);
    plt.subplots_adjust(hspace=0.45)
plt.show()
del train_hist;
del train_hist2;
gc.collect()

Le problème se situe dans le metre de type 2.

In [None]:
fig, axes = plt.subplots(9,2,figsize=(14, 36), dpi=100)
train_hist=train_m[train_m['primary_use'] == 'Education']
train_hist2=train_hist[train_hist['site_id'] == 13]
train_hist3=train_hist2[train_hist2['meter']==2]
list_build=list(train_hist3['building_id'].value_counts().index);
for i in range(len(list_build)):
    try :
        train_hist3[train_hist3['building_id'] == list_build[i]][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i%9][i//9], label='Par heure').set_ylabel('Metre relevé moyen');
        train_hist3[train_hist3['building_id'] == list_build[i]][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i%9][i//9],  label='Par jour').set_xlabel('');
        axes[i%9][i//9].legend();
    except TypeError:
        pass
    axes[i%9][i//9].set_title(list_build[i]);
    plt.subplots_adjust(hspace=0.45)
plt.show()
del train_hist;
del train_hist2;
del train_hist3;
gc.collect()

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(14, 6),dpi=100)
train_m[['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes, label='Par heure').set_ylabel('Meter reading');
train_m[['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes, label='Par jour').set_ylabel('Meter reading');
axes.set_title('Metre relevé moyen par jour et par heure');
axes.legend();
plt.show()
gc.collect()

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(14, 6),dpi=100)
new_train=train_m[train_m['building_id'] != 1099]
new_train[['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes, label='Par heure').set_ylabel('Meter reading');
new_train[['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes, label='Par jour').set_ylabel('Meter reading');
axes.set_title('Metre relevé moyen par jour et par heure');
axes.legend();
plt.show()
gc.collect()

In [None]:
new_test=test_m
new_test
gc.collect()

In [None]:
del train_m
del test_m

Remplacement NA

In [9]:
list_pu=list(train_m['primary_use'].value_counts().index);
#new_train=train_m[train_m['building_id'] != 1099]
new_train=train_m
new_test=test_m
del train_m
del test_m
new_train = new_train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')
gc.collect()

30

In [10]:
for i in list_pu:
    print(i)
    print(new_train[new_train['primary_use']==i].isnull().sum()*100/new_train[new_train['primary_use']==i].shape[0])
    print("")

Education
building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built            50.992359
floor_count           76.311594
air_temperature        0.536784
cloud_coverage        45.082600
dew_temperature        0.565242
precip_depth_1_hr     24.570677
sea_level_pressure     7.551545
wind_direction         8.627742
wind_speed             0.825937
dtype: float64

Office
building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built            76.891130
floor_count           91.739223
air_temperature        0.382801
cloud_coverage        43.312930
dew_temperature        0.388964
precip_depth_1_hr     14.418745
sea_level_pressure     4.293337
wind_di

`Services`, `Food sales and service`, `Religious worship` n'ont aucune donnée pour la variable `floor_count`. Nous allons d'abord combler ce qu'il manque dans les autres catégories par la moyenne. 

In [11]:
new_train.isnull().sum()*100/new_train.shape[0]

building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built            61.035302
floor_count           82.350513
air_temperature        0.484900
cloud_coverage        43.606908
dew_temperature        0.502424
precip_depth_1_hr     18.867345
sea_level_pressure     6.169160
wind_direction         7.253766
wind_speed             0.723084
dtype: float64

In [12]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                 0.000000
building_id            0.000000
meter                  0.000000
timestamp              0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built            58.991597
floor_count           82.605042
air_temperature        0.532167
cloud_coverage        46.866438
dew_temperature        0.625453
precip_depth_1_hr     18.709861
sea_level_pressure     6.035901
wind_direction         7.143488
wind_speed             0.724476
dtype: float64

In [13]:
import statistics
list_pu_mean=[]
list_pu_mean_test=[]
for i in list_pu:
    temp=new_train[new_train['primary_use']==i]['floor_count'].dropna()
    if (len(temp)>0):
        list_pu_mean.append(statistics.mean(temp))
    else :
        list_pu_mean.append(0)
        
for i in list_pu:
    temp=new_test[new_test['primary_use']==i]['floor_count'].dropna()
    if (len(temp)>0):
        list_pu_mean_test.append(statistics.mean(temp))
    else :
        list_pu_mean_test.append(0)        
gc.collect()

117

In [14]:
for i in range(len(list_pu)):
    if (list_pu[i]!="Services") & (list_pu[i]!="Food sales and service") & (list_pu[i]!="Religious worship") :
        new_train.loc[new_train.primary_use==list_pu[i],['floor_count']]=new_train[new_train['primary_use']==list_pu[i]]['floor_count'].fillna(list_pu_mean[i])
        new_test.loc[new_test.primary_use==list_pu[i],['floor_count']]=new_test[new_test['primary_use']==list_pu[i]]['floor_count'].fillna(list_pu_mean_test[i])
new_train
gc.collect()

0

In [15]:
list_sid=list(new_train['site_id'].value_counts().index);

In [16]:
list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['floor_count'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['floor_count'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)
gc.collect()

0

In [17]:
for i in range(len(list_sid)):
    new_train.loc[new_train.site_id==list_sid[i],['floor_count']]=new_train[new_train['site_id']==list_sid[i]]['floor_count'].fillna(list_sid_mean[i])
    new_test.loc[new_test.site_id==list_sid[i],['floor_count']]=new_test[new_test['site_id']==list_sid[i]]['floor_count'].fillna(list_sid_mean_test[i])
new_train
gc.collect()

0

In [18]:
new_train.isnull().sum()*100/new_train.shape[0]

building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built            61.035302
floor_count            0.000000
air_temperature        0.484900
cloud_coverage        43.606908
dew_temperature        0.502424
precip_depth_1_hr     18.867345
sea_level_pressure     6.169160
wind_direction         7.253766
wind_speed             0.723084
dtype: float64

In [19]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                 0.000000
building_id            0.000000
meter                  0.000000
timestamp              0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built            58.991597
floor_count            0.000000
air_temperature        0.532167
cloud_coverage        46.866438
dew_temperature        0.625453
precip_depth_1_hr     18.709861
sea_level_pressure     6.035901
wind_direction         7.143488
wind_speed             0.724476
dtype: float64

C'est bon pour `floor_count`, au tour de `year_built`

In [20]:
list_pu_mean=[]
list_pu_mean_test=[]
for i in list_pu:
    temp=new_train[new_train['primary_use']==i]['year_built'].dropna()
    if (len(temp)>0):
        list_pu_mean.append(statistics.mean(temp))
    else :
        list_pu_mean.append(0)
        
for i in list_pu:
    temp=new_test[new_test['primary_use']==i]['year_built'].dropna()
    if (len(temp)>0):
        list_pu_mean_test.append(statistics.mean(temp))
    else :
        list_pu_mean_test.append(0)
gc.collect()

0

In [21]:
for i in range(len(list_pu)):
    if list_pu[i]!="Services":
        new_train.loc[new_train.primary_use==list_pu[i],['year_built']]=new_train[new_train['primary_use']==list_pu[i]]['year_built'].fillna(list_pu_mean[i])
        new_test.loc[new_test.primary_use==list_pu[i],['year_built']]=new_test[new_test['primary_use']==list_pu[i]]['year_built'].fillna(list_pu_mean_test[i])
new_train
gc.collect()

0

In [22]:
new_train.isnull().sum()*100/new_train.shape[0]

building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.485755
floor_count            0.000000
air_temperature        0.484900
cloud_coverage        43.606908
dew_temperature        0.502424
precip_depth_1_hr     18.867345
sea_level_pressure     6.169160
wind_direction         7.253766
wind_speed             0.723084
dtype: float64

In [23]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                 0.000000
building_id            0.000000
meter                  0.000000
timestamp              0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.462185
floor_count            0.000000
air_temperature        0.532167
cloud_coverage        46.866438
dew_temperature        0.625453
precip_depth_1_hr     18.709861
sea_level_pressure     6.035901
wind_direction         7.143488
wind_speed             0.724476
dtype: float64

In [24]:
list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['year_built'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['year_built'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)
gc.collect()

0

In [25]:
for i in range(len(list_sid)):
    new_train.loc[new_train.site_id==list_sid[i],['year_built']]=new_train[new_train['site_id']==list_sid[i]]['year_built'].fillna(list_sid_mean[i])
    new_test.loc[new_test.site_id==list_sid[i],['year_built']]=new_test[new_test['site_id']==list_sid[i]]['year_built'].fillna(list_sid_mean_test[i])
new_train
gc.collect()

0

In [26]:
new_train.isnull().sum()*100/new_train.shape[0]

building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.000000
floor_count            0.000000
air_temperature        0.484900
cloud_coverage        43.606908
dew_temperature        0.502424
precip_depth_1_hr     18.867345
sea_level_pressure     6.169160
wind_direction         7.253766
wind_speed             0.723084
dtype: float64

In [27]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                 0.000000
building_id            0.000000
meter                  0.000000
timestamp              0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.000000
floor_count            0.000000
air_temperature        0.532167
cloud_coverage        46.866438
dew_temperature        0.625453
precip_depth_1_hr     18.709861
sea_level_pressure     6.035901
wind_direction         7.143488
wind_speed             0.724476
dtype: float64

Pour les infos météo, on se base seulement selon le site. Se baser sur `primary_use` n'aurait pas de sens pour ce type de donnée.

In [28]:
for i in list_sid:
    print(i)
    print(new_train[new_train['site_id']==i].isnull().sum()*100/new_train[new_train['site_id']==i].shape[0])
    print("")

13
building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.000000
floor_count            0.000000
air_temperature        0.022790
cloud_coverage        48.947567
dew_temperature        0.022790
precip_depth_1_hr      0.147690
sea_level_pressure     1.377997
wind_direction         2.481301
wind_speed             0.022790
dtype: float64

9
building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.000000
floor_count            0.000000
air_temperature        0.102563
cloud_coverage        39.411560
dew_temperature        0.125330
precip_depth_1_hr      0.125218
sea_level_pressure     2.770252
wind_direction     

In [29]:
#cloud coverage
#site id 7, 11 n'ont aucune donnée

list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['cloud_coverage'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['cloud_coverage'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)

for i in range(len(list_sid)):
    if (list_sid[i] != 7) & (list_sid[i] != 11):
        new_train.loc[new_train.site_id==list_sid[i],['cloud_coverage']]=new_train[new_train['site_id']==list_sid[i]]['cloud_coverage'].fillna(list_sid_mean[i])
        new_test.loc[new_test.site_id==list_sid[i],['cloud_coverage']]=new_test[new_test['site_id']==list_sid[i]]['cloud_coverage'].fillna(list_sid_mean_test[i])
new_train
gc.collect()

0

In [30]:
new_train.isnull().sum()*100/new_train.shape[0] #il reste 2% de cloud_coverage inconnu

building_id            0.000000
meter                  0.000000
timestamp              0.000000
meter_reading          0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.000000
floor_count            0.000000
air_temperature        0.484900
cloud_coverage         2.446617
dew_temperature        0.502424
precip_depth_1_hr     18.867345
sea_level_pressure     6.169160
wind_direction         7.253766
wind_speed             0.723084
dtype: float64

In [31]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                 0.000000
building_id            0.000000
meter                  0.000000
timestamp              0.000000
site_id                0.000000
primary_use            0.000000
square_feet            0.000000
year_built             0.000000
floor_count            0.000000
air_temperature        0.532167
cloud_coverage         2.352941
dew_temperature        0.625453
precip_depth_1_hr     18.709861
sea_level_pressure     6.035901
wind_direction         7.143488
wind_speed             0.724476
dtype: float64

In [32]:
#precip_depth_1_hr
#12, 5, 1 n'ont pas de donnée

list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['precip_depth_1_hr'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)

for i in list_sid:
    temp=new_test[new_test['site_id']==i]['precip_depth_1_hr'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)
        
for i in range(len(list_sid)):
    if (list_sid[i] != 12) & (list_sid[i] != 5) & (list_sid[i] != 1):
        new_train.loc[new_train.site_id==list_sid[i],['precip_depth_1_hr']]=new_train[new_train['site_id']==list_sid[i]]['precip_depth_1_hr'].fillna(list_sid_mean[i])
        new_test.loc[new_test.site_id==list_sid[i],['precip_depth_1_hr']]=new_test[new_test['site_id']==list_sid[i]]['precip_depth_1_hr'].fillna(list_sid_mean_test[i])
new_train
gc.collect()

0

In [33]:
new_train.isnull().sum()*100/new_train.shape[0] #il reste 8% d'inconnu pour precip_depth_1_hr

building_id           0.000000
meter                 0.000000
timestamp             0.000000
meter_reading         0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.484900
cloud_coverage        2.446617
dew_temperature       0.502424
precip_depth_1_hr     8.309268
sea_level_pressure    6.169160
wind_direction        7.253766
wind_speed            0.723084
dtype: float64

In [34]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                0.000000
building_id           0.000000
meter                 0.000000
timestamp             0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.532167
cloud_coverage        2.352941
dew_temperature       0.625453
precip_depth_1_hr     7.899160
sea_level_pressure    6.035901
wind_direction        7.143488
wind_speed            0.724476
dtype: float64

In [35]:
#sea_lever_pressure
#le site 5 n'a aucune donnée

list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['sea_level_pressure'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['sea_level_pressure'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)

for i in range(len(list_sid)):
    if list_sid[i] != 5:
        new_train.loc[new_train.site_id==list_sid[i],['sea_level_pressure']]=new_train[new_train['site_id']==list_sid[i]]['sea_level_pressure'].fillna(list_sid_mean[i])
        new_test.loc[new_test.site_id==list_sid[i],['sea_level_pressure']]=new_test[new_test['site_id']==list_sid[i]]['sea_level_pressure'].fillna(list_sid_mean_test[i])
new_train
gc.collect()

0

In [36]:
new_train.isnull().sum()*100/new_train.shape[0] #il reste 3.8% d'inconnu pour sea_level_pressure

building_id           0.000000
meter                 0.000000
timestamp             0.000000
meter_reading         0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.484900
cloud_coverage        2.446617
dew_temperature       0.502424
precip_depth_1_hr     8.309268
sea_level_pressure    3.934477
wind_direction        7.253766
wind_speed            0.723084
dtype: float64

In [37]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                0.000000
building_id           0.000000
meter                 0.000000
timestamp             0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.532167
cloud_coverage        2.352941
dew_temperature       0.625453
precip_depth_1_hr     7.899160
sea_level_pressure    3.739496
wind_direction        7.143488
wind_speed            0.724476
dtype: float64

In [38]:
#air_temperature_v2

temp_train = pd.read_csv('kaggle/input/temperature.csv')
temp_train['datetime'] = pd.to_datetime(temp_train['datetime']) #the train dataset contains a 'timestamp' column we convert to a datetime object for ease of use
temp_test = pd.read_csv('kaggle/input/filled_weather_test.csv')
temp_test.timestamp = pd.to_datetime(temp_test.timestamp, format='%Y-%m-%d %H:%M:%S')

temp_train.set_index('datetime',inplace=True)
temp_train=temp_train.subtract(273.15)
temp_train.index.names=['timestamp']
temp_test.set_index('timestamp',inplace=True)

villes=['Jacksonville','NA','Phoenix','Philadelphia','San Francisco','NA','Philadelphia','Montreal','Jacksonville','San Antonio','Las Vegas','Montreal','NA','Minneapolis','Philadelphia','Toronto']

j=0
for i in villes:
    if i!='NA':
        index=new_train[new_train['site_id']==j][['timestamp','air_temperature']].index
        temp=new_train[new_train['site_id']==j][['timestamp','air_temperature']].set_index('timestamp')
        temp[temp.air_temperature.isnull()]=temp_train[[i]]     
        temp=temp.set_index(index)
        new_train.loc[new_train.site_id==j,'air_temperature']=temp
    j+=1

for i in range(16):
    index=new_test[new_test['site_id']==i][['timestamp','air_temperature']].index
    temp=new_test[new_test['site_id']==i][['timestamp','air_temperature']].set_index('timestamp')
    temp[temp.air_temperature.isnull()]=temp_test[[str(i)]] 
    new_test.loc[new_test.site_id==i,'air_temperature']=temp.set_index(index)


In [39]:
del temp_train, temp_test, temp

gc.collect()
list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['air_temperature'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['air_temperature'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)

for i in range(len(list_sid)):
    new_train.loc[new_train.site_id==list_sid[i],['air_temperature']]=new_train[new_train['site_id']==list_sid[i]]['air_temperature'].fillna(list_sid_mean[i])
    new_test.loc[new_test.site_id==list_sid[i],['air_temperature']]=new_test[new_test['site_id']==list_sid[i]]['air_temperature'].fillna(list_sid_mean_test[i])
new_train.isnull().sum()*100/new_train.shape[0]
gc.collect()

0

In [40]:
new_train.isnull().sum()*100/new_train.shape[0]

building_id           0.000000
meter                 0.000000
timestamp             0.000000
meter_reading         0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.000000
cloud_coverage        2.446617
dew_temperature       0.502424
precip_depth_1_hr     8.309268
sea_level_pressure    3.934477
wind_direction        7.253766
wind_speed            0.723084
dtype: float64

In [41]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                0.000000
building_id           0.000000
meter                 0.000000
timestamp             0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.000000
cloud_coverage        2.352941
dew_temperature       0.625453
precip_depth_1_hr     7.899160
sea_level_pressure    3.739496
wind_direction        7.143488
wind_speed            0.724476
dtype: float64

In [42]:
#dew_temperature
list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['dew_temperature'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['dew_temperature'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)

for i in range(len(list_sid)):
    new_train.loc[new_train.site_id==list_sid[i],['dew_temperature']]=new_train[new_train['site_id']==list_sid[i]]['dew_temperature'].fillna(list_sid_mean[i])
    new_test.loc[new_test.site_id==list_sid[i],['dew_temperature']]=new_test[new_test['site_id']==list_sid[i]]['dew_temperature'].fillna(list_sid_mean_test[i])

new_train.isnull().sum()*100/new_train.shape[0]
gc.collect()

0

In [43]:
#wind_direction
list_sid_mean=[]
list_sid_mean_test=[]
for i in list_sid:
    temp=new_train[new_train['site_id']==i]['wind_direction'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)

for i in list_sid:
    temp=new_test[new_test['site_id']==i]['wind_direction'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)

for i in range(len(list_sid)):
    new_train.loc[new_train.site_id==list_sid[i],['wind_direction']]=new_train[new_train['site_id']==list_sid[i]]['wind_direction'].fillna(list_sid_mean[i])
    new_test.loc[new_test.site_id==list_sid[i],['wind_direction']]=new_test[new_test['site_id']==list_sid[i]]['wind_direction'].fillna(list_sid_mean_test[i])

new_train.isnull().sum()*100/new_train.shape[0]


building_id           0.000000
meter                 0.000000
timestamp             0.000000
meter_reading         0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.000000
cloud_coverage        2.446617
dew_temperature       0.000000
precip_depth_1_hr     8.309268
sea_level_pressure    3.934477
wind_direction        0.000000
wind_speed            0.723084
dtype: float64

In [44]:
#wind_speed
list_sid_mean=[]
list_sid_mean_test=[]

for i in list_sid:
    temp=new_train[new_train['site_id']==i]['wind_speed'].dropna()
    if (len(temp)>0):
        list_sid_mean.append(statistics.mean(temp))
    else :
        list_sid_mean.append(0)
        
for i in list_sid:
    temp=new_test[new_test['site_id']==i]['wind_speed'].dropna()
    if (len(temp)>0):
        list_sid_mean_test.append(statistics.mean(temp))
    else :
        list_sid_mean_test.append(0)

for i in range(len(list_sid)):
    new_train.loc[new_train.site_id==list_sid[i],['wind_speed']]=new_train[new_train['site_id']==list_sid[i]]['wind_speed'].fillna(list_sid_mean[i])
    new_test.loc[new_test.site_id==list_sid[i],['wind_speed']]=new_test[new_test['site_id']==list_sid[i]]['wind_speed'].fillna(list_sid_mean_test[i])

new_train.isnull().sum()*100/new_train.shape[0]
gc.collect()

0

In [45]:
new_test.isnull().sum()*100/new_test.shape[0]

row_id                0.000000
building_id           0.000000
meter                 0.000000
timestamp             0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.000000
floor_count           0.000000
air_temperature       0.000000
cloud_coverage        2.352941
dew_temperature       0.000000
precip_depth_1_hr     7.899160
sea_level_pressure    3.739496
wind_direction        0.000000
wind_speed            0.000000
dtype: float64

# <a id='6.1'>6.1 Model Training - Linear Regression</a>

In [46]:
#FE
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

new_train['date'] = new_train.timestamp.dt.date
new_train ['heure'] = new_train.timestamp.dt.hour
new_train ['mois'] = new_train.timestamp.dt.month
new_train ['joursemaine'] = new_train.timestamp.dt.dayofweek

cal = calendar()
vac = cal.holidays(start=new_train.timestamp.min(), end=new_train.timestamp.max())
new_train['EstVac'] = new_train['timestamp'].isin(vac).astype(int)

new_test['date'] = new_test.timestamp.dt.date
new_test['heure'] = new_test.timestamp.dt.hour
new_test['mois'] = new_test.timestamp.dt.month
new_test['joursemaine'] = new_test.timestamp.dt.dayofweek

cal2 = calendar()
vac2 = cal2.holidays(start=new_test.timestamp.min(), end=new_test.timestamp.max())
new_test['EstVac'] = new_test['timestamp'].isin(vac2).astype(int)
gc.collect()

45

In [47]:
new_train.primary_use = new_train.primary_use.astype('category')
new_test.primary_use = new_test.primary_use.astype('category')
gc.collect()

0

In [48]:
new_train = new_train.dropna(axis='columns')
gc.collect()

0

In [49]:
new_test=new_test.dropna(axis='columns')
gc.collect()

0

In [50]:
new_train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,dew_temperature,wind_direction,wind_speed,date,heure,mois,joursemaine,EstVac
103,105,0,2016-01-01,23.3036,1,Education,50623,1964.0,5.0,3.800781,2.400391,240.0,3.099609,2016-01-01,0,1,4,1
104,106,0,2016-01-01,0.3746,1,Education,5374,1964.0,4.0,3.800781,2.400391,240.0,3.099609,2016-01-01,0,1,4,1
105,106,3,2016-01-01,0.0,1,Education,5374,1964.0,4.0,3.800781,2.400391,240.0,3.099609,2016-01-01,0,1,4,1
106,107,0,2016-01-01,175.184006,1,Education,97532,2005.0,10.0,3.800781,2.400391,240.0,3.099609,2016-01-01,0,1,4,1
107,108,0,2016-01-01,91.265297,1,Education,81580,1913.0,5.0,3.800781,2.400391,240.0,3.099609,2016-01-01,0,1,4,1


In [51]:
%%time
new_train = reduce_mem_usage(new_train)
new_test = reduce_mem_usage(new_test)

Mem. usage decreased to 1004.32 Mb (34.6% reduction)
Mem. usage decreased to 2107.60 Mb (34.6% reduction)
CPU times: user 7.24 s, sys: 6.66 s, total: 13.9 s
Wall time: 16.1 s


In [52]:
# On isole le batiment qui consomme beaucoup plus que les autres ainsi que le site et le type de conso qui consomment beaucoup plus
new_train['Indicatrice1'] = (new_train["site_id"]==13) & (new_train["meter"]==2).astype('uint8')
new_test['Indicatrice1'] = (new_test["site_id"]==13) & (new_test["meter"]==2).astype('uint8')

In [53]:
new_train['Indicatrice2'] = (new_train['building_id'] == 1099).astype('uint8')
new_test['Indicatrice2'] = (new_test['building_id'] == 1099).astype('uint8')

gc.collect()

0

In [54]:
new_test.shape, new_train.shape

((41697600, 20), (19869886, 20))

In [55]:
new_train['square_feet']=np.log1p(new_train['square_feet']**0.5)
new_test['square_feet']=np.log1p(new_test['square_feet']**0.5)

In [56]:
new_train.date=pd.to_numeric(pd.to_datetime(new_train.date))
new_test.date=pd.to_numeric(pd.to_datetime(new_test.date))

In [57]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(new_train['primary_use'])
new_train['primary_use']=le.transform(new_train['primary_use'])

le = preprocessing.LabelEncoder()
le.fit(new_test['primary_use'])
new_test['primary_use']=le.transform(new_test['primary_use'])

In [58]:
train_df = new_train.dropna(axis='columns')
test_df=new_test.dropna(axis='columns')

In [59]:
del new_train, new_test
gc.collect()

0

In [60]:
target=np.log1p(train_df['meter_reading'])
train_df=train_df.drop('meter_reading',axis=1)

In [61]:
categorical_f=["building_id","site_id","meter","primary_use","date","heure","mois","joursemaine","EstVac","Indicatrice1","Indicatrice2"]

import category_encoders

ce=category_encoders.CountEncoder(cols=categorical_f)
ce.fit(train_df)
train_df=ce.transform(train_df)


N_train=train_df.shape[0]
for f in categorical_f:
    train_df[f]=train_df[f]/N_train

ce.fit(test_df)
test_df=ce.transform(test_df)


N_test=test_df.shape[0]
for f in categorical_f:
    test_df[f]=test_df[f]/N_test

In [62]:
train_df=reduce_mem_usage(train_df)
test_df=reduce_mem_usage(test_df)

Mem. usage decreased to 985.37 Mb (58.1% reduction)
Mem. usage decreased to 2226.89 Mb (56.2% reduction)


In [63]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [64]:
train_df=train_df.reset_index(drop=True)
target=target.reset_index(drop=True)
train_df=train_df.drop(['timestamp'],axis=1)

In [65]:
row_ids=test_df["row_id"]
test_df=test_df.drop(['timestamp','row_id'],axis=1)

In [66]:
kf=KFold(n_splits=5)
models=[]
for trai_ind, val_ind in kf.split(train_df):
    test=trai_ind
    train_f=train_df.loc[trai_ind]
    target_f=target[trai_ind]
    
    val_f=train_df.loc[val_ind]
    val_targ=target[val_ind]
    
    model=LinearRegression()
    model.fit(train_f,target_f)
    models.append(model)
    val_pred=model.predict(val_f)
    print(np.sqrt(mean_squared_error(val_targ,val_pred)))
    del train_f, target_f, val_f, val_targ
    gc.collect()  


1.8446679
1.7897623
1.8735828
1.8396672
1.7875593


In [67]:
del train_df, target
gc.collect()

0

In [68]:
result=0
for m in models:
    result+=np.expm1(m.predict(test_df))/len(models)
    del m
    gc.collect()

In [69]:
del test_df, models
gc.collect()

0

In [70]:
results_df=pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(result, 0, a_max=None)})
del row_ids, result
gc.collect()
results_df.to_csv("sub_linear_2.csv",index=False)

# <a id='6.2'>6.2 Model Training - Decision Tree</a>

In [None]:
#FE
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

new_train['date'] = new_train.timestamp.dt.date
new_train ['heure'] = new_train.timestamp.dt.hour
new_train ['mois'] = new_train.timestamp.dt.month
new_train ['joursemaine'] = new_train.timestamp.dt.dayofweek

#cal = calendar()
#vac = cal.holidays(start=new_train.timestamp.min(), end=new_train.timestamp.max())
#new_train['EstVac'] = new_train['timestamp'].isin(vac)

new_test['date'] = new_test.timestamp.dt.date
new_test['heure'] = new_test.timestamp.dt.hour
new_test['mois'] = new_test.timestamp.dt.month
new_test['joursemaine'] = new_test.timestamp.dt.dayofweek

#cal = calendar()
#vac = cal.holidays(start=new_test.timestamp.min(), end=new_test.timestamp.max())
#new_test['EstVac'] = new_test['timestamp'].isin(vac)
gc.collect()

In [None]:
new_train.primary_use = new_train.primary_use.astype('category')
new_test.primary_use = new_test.primary_use.astype('category')
gc.collect()

In [None]:
new_train = new_train.dropna(axis='columns')
gc.collect()

In [None]:
new_test=new_test.dropna(axis='columns')
gc.collect()

In [None]:
new_train = reduce_mem_usage(new_train)
new_test = reduce_mem_usage(new_test)

In [None]:
new_train['Indicatrice1'] = (new_train["site_id"]==13) & (new_train["meter"]==2).astype('uint8')
new_test['Indicatrice1'] = (new_test["site_id"]==13) & (new_test["meter"]==2).astype('uint8')

In [None]:

new_train['Indicatrice2'] = (new_train['building_id'] == 1099).astype('uint8')
new_test['Indicatrice2'] = (new_test['building_id'] == 1099).astype('uint8')

gc.collect()

In [None]:
%%time
from sklearn.model_selection import GridSearchCV #, train_test_split,TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing

In [None]:
y_train = new_train['meter_reading']

# my_tree = DecisionTreeRegressor()

# MSE = make_scorer(mean_squared_error)
gc.collect()

In [None]:

le = preprocessing.LabelEncoder()
le.fit(new_train['primary_use'])
new_train['primary_use']=le.transform(new_train['primary_use'])
gc.collect()

In [None]:
new_test['primary_use']=le.transform(new_test['primary_use'])
new_train['date']=pd.to_numeric(pd.to_datetime(new_train['date']))
new_test['date']=pd.to_numeric(pd.to_datetime(new_test['date']))
new_train
gc.collect()

In [None]:
%%time
new_train = reduce_mem_usage(new_train)
new_test = reduce_mem_usage(new_test)

In [None]:
new_train = new_train.drop('timestamp', axis = 1)

In [None]:
new_test = new_test.drop('timestamp', axis = 1)
gc.collect()

In [None]:
new_train.shape, new_test.shape

In [None]:
new_test = new_test.drop('row_id', axis = 1)
new_test

In [None]:
# new_train = pd.read_csv('X.csv', memory_map=True)
# new_test = pd.read_csv('X_test.csv', memory_map=True)

# import gc
# gc.collect()
# new_train = reduce_mem_usage(new_train)
# new_test = reduce_mem_usage(new_test)

In [None]:
# %%time
# y_train = pd.read_csv('y_train.csv', memory_map=True)

In [None]:
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from math import sqrt

In [None]:
del new_train["wind_direction"]
del new_test["wind_direction"]

In [None]:
new_train.Indicatrice1 = new_train.Indicatrice1.astype('uint8')
new_test.Indicatrice1 = new_test.Indicatrice1.astype('uint8')

In [None]:
X = new_train.drop(['meter_reading'], axis=1)

y = y_train

In [None]:
# y_train = pd.read_csv('y_train.csv', memory_map=True)

In [None]:
#X = new_train.drop([0])

In [None]:
X.shape

In [None]:
%%time
reg = tree.DecisionTreeRegressor(min_samples_leaf= 5000, min_samples_split = 5000)#, min_impurity_decrease=0.5, min_impurity_split= 0.5)
reg = reg.fit(X,y)
y_train_predict_tree=reg.predict(X)

In [None]:
print("La RMSE de l'arbre de régression de train",sqrt(mean_squared_error(y, y_train_predict_tree)))

print("La RMSLE de l'arbre de régression de train",sqrt(mean_squared_log_error(y, y_train_predict_tree)))
gc.collect()

In [None]:
a = tree.plot_tree(reg, 
#                feature_names=X.columns, 
#                class_names=X.columns, 
              filled=True, 
              rounded=True, 
              fontsize=10)


In [None]:
from sklearn.tree import export_text
r=export_text(reg)
print(r)

In [None]:
decision_paths = reg.decision_path(X)
decision_paths

In [None]:
n_nodes = reg.tree_.node_count
children_left = reg.tree_.children_left
children_right = reg.tree_.children_right
feature = reg.tree_.feature
threshold = reg.tree_.threshold

node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
while len(stack) > 0:
    # `pop` ensures each node is only visited once
    node_id, depth = stack.pop()
    node_depth[node_id] = depth

    # If the left and right child of a node is not the same we have a split
    # node
    is_split_node = children_left[node_id] != children_right[node_id]
    # If a split node, append left and right children and depth to `stack`
    # so we can loop through them
    if is_split_node:
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has {n} nodes and has "
      "the following tree structure:\n".format(n=n_nodes))
for i in range(n_nodes):
    if is_leaves[i]:
        print("{space}node={node} is a leaf node.".format(
            space=node_depth[i] * "\t", node=i))
    else:
        print("{space}node={node} is a split node: "
              "go to node {left} if X[:, {feature}] <= {threshold} "
              "else to node {right}.".format(
                  space=node_depth[i] * "\t",
                  node=i,
                  left=children_left[i],
                  feature=feature[i],
                  threshold=threshold[i],
                  right=children_right[i]))

Now that we have tuned the model parameters and have an idea of model performance. We will fit on the entire training dataset so we have as much information as possible for the final test set prediction.

# <a id='7'>7. Model Predictions and Kaggle Submission</a>

In [None]:
# %%time
# X_test.to_csv('X_test.csv', index=False)

# %%time
# X.to_csv('X.csv', index=False)

# y_train.to_csv('y_train.csv', index=False)

In [None]:
%%time
#obtention des prédictions et du fichier à évaluer 

y_test_predict_tree=reg.predict(new_test)


In [None]:
%%time
my_submission = pd.DataFrame({'row_id': new_test.index, 'meter_reading': y_test_predict_tree})

In [None]:
%%time
my_submission.to_csv('submission_tree_f7.csv', index=False)