## ASHRAE Energy Predictions

## Importation des données

In [2]:
# Mount Google Drive in the Colab runtime; the notebook later reads and writes
# its files under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.stats
import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
le = LabelEncoder()
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from sklearn.ensemble import IsolationForest
    

In [None]:
# Load the three raw training tables from the mounted Drive folder:
# meter readings, weather observations and building metadata.
train = pd.read_csv('drive/MyDrive/train.csv')
weather_train = pd.read_csv('drive/MyDrive/weather_train.csv')
building_metadata = pd.read_csv('drive/MyDrive/building_metadata.csv')


# Fusion des différentes tables de données

In [None]:
# Attach the building metadata to every meter reading.
train = train.merge(building_metadata, on='building_id', how='left')
# Rescale the electricity readings of site 0 by 0.2931
# (presumably a kBTU -> kWh unit conversion -- confirm against the data notes).
# `meter` still holds its integer code at this point (0 is relabelled
# 'electricity' only later, inside preProcecing_df), so the comparison must use
# the code: comparing against the string 'electricity' never matched any row.
train.loc[(train.site_id == 0) & (train.meter == 0), 'meter_reading'] *= 0.2931
# Attach the weather observed at the same site and timestamp.
alltrain = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
# Release the source tables; only the merged frame is used from here on.
del building_metadata, weather_train, train
gc.collect()
print(alltrain.shape)

(20216100, 16)


La base de données complète contient au total 20 216 100 individus et 16 variables.

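**Variables dont le type est mal interprété à l'import : building_id, timestamp, site_id et meter** (identifiants et catégories lus comme des nombres, date lue comme du texte) ; elles sont converties lors du pré-processing.  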


## Analyse des valeurs manquantes

In [None]:
# Share of missing values per column, largest first.
(alltrain.isna().sum()/alltrain.shape[0]).sort_values(ascending=False)

floor_count           0.826528
year_built            0.599900
cloud_coverage        0.436551
precip_depth_1_hr     0.185447
wind_direction        0.071678
sea_level_pressure    0.060925
wind_speed            0.007107
dew_temperature       0.004953
air_temperature       0.004781
square_feet           0.000000
primary_use           0.000000
site_id               0.000000
meter_reading         0.000000
timestamp             0.000000
meter                 0.000000
building_id           0.000000
dtype: float64

In [None]:
# Preview the first rows of the merged training frame.
alltrain.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


## Extraction de features et pré-processing

In [None]:
def rec_heur(x):
  """Map an hour of the day to one of three coarse periods.

  Returns 'journee' for hours 6 to 18, 'nuit' for hours 19 to 22 and 'tard'
  for hours 23 and 0 to 5. Any other value yields None. The result feeds the
  ``heureDiscredite`` feature.
  """
  periods = (
      (range(6, 19), 'journee'),
      (range(19, 23), 'nuit'),
      ((23, 0, 1, 2, 3, 4, 5), 'tard'),
  )
  for hours, label in periods:
      if x in hours:
          return label
  return None

In [None]:
def discredit_var(x):
  """Bin a construction year into one of four labelled intervals.

  The cut points are 1951, 1969 and 1993 (described by the original author as
  the quartiles of ``year_built``):

  - x <= 1951         -> 'yearB_q1'
  - 1951 < x <= 1969  -> 'yearB_q2'
  - 1969 < x <= 1993  -> 'yearB_q3'
  - x > 1993          -> 'yearB_q4'

  A value for which every comparison is false (e.g. NaN) yields None.
  """
  # Test from the most recent interval downwards so each check needs one bound.
  if x > 1993:
      return 'yearB_q4'
  if x > 1969:
      return 'yearB_q3'
  if x > 1951:
      return 'yearB_q2'
  if x <= 1951:
      return 'yearB_q1'
  return None
    

In [None]:
def preProcecing_df(df_):
    """Clean the merged meter/building/weather frame and derive model features.

    Steps, in order:

    - Parse ``timestamp`` as datetime and relabel the ``meter`` codes 0-3 as
      'electricity', 'chilledwater', 'steam' and 'hotwater'.
    - Derive ``heureDiscredite`` (period of the day, via ``rec_heur``),
      ``week_end`` (1 on Saturday/Sunday, else 0) and ``saison`` (from the
      month).
    - Fill missing values:
        * ``year_built``: median of the same ``site_id``, then the overall
          median for what is still missing; the year is then binned with
          ``discredit_var``.
        * ``floor_count``: 0.
        * ``air_temperature``, ``dew_temperature``, ``wind_direction``: median
          of the (site_id, saison, week_end, primary_use) group, then the
          column's overall median for what is still missing.
    - Replace ``square_feet`` by its natural logarithm.
    - Drop ``timestamp`` and the weather columns that are not used as features.
    - Cast the identifier and label columns to ``category``.
    - When ``meter_reading`` is present (training data), drop the rows whose
      reading is exactly 0 and replace the reading by ``log1p(reading)``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Merged frame; it is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    df = df_.copy()

    # Month -> season label. The labels become dummy-column names that other
    # cells refer to by name, so their spelling is kept as is.
    saison = {3: 'printent', 4: 'printent', 5: 'printent',
              6: 'ete', 7: 'ete', 8: 'ete',
              9: 'automne', 10: 'automne', 11: 'automne',
              1: 'hiver', 12: 'hiver', 2: 'hiver'}

    df['timestamp'] = pd.to_datetime(df['timestamp'])

    df['meter'] = pd.Categorical(df['meter']).rename_categories({0: 'electricity',
                                                                   1: 'chilledwater',
                                                                   2: 'steam',
                                                                   3: 'hotwater'})

    df['heureDiscredite'] = df['timestamp'].dt.hour.apply(rec_heur)

    # dayofweek counts Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    # (The day of the month was used here before, which flagged the 5th and
    # 6th of every month instead of Saturdays and Sundays.)
    df['week_end'] = (df['timestamp'].dt.dayofweek >= 5).astype(int)

    df['saison'] = df['timestamp'].dt.month.map(saison)

    # year_built: site median first, overall median as fallback, then binning.
    # Results are assigned back instead of using fillna(inplace=True) on a
    # column selection, which is not guaranteed to update the frame.
    site_median = df.groupby('site_id')['year_built'].transform('median')
    year_built = df['year_built'].fillna(site_median)
    year_built = year_built.fillna(year_built.median())
    df['year_built'] = year_built.apply(discredit_var)

    df['floor_count'] = df['floor_count'].fillna(0)
    df['square_feet'] = np.log(df['square_feet'])

    colonneAsNum = ['air_temperature', 'dew_temperature', 'wind_direction']
    group_keys = ['site_id', 'saison', 'week_end', 'primary_use']
    group_median = df.groupby(group_keys)[colonneAsNum].transform('median')
    for col in colonneAsNum:
        # A group with no observation at all has no median, hence the second
        # fill with the median of the whole column.
        df[col] = df[col].fillna(group_median[col]).fillna(df[col].median())

    df = df.drop(columns=['timestamp', 'precip_depth_1_hr', 'wind_speed',
                          'sea_level_pressure', 'cloud_coverage'])

    for col in ['building_id', 'site_id', 'saison', 'heureDiscredite',
                'year_built', 'primary_use']:
        df[col] = df[col].astype('category')

    if 'meter_reading' in df.columns:
        # Compute the mask on the raw readings, transform the whole column on
        # the frame we own, then filter: this avoids writing into a slice.
        nonzero = df['meter_reading'] != 0
        df['meter_reading'] = np.log1p(df['meter_reading'])
        df = df[nonzero]

    return df



## Encodeur One Hot

In [None]:
def encodeur(df):
  """One-hot encode the categorical columns of ``df``.

  Indicator columns (integer 0/1) are appended for ``primary_use``, ``saison``,
  ``heureDiscredite``, ``meter`` and ``year_built``, in that order. The five
  source columns are then removed, together with one indicator per variable
  ('Office', 'printent', 'journee', 'hotwater', 'yearB_q4'), so that each
  variable keeps n-1 indicators.

  Raises KeyError if one of the columns to remove is absent, e.g. when a
  reference level does not occur in ``df``.
  """
  categorical = ["primary_use", "saison", "heureDiscredite", "meter", "year_built"]
  indicators = [pd.get_dummies(df[name], dtype=int) for name in categorical]
  X_Encod = pd.concat([df] + indicators, axis=1)

  to_remove = ['year_built', 'yearB_q4', "saison", "printent",
               "heureDiscredite", "journee", 'meter', 'hotwater', 'Office', "primary_use"]
  return X_Encod.drop(columns=to_remove)

## Train test split

In [None]:
def trainAndTest(DF):
    """Split the frame into train and test parts along whole sites.

    The distinct ``site_id`` values are shuffled once with a fixed seed
    (``ShuffleSplit(n_splits=1, test_size=.7, random_state=0)``, i.e. roughly
    70 % of the sites are held out). Every row of a site goes to the same
    side, so no site is shared between train and test.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame containing at least ``site_id`` and ``meter_reading``. It is not
        modified.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``; the ``x_*`` frames hold every
        column except ``meter_reading`` and the ``y_*`` series hold
        ``meter_reading``.
    """
    uniqueSite = list(pd.unique(DF["site_id"]))
    rs = ShuffleSplit(n_splits=1, test_size=.7, random_state=0)
    train_index, _ = next(rs.split(uniqueSite))

    # ShuffleSplit yields positions within uniqueSite, not site ids: translate
    # them before matching rows. (Comparing site ids with the positions only
    # worked when the ids happened to be 0..n-1 in order of first appearance.)
    train_sites = [uniqueSite[i] for i in train_index]
    # Vectorised membership test instead of a Python-level loop over all rows.
    is_train = DF["site_id"].isin(train_sites)

    target = DF['meter_reading']
    features = DF.drop(columns=['meter_reading'])

    x_train, y_train = features[is_train], target[is_train]
    x_test, y_test = features[~is_train], target[~is_train]

    return x_train, y_train, x_test, y_test


# *Standardisation*

In [None]:
def minMax(DF, listColumns, scaler, fit=True):
    """Return a copy of ``DF`` whose ``listColumns`` have been rescaled.

    Parameters
    ----------
    DF : pandas.DataFrame
        Input frame; it is not modified.
    listColumns : list of str
        Columns to rescale. With an empty list the copy is returned unchanged.
    scaler : object
        Scaler exposing ``fit_transform`` and ``transform`` (for instance the
        ``MinMaxScaler`` created at the top of the notebook).
    fit : bool, default True
        When True the scaler is fitted on ``DF`` before transforming, as
        before. Pass False to reuse the parameters of a previous fit, e.g. to
        apply the scaling learned on the training rows to the test rows.

    Returns
    -------
    pandas.DataFrame
        The rescaled copy.
    """
    df = DF.copy()
    columns = list(listColumns)
    if not columns:
        return df

    # One call on all columns: the scaler passed in treats each column
    # separately, and afterwards it holds the parameters of every column rather
    # than only those of the last one, which is what makes fit=False usable.
    if fit:
        df[columns] = scaler.fit_transform(df[columns])
    else:
        df[columns] = scaler.transform(df[columns])

    return df

# Pipeline

In [None]:
# Numeric columns that get min-max scaled.
listColumns = ['wind_direction', 'dew_temperature',
               'air_temperature', 'floor_count', 'square_feet']


def my_pipe(df, trainTest=True):
  """Run preprocessing and one-hot encoding, then either split or scale.

  With ``trainTest=True`` the encoded frame is handed to ``trainAndTest`` and
  the tuple ``(X_train, Y_train, X_test, Y_test)`` is returned; no scaling is
  applied in that case.

  With ``trainTest=False`` the columns of ``listColumns`` are scaled with the
  notebook-level ``scaler`` through ``minMax`` and the frame is returned
  without its ``row_id`` column.

  NOTE(review): ``minMax`` fits the scaler on the frame it receives, so in the
  second mode the scaling parameters come from ``df`` itself.
  """
  encoded = encodeur(preProcecing_df(df))

  if not trainTest:
    scaled = minMax(encoded, listColumns, scaler)
    return scaled.drop(["row_id"], axis=1)

  X_train, Y_train, X_test, Y_test = trainAndTest(encoded)
  return X_train, Y_train, X_test, Y_test

# Retrait des valeurs aberrantes en utilisant IsolationForest

In [None]:
# Preprocess, encode and split the merged training data.
x_train, Y_train, X_test, Y_test = my_pipe(alltrain)

# Fit an isolation forest on the training features, then label every training row.
clf = IsolationForest(random_state=0)
clf.fit(x_train)
y_ab = clf.predict(x_train)

# Keep only the rows labelled 1, for the features and for the target alike.
x_train, y_train = x_train[y_ab == 1], Y_train[y_ab == 1]

In [None]:
# Learn the scaling parameters on the training rows only and apply that same
# transformation to both sets. Fitting the scaler separately on each set, as
# was done before, maps identical raw values to different scaled values in
# train and in test.
scaler.fit(x_train[listColumns])

# Work on copies: both frames come out of boolean filtering.
x_train = x_train.copy()
X_test = X_test.copy()
x_train[listColumns] = scaler.transform(x_train[listColumns])
X_test[listColumns] = scaler.transform(X_test[listColumns])

# Entraînement et prédiction

In [None]:
def modelTrain(x_train, y_train, x_test, y_test, modEl):
    """Fit ``modEl`` on the training data and score it on the test data.

    Parameters
    ----------
    x_train, y_train
        Features and target used for fitting.
    x_test, y_test
        Features and target used for scoring.
    modEl
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(rmse, modEl)``: the root of the mean squared error between
        ``y_test`` and the predictions on ``x_test``, and the fitted estimator.
    """
    modEl.fit(x_train, y_train)
    rmse = np.sqrt(mean_squared_error(y_test, modEl.predict(x_test)))
    return rmse, modEl


## GradientBoostingRegressor

In [None]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top.
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting regressor with a fixed seed, a learning rate of 0.05 and
# n_estimators set to 200.
gbr = GradientBoostingRegressor(random_state=0, 
                                    learning_rate= 0.05,
                                   n_estimators = 200)

In [None]:

# Fit the gradient boosting model and get its RMSE on the held-out rows.
scOre, gbr = modelTrain(x_train, y_train, X_test, Y_test, gbr)
# Persist the fitted model. The variable is `gbr` (`Gbr` is not defined
# anywhere), and the `with` block closes the file once the dump is done.
with open("drive/MyDrive/gbr.pickle.dat", "wb") as model_file:
    pickle.dump(gbr, model_file)
scOre

In [None]:
# Persist the fitted model. The variable is `gbr` (`Gbr` is not defined
# anywhere), and the `with` block closes the file once the dump is done.
with open("drive/MyDrive/gbr.pickle.dat", "wb") as model_file:
    pickle.dump(gbr, model_file)

In [None]:
# Persist the fitted model. The variable is `gbr` (`Gbr` is not defined
# anywhere), and the `with` block closes the file once the dump is done.
with open("drive/MyDrive/gbr.pickle.dat", "wb") as model_file:
    pickle.dump(gbr, model_file)

## HistGradientBoostingRegressor

In [None]:
# NOTE(review): the `enable_hist_gradient_boosting` import is presumably only
# required on scikit-learn releases where this estimator was still
# experimental -- confirm against the installed version.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Histogram-based gradient boosting with a fixed seed and max_depth set to 20.
hgbr = HistGradientBoostingRegressor(random_state=0,
                                     scoring='neg_root_mean_squared_error',
                                     max_depth = 20 
                                     )
# Fit on the training rows; scOre_h is the RMSE returned by modelTrain for the
# held-out rows.
scOre_h, hgbr = modelTrain(x_train, y_train, X_test, Y_test, hgbr)

# Choix du modèle

## Soumission

In [None]:
# Load the raw test tables from the mounted Drive folder.
test = pd.read_csv('drive/MyDrive/test.csv')
weather_test = pd.read_csv('drive/MyDrive/weather_test.csv')
building_metadata = pd.read_csv('drive/MyDrive/building_metadata.csv')

# Same two left joins as for the training data: building metadata first, then
# the weather observed at the same site and timestamp.
alltest = (
    test
    .merge(building_metadata, on='building_id', how='left')
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
)

# Release the source tables; only the merged frame is used from here on.
del test, weather_test, building_metadata
gc.collect()

0

In [2]:
# Preprocess, encode and scale the merged test frame (no train/test split).
X_final = my_pipe(alltest, trainTest= False) 

In [None]:
# Load the model written by the training section. The previous path,
# "pima.pickle.dat", is never produced by this notebook.
# NOTE(review): if another model is retained, save it and load it here instead.
with open("drive/MyDrive/gbr.pickle.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Predictions are on the log1p scale used for the training target.
y_pred_final = np.asarray(loaded_model.predict(X_final), dtype=float)

# Undo the 0.2931 factor applied to the site-0 electricity readings before
# training. `predict` returns a NumPy array, so a boolean mask is used (an
# array has no `.loc`). The factor applies to raw readings, so the affected
# predictions are taken back to the raw scale, divided, and returned to the
# log1p scale; dividing the log-scale value itself would be wrong.
site0_electricity = ((X_final.site_id == 0) & (X_final.electricity == 1)).to_numpy()
raw_site0 = np.maximum(np.expm1(y_pred_final[site0_electricity]), 0)
y_pred_final[site0_electricity] = np.log1p(raw_site0 / 0.2931)

Ypred_finaldf = pd.DataFrame(data=y_pred_final)
Ypred_finaldf.to_csv('mypred.csv', index=False)

In [None]:
submission = pd.read_csv('drive/MyDrive/sample_submission.csv')
# The target was transformed with log1p before training, so its inverse is
# expm1; exp would add 1 to every reading. expm1 can return a negative value
# for a negative prediction, hence the clipping at 0.
submission['meter_reading'] = np.clip(np.expm1(y_pred_final), 0, None)
submission.to_csv('drive/MyDrive/mysubmission.csv', index=False)