In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import json
import geopandas as gpd
import datetime
import scipy.stats as sst
    

from matplotlib.collections import LineCollection
from scipy.stats import chi2_contingency
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from shapely.geometry import Point, Polygon
%matplotlib inline


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

In [None]:
# Load the 2015 and 2016 benchmarking exports with identical parsing options.
data2015, data2016 = [
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        "/kaggle/input/sea-building-energy-benchmarking/2015-building-energy-benchmarking.csv",
        "/kaggle/input/sea-building-energy-benchmarking/2016-building-energy-benchmarking.csv",
    )
]


In [None]:
# Load the Socrata metadata (column names and descriptions) of the 2015 export.
# The encoding is given explicitly: without it open() falls back to the locale's
# default encoding, which is not guaranteed to be UTF-8 on every machine.
with open('/kaggle/input/sea-building-energy-benchmarking/socrata_metadata_2015-building-energy-benchmarking.json', encoding='utf-8') as json_data:
    data_dict = json.load(json_data)

In [None]:
# Print every column of the metadata; columns that carry a description show it after "==>".
for column_meta in data_dict['columns']:
    if 'description' not in column_meta:
        print(column_meta['name'])
        continue
    print(column_meta['name']+" ==> "+column_meta['description'] ,'\n')

In [None]:
# Give the 2016 columns the names used in the 2015 export so both years stack into the same columns.
columns_2016_to_2015 = {
    'ZipCode': 'Zip Codes',
    'Comments': 'Comment',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)',
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
}
data2016.rename(columns=columns_2016_to_2015, inplace=True)

In [None]:
# Stack both years into one frame.
# ignore_index=True gives every row its own label. Without it each year keeps its
# own 0..n-1 labels, so labels are duplicated and any later label-based operation
# (such as DataFrame.drop(index)) hits rows of both years at once.
data = pd.concat([data2015, data2016], ignore_index=True)
len(data.columns)

In [None]:
# Display the column names of the combined 2015 + 2016 frame.
data.columns

In [None]:
# Columns present in only one of the two yearly exports.
columns_2015 = set(data2015.columns)
columns_2016 = set(data2016.columns)
diff1 = list(columns_2015 - columns_2016)   # only in 2015
diff2 = list(columns_2016 - columns_2015)   # only in 2016
diff = diff1 + diff2                        # both lists, 2015-only first

In [None]:
# 0. Helper: print the overall percentage of missing values (NaN) in the dataframe

def ratio_nan(dataframe):
    """Print the overall share of missing cells (in %) and the shape of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The summary is only printed.
    """
    n_rows, n_cols = dataframe.shape
    n_cells = n_rows * n_cols
    n_missing = int(dataframe.isna().sum().sum())
    # Scale to percent *before* rounding: round(ratio, 2) * 100 leaks float noise
    # such as 28.999999999999996. An empty frame has no cells, so report 0
    # instead of dividing by zero.
    pct_missing = float(round(100 * n_missing / n_cells)) if n_cells else 0.0
    print('* Ratio de NaN: {} %, Taille du jeu de données : {}\n'
          .format(pct_missing, dataframe.shape))



# 0.1 Helper: plot the percentage of missing values (NaN) per column

def ratio_nan_colonne(dataframe):
    """Draw a horizontal bar chart of the percentage of missing values per column.

    Columns are sorted from the most to the least incomplete. The figure is
    shown immediately and nothing is returned.
    """
    n_rows = dataframe.shape[0]
    pct_missing = (dataframe.isna().sum() * 100 / n_rows).sort_values(ascending=False)

    # One bar per column, with the percentage axis placed at the top of the figure.
    sns.set(style="dark")
    plt.figure(figsize=(17, 31))
    bars = sns.barplot(x=pct_missing.values, y=pct_missing.index)
    bars.xaxis.set_ticks_position('top')
    plt.title('1.1 Données manquantes par colonne (en %)', size=15)
    plt.show()

    
    
# 0.2 Helper: print the percentage of missing values (NaN) per row

def ratio_nan_ligne(dataframe):
    """Print, for every row, the rounded percentage of its cells that are missing.

    The result is printed as a one-column table named '%'; nothing is returned.
    """
    n_cols = dataframe.shape[1]
    pct_per_row = (dataframe.isna().sum(axis=1) * 100 / n_cols).round()
    print(' 1.2 Ratio de NaN par ligne:\n')
    print(pct_per_row.to_frame(name='%'))

In [None]:
# Missing-value overview of the combined frame: per column (chart), per row, then overall.
ratio_nan_colonne(data)
ratio_nan_ligne(data)
ratio_nan(data)

In [None]:
# Inspect the distinct category labels to spot values that need cleaning.
print(data['Neighborhood'].unique().tolist())

In [None]:
# Distinct labels of the largest property use type.
print(data['LargestPropertyUseType'].unique().tolist())

In [None]:
# Distinct labels of the primary property type.
print(data['PrimaryPropertyType'].unique().tolist())

In [None]:
# Fix inconsistent category labels.
#
# The cleaned column is assigned back instead of calling
# data[col].replace(..., inplace=True): that form mutates an intermediate Series,
# is deprecated in pandas 2.x and leaves `data` untouched under copy-on-write.

# Labels that carry a trailing newline.
data['PrimaryPropertyType'] = data['PrimaryPropertyType'].replace({
    'Restaurant\n': 'Restaurant',
    'Self-Storage Facility\n': 'Self-Storage Facility',
    'Distribution Center\n': 'Distribution Center',
})

# Neighborhood names written with different casing or a longer variant.
data['Neighborhood'] = data['Neighborhood'].replace({
    'Central': 'CENTRAL',
    'North': 'NORTH',
    'Ballard': 'BALLARD',
    'Northwest': 'NORTHWEST',
    'DELRIDGE NEIGHBORHOODS': 'DELRIDGE',
    'Delridge': 'DELRIDGE',
})

In [None]:
def doublon_outliers(dataframe):
    """Step 1 of the cleaning: remove duplicates, then rows without a positive target.

    Duplicates are removed in place on the frame passed in: first fully identical
    rows, then rows sharing an 'OSEBuildingID' (the last occurrence is kept).
    The two target filters then build and return a new frame that only keeps
    rows whose 'SiteEnergyUseWN(kBtu)' and 'GHGEmissions(MetricTonsCO2e)' are
    strictly positive (rows where either is NaN are dropped by the comparison).
    The missing-value summary is printed after each sub-step.
    """
    print('\n',' 1. Traitement des doublons et outliers')

    # 1.A. Duplicates: exact copies first, then one row per building id.
    print('\n',' 1.A. Traitement des doublons\n')
    for dedup_options in ({}, {'subset': 'OSEBuildingID', 'keep': "last"}):
        dataframe.drop_duplicates(inplace=True, **dedup_options)
        ratio_nan(dataframe)

    # 1.B. Outliers: both targets must be strictly positive.
    print('\n',' 1.B. Traitement des outliers\n')
    for target in ('SiteEnergyUseWN(kBtu)', 'GHGEmissions(MetricTonsCO2e)'):
        dataframe = dataframe[dataframe[target] > 0]
        ratio_nan(dataframe)

    return dataframe

In [None]:
def use_type(dataframe):
    """Collapse detailed property-use labels into a few broad families.

    The columns 'LargestPropertyUseType' and 'PrimaryPropertyType' are rewritten
    on the frame passed in: every label listed below is replaced by its family
    name, any other value (including NaN) is left as it is. The same frame is
    returned.
    """
    # Family name -> detailed labels that belong to it.
    labels_by_family = {
        'Store': (
            'Retail Store',
            'Supermarket/Grocery Store',
            'Repair Services (Vehicle, Shoe, Locksmith, etc)',
            'Automobile Dealership',
            'Personal Services (Health/Beauty, Dry Cleaning, etc)',
            'Strip Mall',
            'Wholesale Club/Supercenter',
            'Other - Mall',
            'Supermarket / Grocery Store',
        ),
        'Entertainment': (
            'Restaurant',
            'Other - Restaurant/Bar',
            'Food Service',
            'Worship Facility',
            'Other - Recreation',
            'Other - Entertainment/Public Assembly',
            'Performing Arts',
            'Movie Theater',
            'Museum',
            'Social/Meeting Hall',
            'Fitness Center/Health Club/Gym',
            'Lifestyle Center',
        ),
        'Residence': (
            'Hotel',
            'Mid-Rise Multifamily',
            'Low-Rise Multifamily',
            'Mixed Use Property',
            'Multifamily Housing',
            'Other - Lodging/Residential',
            'Residence Hall/Dormitory',
            'Senior Care Community',
            'Residential Care Facility',
            'High-Rise Multifamily',
            'Prison/Incarceration',
            'Residence Hall',
        ),
        'Medical': (
            'Medical Office',
            'Urgent Care/Clinic/Other Outpatient',
            'Laboratory',
            'Hospital (General Medical & Surgical)',
            'Health (General Medical & Surgical)',
            'Other/Specialty Hospital',
            'Outpatient Rehabilitation/Physical Therapy',
            'Hospital',
        ),
        'Offices': (
            'Small- and Mid-Sized Office',
            'Other - Services',
            'Bank Branch',
            'Financial Office',
            'Other - Public Services',
            'Police Station',
            'Courthouse',
            'Large Office',
            'Office',
        ),
        'Education': (
            'K-12 School',
            'SPS-District K-12',
            'Other - Education',
            'Vocational School',
            'Adult Education',
            'Pre-school/Daycare',
            'University',
            'College/University',
            'Library',
            'College/Education',
        ),
        'Warehouse': (
            'Self-Storage Facility',
            'Non-Refrigerated Warehouse',
            'Distribution Center',
            'Refrigerated Warehouse',
            'Warehouse',
        ),
        'Facility': (
            'Data Center',
            'Manufacturing/Industrial Plant',
            'Convention Center',
        ),
        'Other': (
            'Fire Station',
            'Other - Utility',
            'Parking',
        ),
    }

    # Invert to the label -> family mapping expected by Series.replace.
    family_of_label = {
        label: family
        for family, labels in labels_by_family.items()
        for label in labels
    }

    for column in ('LargestPropertyUseType', 'PrimaryPropertyType'):
        dataframe[column] = dataframe[column].replace(family_of_label)

    return dataframe

In [None]:
def filter_dataset(dataframe):
    """Step 2 of the cleaning: keep only buildings that are not residential.

    A row is removed when its 'PrimaryPropertyType' or its
    'LargestPropertyUseType' equals 'Residence'. Rows where those columns are
    NaN are kept.

    The selection is positional (boolean mask). Dropping by index label instead
    would also remove every non-residential row that happens to share a label
    with a residential one, which occurs whenever the index contains duplicates.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the two property-type columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (copy) without the residential rows.
    """
    print('\n',' 2. Réduction du Dataset aux bâtiments non destinés à l’habitation\n')

    residential = 'Residence'
    is_residential = (
        dataframe['PrimaryPropertyType'].eq(residential)
        | dataframe['LargestPropertyUseType'].eq(residential)
    )

    # .to_numpy() makes the mask purely positional, independent of index labels.
    dataframe = dataframe[(~is_residential).to_numpy()].copy()

    ratio_nan(dataframe)
    return dataframe

In [None]:
def location(dataframe):
    """Step 3.A of the cleaning: complete 'Latitude' / 'Longitude' from 'Location'.

    'Location' is expected to hold text containing ``'latitude': '<number>'`` and
    ``'longitude': '<number>'``. For each coordinate, the value already present
    in the numeric column is kept; where it is missing, the number parsed from
    'Location' is used; where both are missing the result is 0.

    Differences with a plain "fill with 0 and add" approach:
    * a row that has both a coordinate and a 'Location' keeps its coordinate
      instead of getting the sum of the two;
    * the numbers are extracted by key, so the result does not depend on how
      many comma-separated pieces the text has.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with 'Location', 'Latitude' and 'Longitude' columns. The two
        coordinate columns are overwritten on this frame.

    Returns
    -------
    pandas.DataFrame
        The same frame.
    """
    print('\n',' 3.A. Traitement de la location \n')

    for column, key in (('Latitude', 'latitude'), ('Longitude', 'longitude')):
        # Capture the quoted number that follows the key, e.g. 'latitude': '47.61'.
        pattern = r"'" + key + r"':\s*'([^']*)'"
        parsed = pd.to_numeric(dataframe['Location'].str.extract(pattern, expand=False))

        # Work on plain arrays so the combination is positional and does not
        # depend on the index labels being unique.
        existing = dataframe[column].to_numpy(dtype=float)
        combined = np.where(np.isnan(existing), parsed.to_numpy(dtype=float), existing)

        # Rows without any coordinate fall back to 0.
        dataframe[column] = np.where(np.isnan(combined), 0.0, combined)

    ratio_nan(dataframe)
    return dataframe

In [None]:
def suppression_coloonnes(dataframe):
    """Step 3.B of the cleaning: drop the columns that are not used afterwards.

    Returns a new frame without the columns listed below; the frame passed in
    is left unchanged. A KeyError is raised if one of them is absent.
    """
    print('\n',' 3.B. Suppression de colonnes non pertinentes \n')

    identification = ('Location', 'YearsENERGYSTARCertified', 'OSEBuildingID', 'DataYear',
                      'PropertyName', 'TaxParcelIdentificationNumber')
    energy_details = ('SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)',
                      'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)')
    administrative = ('DefaultData', 'Comment', 'ComplianceStatus', 'Outlier',
                      '2010 Census Tracts',
                      'Seattle Police Department Micro Community Policing Plan Areas',
                      'City Council Districts', 'SPD Beats', 'Zip Codes', 'Address', 'City',
                      'State')
    energy_sources = ('SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)',
                      'NaturalGas(therms)', 'NaturalGas(kBtu)', 'OtherFuelUse(kBtu)',
                      'GHGEmissionsIntensity(kgCO2e/ft2)')
    # 'ListOfAllPropertyUseTypes' repeats information available in the other
    # use-type columns; the second / third largest use types are dropped too.
    secondary_uses = ('ListOfAllPropertyUseTypes', 'SecondLargestPropertyUseType',
                      'ThirdLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA',
                      'ThirdLargestPropertyUseTypeGFA')

    to_drop = [*identification, *energy_details, *administrative, *energy_sources,
               *secondary_uses]
    dataframe = dataframe.drop(columns=to_drop)

    ratio_nan(dataframe)
    return dataframe

In [None]:
def variables_nonlineaire(dataframe):
    """Step 3.C of the cleaning: add ratio features derived from the surface columns.

    Columns written on the frame passed in (which is also returned):
    * 'largest/total': largest use surface as a percentage of the total surface;
    * 'NumberofBuildings' / 'NumberofFloors': NaN and 0 are replaced by 1;
    * 'Gfa_building/nbbuildings' / 'Gfa_building/nbfloors': building surface
      divided by the corrected building / floor count.
    """
    print('\n',' 3.C Ajout de variables non linéaires \n')

    # Share of the total surface taken by the largest use, in percent.
    largest_use_area = dataframe['LargestPropertyUseTypeGFA']
    dataframe['largest/total'] = largest_use_area * 100 / dataframe['PropertyGFATotal']

    # Building surface per building, then per floor. Counts that are missing or
    # zero are set to 1 first so the division is always defined.
    per_unit_features = (
        ('NumberofBuildings', 'Gfa_building/nbbuildings'),
        ('NumberofFloors', 'Gfa_building/nbfloors'),
    )
    for count_column, ratio_column in per_unit_features:
        dataframe[count_column] = dataframe[count_column].fillna(1).replace(0, 1)
        dataframe[ratio_column] = dataframe['PropertyGFABuilding(s)'] / dataframe[count_column]

    ratio_nan(dataframe)
    return dataframe

In [None]:
def réorganisation_colonnes(dataframe):
    """Step 3.D of the cleaning: keep a fixed set of columns in a fixed order.

    Returns a new frame reindexed on the columns below: location and building
    description first, then surfaces and counts, then the target columns.
    Columns absent from the input are created filled with NaN; columns not
    listed are discarded.
    """
    print('\n',' 3.D. Réorganisation des colonnes \n')

    description_columns = ['Neighborhood', 'CouncilDistrictCode', 'Latitude', 'Longitude',
                           'YearBuilt', 'BuildingType', 'PrimaryPropertyType',
                           'LargestPropertyUseType']
    surface_columns = ['PropertyGFATotal', 'LargestPropertyUseTypeGFA', 'largest/total',
                       'PropertyGFABuilding(s)', 'NumberofBuildings',
                       'Gfa_building/nbbuildings', 'NumberofFloors', 'Gfa_building/nbfloors',
                       'PropertyGFAParking']
    target_columns = ['SiteEnergyUseWN(kBtu)', 'ENERGYSTARScore',
                      'GHGEmissions(MetricTonsCO2e)']

    ordered_columns = description_columns + surface_columns + target_columns
    dataframe = dataframe.reindex(columns=ordered_columns)

    ratio_nan(dataframe)
    return dataframe

In [None]:
def traitement_colonnes(dataframe):
    """Step 3 of the cleaning: run the four column-processing sub-steps in order.

    Each sub-step receives the frame returned by the previous one:
    coordinates, column removal, ratio features, column reordering.
    The frame returned by the last sub-step is returned.
    """
    print('\n',' 3. Traitement des colonnes')

    column_steps = (
        location,
        suppression_coloonnes,
        variables_nonlineaire,
        réorganisation_colonnes,
    )
    for step in column_steps:
        dataframe = step(dataframe)

    return dataframe

In [None]:
def nettoyage(dataframe):
    """Run the whole cleaning pipeline and return the cleaned frame.

    Prints the initial missing-value summary, then chains: duplicates and
    outliers, use-type grouping, removal of residential buildings, column
    processing.
    """
    print('\n',' Nettoyage de données\n')
    ratio_nan(dataframe)

    without_duplicates = doublon_outliers(dataframe)
    grouped_use_types = use_type(without_duplicates)
    non_residential = filter_dataset(grouped_use_types)
    return traitement_colonnes(non_residential)

In [None]:
def seattle_map(dataframe):
    """Plot every building on a map of Seattle, coloured by 'SiteEnergyUseWN(kBtu)'.

    The municipal boundaries shapefile is drawn in grey as background and one
    semi-transparent point per row is drawn on top, at ('Longitude', 'Latitude').

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with 'Longitude', 'Latitude' and 'SiteEnergyUseWN(kBtu)' columns.
        A 'geometry' column holding the points is added to it.

    Returns
    -------
    pandas.DataFrame
        The same frame, including the 'geometry' column.
    """
    # Background layer.
    street_map = gpd.read_file("/kaggle/input/seattle/Municipal_Boundaries.shp")

    # One point per building; shapely expects (x, y), i.e. (longitude, latitude).
    geometry = [Point(xy) for xy in zip(dataframe['Longitude'], dataframe['Latitude'])]

    # Longitude / latitude in degrees (WGS 84). The authority string replaces the
    # deprecated {'init': 'EPSG:4326'} dictionary form.
    # The GeoDataFrame is built on a copy so that the 'geometry' column of the
    # returned frame is added explicitly below rather than as a by-product of
    # the constructor.
    geo_df = gpd.GeoDataFrame(dataframe.copy(), crs="EPSG:4326", geometry=geometry)

    fig, ax = plt.subplots(figsize=(10,10))
    street_map.plot(ax=ax, alpha=0.4,color='grey')

    # Colour scale capped at vmax so the largest consumers do not flatten the
    # rest of the scale.
    # NOTE(review): 7.009638e+06 looks like a quantile of the target — confirm.
    geo_df.plot(column='SiteEnergyUseWN(kBtu)',ax=ax,alpha=0.5, legend=True,markersize=10,cmap='Spectral_r', vmax=7.009638e+06)

    plt.title("Energy Use in Seattle", fontsize=15,fontweight='bold')

    # Display window. ylim takes (bottom, top): passing the larger latitude first
    # would invert the axis and draw the map upside down.
    plt.xlim(-122.45,-122.20)
    plt.ylim(47.45,47.75)

    plt.show()

    # Approach adapted from:
    # https://medium.com/@ianforrest11/graphing-latitudes-and-longitudes-on-a-map-bf64d5fca391

    # A later cell drops 'geometry' from the returned frame, so the column must exist.
    dataframe['geometry'] = geo_df['geometry'].values

    return dataframe

In [None]:
# Run the full cleaning pipeline on the combined frame.
# Not idempotent: the pipeline needs raw columns (e.g. 'OSEBuildingID', 'Location')
# that it removes, so re-running this cell on the cleaned frame fails.
data = nettoyage(data)

In [None]:
# Draw the buildings on the Seattle map, coloured by weather-normalised energy use.
data = seattle_map(data)

In [None]:
def nan(dataframe):
    """Print the number of missing values of each column, largest count first.

    Nothing is returned.
    """
    missing_per_column = dataframe.isna().sum().sort_values(ascending=False)
    print(missing_per_column)

In [None]:
# Check the number of missing values per column after cleaning.
nan(data)

In [None]:
# Drop the rows where the largest use type or the energy target is missing.
# 'ENERGYSTARScore' keeps its missing values for now: it is not used right away.
required_columns = ['LargestPropertyUseType', 'SiteEnergyUseWN(kBtu)']
data = data.dropna(subset=required_columns)

nan(data)

In [None]:
# Columns available before removing the map-related ones.
data.columns

In [None]:
# Remove the coordinates, the building type and the point geometry.
# Not idempotent: running this cell a second time raises KeyError.
# NOTE(review): 'geometry' is presumably added by seattle_map's GeoDataFrame construction — verify.
data = data.drop(columns=['Latitude', 'Longitude','BuildingType', 'geometry'])

In [None]:
# Remaining columns.
data.columns

In [None]:
# Column groups used by the exploratory analysis below.

# Discrete numeric columns.
variables_discretes = ['YearBuilt', 'NumberofBuildings', 'NumberofFloors']

# Continuous numeric columns.
variables_continues = [
    'PropertyGFATotal',
    'LargestPropertyUseTypeGFA',
    'largest/total',
    'PropertyGFABuilding(s)',
    'Gfa_building/nbbuildings',
    'Gfa_building/nbfloors',
    'PropertyGFAParking',
    'SiteEnergyUseWN(kBtu)',
    'GHGEmissions(MetricTonsCO2e)',
    'ENERGYSTARScore',
]

# Categorical columns.
variables_qualitatives = [
    'Neighborhood',
    'CouncilDistrictCode',
    'PrimaryPropertyType',
    'LargestPropertyUseType',
]

# Every numeric column: discrete ones first, then continuous ones.
variables_quantitatives = [*variables_discretes, *variables_continues]

In [None]:
# 1. Univariate analysis

# 1.1 Summary statistics, histogram and box plot of every continuous variable

for variable in variables_continues:
    print("-"*20, variable, "-"*10, data[variable].describe(), "-"*20, sep="\n")

    # 1.1.1 Histogram
    sns.set(font_scale=1)
    fig_hist, ax_hist = plt.subplots(figsize=(9, 3.5))
    sns.histplot(data[variable], bins=100, ax=ax_hist)
    ax_hist.set_title('Distribution de : ' + variable)
    ax_hist.set_xlabel(variable)
    plt.show()

    # 1.1.2 Box plot (outliers shown)
    sns.set(font_scale=1)
    fig_box, ax_box = plt.subplots(figsize=(8, 2.5))
    sns.boxplot(x=data[variable], showfliers=True, ax=ax_box)
    ax_box.set_title('Distribution de : ' + variable)
    ax_box.set_xlabel(variable)
    plt.show()

In [None]:
# Log-transform the right-skewed variables: log(x + 1) is stored in a new
# 'log_<name>' column and its histogram is displayed.

log_variables = [
    'PropertyGFATotal',
    'LargestPropertyUseTypeGFA',
    'largest/total',
    'PropertyGFABuilding(s)',
    'Gfa_building/nbbuildings',
    'Gfa_building/nbfloors',
    'PropertyGFAParking',
    'SiteEnergyUseWN(kBtu)',
    'GHGEmissions(MetricTonsCO2e)',
]

for source_column in log_variables:
    log_var = 'log_' + source_column
    logged_values = np.log(data[source_column] + 1)

    logged_values.plot.hist()
    plt.title('Distribution de : ' + log_var)
    plt.xlabel(log_var)
    plt.show()

    data[log_var] = logged_values
    


In [None]:
# 1.2 Bar plots

# 1.2.1 Histogram (110 bins) of every discrete variable

for discrete_column in variables_discretes:
    hist_axes = data[discrete_column].plot.hist(bins=110)
    hist_axes.set_title(discrete_column)
    plt.show()

In [None]:
# 1.2.2 Bar plot of the ten most frequent values of every categorical variable

for category_column in variables_qualitatives:
    # Ascending sort, then the last ten positions: the ten largest counts.
    top_counts = data[category_column].value_counts().sort_values().iloc[-10:]
    top_counts.plot(kind='bar')
    plt.title(category_column)
    plt.show()

In [None]:
# 2. Bivariate analysis

# 2.1 Pearson linear correlation matrix of the numeric variables

plt.figure(figsize=(7,7))
sns.set(font_scale=1)
plt.title('Matrice de corrélation de pearson')
Coef_corr = data[variables_quantitatives].corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric.
# The built-in bool is used as dtype; the np.bool alias was removed from NumPy
# and raises AttributeError on current versions.
mask = np.zeros_like(Coef_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# https://seaborn.pydata.org/generated/seaborn.heatmap.html

sns.heatmap(Coef_corr, mask=mask, vmin=-1, vmax=1, linewidths=1, cmap='Spectral_r')
plt.show()

In [None]:
# 3. PCA

# Number of principal components to compute
n_comp = 4

# Columns taken into account by the PCA
features = variables_continues

# Missing values are replaced by the mean of their column
data_pca = data[features].pipe(lambda frame: frame.fillna(frame.mean()))
X = data_pca.to_numpy()

# Centre and scale every column
std_scale = preprocessing.StandardScaler()
X_scaled = std_scale.fit_transform(X)

# Fit the principal components
pca = decomposition.PCA(n_components=n_comp)
pca.fit(X_scaled)

In [None]:
def display_scree_plot(pca):
    """Show the scree plot of a fitted PCA.

    Bars give the percentage of variance explained by each component; the red
    line gives the cumulative percentage. Reads ``pca.explained_variance_ratio_``.
    """
    inertia_pct = pca.explained_variance_ratio_*100
    ranks = np.arange(1, len(inertia_pct) + 1)

    plt.bar(ranks, inertia_pct)
    plt.plot(ranks, np.cumsum(inertia_pct), c="red", marker='o')
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show(block=False)

# Scree plot of the fitted PCA
display_scree_plot(pca)

plt.show()

In [None]:
# cercle de corrélation

def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    """Draw one correlation circle per requested pair of principal components.

    Parameters
    ----------
    pcs : array, one row per component and one column per variable
        Component loadings (``pca.components_`` in this notebook).
    n_comp : int
        Number of computed components; pairs whose second rank is not below it
        are skipped.
    pca : fitted PCA
        Only ``explained_variance_ratio_`` is read, for the axis labels.
    axis_ranks : iterable of (int, int)
        Zero-based component pairs to draw, e.g. ``[(0, 1), (2, 3)]``.
    labels : sequence, optional
        Variable names written at the tip of each arrow that falls inside the
        plot limits.
    label_rotation : number
        Rotation applied to the labels.
    lims : (xmin, xmax, ymin, ymax), optional
        Explicit plot limits. By default [-1, 1] on both axes when there are
        fewer than 30 variables, otherwise the range of the loadings.
    """
    n_variables = pcs.shape[1]
    few_variables = n_variables < 30

    for d1, d2 in axis_ranks:
        if d2 >= n_comp:
            continue

        fig, ax = plt.subplots(figsize=(10,10))

        # Plot limits
        if lims is not None:
            xmin, xmax, ymin, ymax = lims
        elif few_variables:
            xmin, xmax, ymin, ymax = -1, 1, -1, 1
        else:
            xmin, xmax = min(pcs[d1,:]), max(pcs[d1,:])
            ymin, ymax = min(pcs[d2,:]), max(pcs[d2,:])

        # Arrows when there are few variables, plain thin segments otherwise
        if few_variables:
            origin = np.zeros(n_variables)
            ax.quiver(origin, np.zeros(n_variables),
                      pcs[d1,:], pcs[d2,:],
                      angles='xy', scale_units='xy', scale=1, color="grey", alpha=0.4)
            # (see https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
        else:
            segments = [[[0,0],[tip_x,tip_y]] for tip_x,tip_y in pcs[[d1,d2]].T]
            ax.add_collection(LineCollection(segments, axes=ax, alpha=0.1, color='black'))

        # Variable names, only for tips inside the plot limits
        if labels is not None:
            for position, (tip_x, tip_y) in enumerate(pcs[[d1,d2]].T):
                if xmin <= tip_x <= xmax and ymin <= tip_y <= ymax:
                    ax.text(tip_x, tip_y, labels[position], fontsize='11', ha='center', va='center',
                            rotation=label_rotation, color="blue", alpha=0.5)

        # Unit circle
        ax.add_artist(plt.Circle((0,0), 1, facecolor='none', edgecolor='b'))

        # Apply the limits before drawing the reference lines
        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)

        # Horizontal and vertical reference lines
        ax.plot([-1, 1], [0, 0], color='grey', ls='--')
        ax.plot([0, 0], [-1, 1], color='grey', ls='--')

        # Axis names with the percentage of explained variance
        ax.set_xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
        ax.set_ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

        ax.set_title("Cercle des corrélations (F{} et F{})".format(d1+1, d2+1))
        plt.show(block=False)

In [None]:
# Correlation circles for the component pairs (F1, F2) and (F3, F4)
pcs = pca.components_
display_circles(pcs, n_comp, pca, [(0,1),(2,3)], labels = np.array(features))

plt.show()

In [None]:
# plan factoriel

def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    """Scatter the projected observations on each requested pair of components.

    Parameters
    ----------
    X_projected : array, one row per observation and one column per component
        Coordinates of the observations on the principal components.
    n_comp : int
        Number of computed components; pairs whose second rank is not below it
        are skipped.
    pca : fitted PCA
        Only ``explained_variance_ratio_`` is read, for the axis labels.
    axis_ranks : iterable of (int, int)
        Zero-based component pairs to draw.
    labels : sequence, optional
        Text written at the position of each observation.
    alpha : number
        Opacity of the points.
    illustrative_var : array-like, optional
        One value per observation; points are drawn and labelled in the legend
        per distinct value.
    """
    for d1, d2 in axis_ranks:
        if d2 >= n_comp:
            continue

        fig, ax = plt.subplots(figsize=(13,13))

        # Points, in a single series or one series per distinct illustrative value
        if illustrative_var is None:
            ax.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
        else:
            illustrative_var = np.array(illustrative_var)
            for value in np.unique(illustrative_var):
                rows = np.where(illustrative_var == value)[0]
                ax.scatter(X_projected[rows, d1], X_projected[rows, d2], alpha=alpha, label=value)
            ax.legend()

        # Optional label on every point
        if labels is not None:
            for position, (point_x, point_y) in enumerate(X_projected[:,[d1,d2]]):
                ax.text(point_x, point_y, labels[position],
                        fontsize='14', ha='center',va='center')

        # Symmetric window covering a quarter of the largest absolute coordinate
        boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 0.25
        ax.set_xlim([-boundary,boundary])
        ax.set_ylim([-boundary,boundary])

        # Horizontal and vertical reference lines
        ax.plot([-100, 100], [0, 0], color='grey', ls='--')
        ax.plot([0, 0], [-100, 100], color='grey', ls='--')

        # Axis names with the percentage of explained variance
        ax.set_xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
        ax.set_ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

        ax.set_title("Projection des individus (sur F{} et F{})".format(d1+1, d2+1))
        plt.show(block=False)

In [None]:
# Projection of the observations on (F1, F2) and (F3, F4), grouped by primary property type
X_projected = pca.transform(X_scaled)
display_factorial_planes(X_projected, n_comp, pca, [(0,1),(2,3)],illustrative_var = data['PrimaryPropertyType'], alpha= 1)

plt.show()

In [None]:
# Remove the untransformed surface / count / target columns, two of the log
# features, the primary property type and the council district code.
# Not idempotent: running this cell a second time raises KeyError.
data = data.drop(columns=['NumberofFloors','NumberofBuildings',
       'PrimaryPropertyType',  'PropertyGFATotal',
       'LargestPropertyUseTypeGFA', 'largest/total', 'PropertyGFABuilding(s)',
       'Gfa_building/nbbuildings', 'Gfa_building/nbfloors',
        'PropertyGFAParking','SiteEnergyUseWN(kBtu)',
       'log_Gfa_building/nbfloors','CouncilDistrictCode','GHGEmissions(MetricTonsCO2e)',
        'log_Gfa_building/nbbuildings'
       ])

In [None]:
# Columns kept for the export.
data.columns

In [None]:
# Histogram of the log-transformed greenhouse gas emissions.
data['log_GHGEmissions(MetricTonsCO2e)'].plot.hist()

In [None]:
# Export the cleaned dataset (the index is not written).
data.to_csv('/kaggle/working/data_p3.csv',index=False)

In [None]:
# Display the exported frame.
data