In [37]:
# Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Linear regression model and cross-validation helpers
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Pipeline composition (per-column preprocessing fitted inside each CV fold)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Cleaning / imputation
from sklearn import svm
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator

# Scaling algorithms
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [38]:
# Load the raw housing CSV into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="data/06e9c61d-e45f-4d74-beb5-e5e53ac6d2de.csv")

In [39]:
# Feature groups handed to PipeLine (column names are the renamed, French ones).

# Columns to scale with RobustScaler.
robust = ["longitude", "latitude", "Age moyen logement"]

# Columns to scale with StandardScaler.
# "Salaire median logement" used to be listed twice, which duplicated the feature
# column once the scaled frame was assembled; each column is now listed once.
standard = ["Total pieces", "Nb personne logement", "Salaire median logement"]

# Target column(s) to log-transform.
liste_log = ["Valeur moyenne logement"]

# Feature columns to drop before fitting. Empty means "drop nothing";
# the previous [""] named a non-existent column and raised KeyError when passed to PipeLine.
delete_column = []

# Columns on which the 3-sigma outlier filter is applied (empty: no filtering).
remove_outliers = []

In [40]:
def _as_column_list(columns):
    """Normalize a column selection to a de-duplicated list of non-empty names.

    Accepts None / 0 / "" (treated as "no columns"), a single column name,
    or any iterable of column names. Order of first appearance is preserved.
    """
    if not columns:
        return []
    if isinstance(columns, str):
        columns = [columns]
    return [name for name in dict.fromkeys(columns) if name != ""]


def PipeLine(df, delete_column="", outliers=False, liste_robust=0, liste_standard=0, liste_log=0, scaling=False, outlier_columns=None):
    """Prepare the housing data and return the mean 5-fold CV score of a linear regression.

    Steps: rename columns, drop the ID column, one-hot encode the ocean-proximity
    category, impute "Total chambres", optionally remove 3-sigma outliers,
    optionally drop features, optionally scale features, optionally
    log-transform the target, then cross-validate ``LinearRegression``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset using the original English column names listed in the
        rename mapping below.
    delete_column : str or list of str, default ""
        Feature column(s) to drop before fitting. Empty strings are ignored.
    outliers : bool, default False
        When true, rows lying more than 3 standard deviations from the mean
        of any column in ``outlier_columns`` are removed.
    liste_robust : list of str, default 0
        Columns scaled with ``RobustScaler`` when ``scaling`` is true.
        A falsy value means "none".
    liste_standard : list of str, default 0
        Columns scaled with ``StandardScaler`` when ``scaling`` is true.
        A falsy value means "none". A column also present in ``liste_robust``
        is only scaled by the robust scaler.
    liste_log : list of str, default 0
        Target column(s) to log-transform. A falsy value means the target is
        left untransformed.
    scaling : bool, default False
        Enable the scalers. Columns not listed in either scaling list are
        passed through unchanged.
    outlier_columns : list of str, optional
        Columns used by the outlier filter. When omitted, the notebook-level
        ``remove_outliers`` list is used if it exists, otherwise no column.

    Returns
    -------
    float
        Mean of the five cross-validation scores (the estimator's default
        scorer is used).

    Raises
    ------
    KeyError
        If a scaling or log column is not present in the prepared data.
    """
    target = "Valeur moyenne logement"

    # Rename the columns.
    data = df.rename(columns={'Unnamed: 0': 'ID logement',
                              'housing_median_age': 'Age moyen logement',
                              'total_rooms': 'Total pieces',
                              'total_bedrooms': 'Total chambres',
                              'population': 'Nb personne logement',
                              'households': 'Nb famille logement',
                              'median_income': 'Salaire median logement',
                              'median_house_value': 'Valeur moyenne logement',
                              'ocean_proximity': 'Proximation Ocean'})

    # Drop the ID column; tolerate inputs that were read without it.
    data = data.drop(columns=['ID logement'], errors="ignore")

    # One-hot encode the categorical variable (an empty prefix yields "_<category>" names).
    data = pd.get_dummies(data, columns=['Proximation Ocean'], prefix=[""])

    # Impute (not delete) the missing values of "Total chambres".
    # NOTE(review): the imputer only sees this single column, so it has no other
    # feature to measure neighbour distance on — presumably this degenerates to
    # a column-mean fill; confirm against the KNNImputer documentation.
    imputer = KNNImputer(n_neighbors=1)
    data["Total chambres"] = imputer.fit_transform(data[["Total chambres"]]).ravel()

    # Remove outliers: keep rows that are NOT further than 3 std from the mean.
    if outliers:
        if outlier_columns is None:
            # Backward compatibility: earlier versions read this notebook global directly.
            outlier_columns = globals().get("remove_outliers", [])
        for column in _as_column_list(outlier_columns):
            mean = data[column].mean()
            std = data[column].std()
            is_outlier = (data[column] - mean).abs() > 3 * std
            data = data.loc[~is_outlier]

    # Separate the target from the features. The one-hot columns are already
    # part of `data`, so they must not be concatenated a second time.
    Y = data[target]
    X = data.drop(columns=[target])

    # Drop the selected features.
    to_delete = _as_column_list(delete_column)
    if to_delete:
        X = X.drop(columns=to_delete)

    # Scale inside a pipeline so each CV fold fits its scalers on its own
    # training part only (no leakage from the validation part).
    model = LinearRegression()
    if scaling:
        robust_cols = _as_column_list(liste_robust)
        standard_cols = [c for c in _as_column_list(liste_standard) if c not in robust_cols]

        missing = [c for c in robust_cols + standard_cols if c not in X.columns]
        if missing:
            raise KeyError(f"Scaling columns not found in the features: {missing}")

        transformers = []
        if robust_cols:
            transformers.append(("robust", RobustScaler(), robust_cols))
        if standard_cols:
            transformers.append(("standard", StandardScaler(), standard_cols))
        if transformers:
            # remainder="passthrough" keeps the unlisted features (e.g. the one-hot columns).
            preprocessor = ColumnTransformer(transformers, remainder="passthrough")
            model = make_pipeline(preprocessor, LinearRegression())

    # Log-transform the target when requested.
    log_cols = _as_column_list(liste_log)
    unknown = [c for c in log_cols if c != target]
    if unknown:
        raise KeyError(f"Log columns not found in the target: {unknown}")
    if log_cols:
        Y = np.log(Y)

    score = cross_val_score(model, X, Y, cv=5).mean()
    return score

In [41]:
# Baseline run: no scaling, log-transformed target.
# outliers=True has no effect while `remove_outliers` is an empty list.
PipeLine(
    data,
    liste_robust=robust,
    liste_standard=standard,
    liste_log=liste_log,
    delete_column="",
    outliers=True,
    scaling=False,
)

0.6658728530973806