In [37]:
import seaborn as sns
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None # supprime certains warnings
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import numpy as np

In [38]:
# Folder holding the challenge CSV files. Change this single constant to run the
# notebook on another machine.
DATA_DIR = "C:/Users/maila/Documents/Centrale Nantes/EI2/INFOIA/STASC/Data_Challenge/STASC"

X_train = pd.read_csv(f"{DATA_DIR}/X_train_J01Z4CN.csv")
y_train = pd.read_csv(f"{DATA_DIR}/y_train_OXxrJt1.csv")
X_pred = pd.read_csv(f"{DATA_DIR}/X_test_BEhvxAN.csv")

# Keep the listing ids of the prediction set for the final output file.
# An explicit copy is taken because a "price" column is added to this frame later.
annonces_pred = X_pred[["id_annonce"]].copy()

# Keep only the price column; the listing-id column of the target file is dropped.
y_train = y_train[["price"]]

## Feature Engineering

In [39]:
def features_incomplete(X):
    """Return the names of the columns of X that contain at least one missing value.

    Args:
        X: pandas DataFrame to inspect.

    Returns:
        A list of column labels, in the same order as ``X.columns``.
    """
    return [col for col in X.columns if X[col].isnull().any()]

def _minmax_scale_column(values):
    """Rescale one column to the [0, 1] range and return it as a 1-D array."""
    column = np.asarray(values).reshape(-1, 1)
    return MinMaxScaler().fit_transform(column).ravel()


def FE(X, n_clusters=15, random_state=0):
    """Build the model feature matrix from a raw listings DataFrame.

    Steps, in order:
      1. drop ``exposition``, ``ghg_category`` and ``energy_performance_category``;
      2. fill missing ``floor`` / ``land_size`` with 0;
      3. one-hot encode ``property_type`` (first level dropped) and
         label-encode ``city``;
      4. fill every remaining incomplete column with its most frequent value;
      5. add room-count combinations and a ``departement`` column derived from
         ``postal_code``;
      6. drop ``id_annonce`` and min-max scale every numeric column;
      7. add a ``Cluster`` column from a KMeans clustering of the scaled
         longitude, latitude and ``Diff_bedrooms_bathrooms``, itself min-max scaled.

    NOTE(review): the label encoder, the scalers and the clustering are all fitted
    on the frame passed in. Calling this function separately on two datasets
    therefore yields encodings / scales / cluster ids that are not guaranteed to
    be consistent between them.

    Args:
        X: raw listings DataFrame. It is not modified; a new frame is returned.
        n_clusters: number of KMeans clusters (default 15, the original value).
        random_state: seed passed to KMeans so that the cluster feature is
            reproducible from one run to the next. Pass ``None`` for the
            unseeded behaviour.

    Returns:
        A new DataFrame of engineered features.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    X = X.drop(columns=["exposition", "ghg_category", "energy_performance_category"])

    # Assign the filled column back instead of `X[col].fillna(..., inplace=True)`:
    # the in-place form acts on a temporary and does not update X under pandas
    # Copy-on-Write.
    X["floor"] = X["floor"].fillna(0)
    X["land_size"] = X["land_size"].fillna(0)

    # One-hot encoding for property_type
    X = pd.get_dummies(X, columns=["property_type"], drop_first=True)

    # Label encoding for city
    X["city"] = LabelEncoder().fit_transform(X["city"])

    # Fill the remaining missing values with the most frequent value of the column
    for col in features_incomplete(X):
        X[col] = X[col].fillna(X[col].value_counts().index[0])

    X["Somme_bedrooms_rooms"] = X["nb_rooms"] + X["nb_bedrooms"]
    X["Somme_bathrooms_rooms"] = X["nb_rooms"] + X["nb_bathrooms"]
    X["Diff_bedrooms_bathrooms"] = X["nb_bathrooms"] - X["nb_bedrooms"]

    # Department instead of postal code: the first digit of a 4-digit code and the
    # first two digits of a 5-digit code, i.e. the integer quotient by 1000.
    # This replaces two chained boolean-mask assignments, which do not write
    # through to X under pandas Copy-on-Write.
    X["departement"] = X["postal_code"] // 1000

    X = X.drop(columns=["id_annonce"])

    # Scale every numeric (including boolean dummy) column independently to [0, 1]
    for col in X.columns:
        if pd.api.types.is_numeric_dtype(X[col]):
            X[col] = _minmax_scale_column(X[col])

    # Cluster the listings on their scaled coordinates and on the
    # bathrooms-minus-bedrooms difference.
    cluster_inputs = X[["approximate_longitude", "approximate_latitude", "Diff_bedrooms_bathrooms"]]
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)

    # Every other column is already scaled, so only the new cluster id needs it.
    X["Cluster"] = _minmax_scale_column(kmeans.fit_predict(cluster_inputs))

    return X

## Création d'un modèle XGBoost

#### Modèle choisi

In [40]:
# The model is trained on log1p(price); the index is reset to a fresh 0..n-1 range.
y_log_train = np.log1p(y_train).reset_index(drop=True)
X_train = FE(X_train)

# Hold out 20% of the rows to evaluate the model.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    y_log_train,
    test_size=0.2,
    random_state=1,
)

xgb_best = xgb.XGBRegressor(
    n_estimators=700,
    learning_rate=0.15,
    n_jobs=-1,
)
xgb_best.fit(x_train, y_train)

y_pred = xgb_best.predict(x_test)

# Both targets and predictions are mapped back with expm1 before computing the
# error, so the MAPE is expressed on the original price scale.
mape = mean_absolute_percentage_error(np.expm1(y_test), np.expm1(y_pred))
print("MAPE : ", mape)

#### Récupération de la prédiction finale

In [41]:
# NOTE(review): FE fits its label encoder, scalers and clustering on the frame it
# receives, so the prediction set is transformed independently of the training set.
X_pred = FE(X_pred)

# Align the prediction features with the training features. Dummy columns that
# exist only in the training set (e.g. property_type_atelier, property_type_hôtel)
# are added filled with 0, and the column order is made identical to X_train.
# This replaces inserting the columns at hard-coded positions from frames sliced
# out of X_train, which depended on index alignment between the two datasets.
X_pred = X_pred.reindex(columns=X_train.columns, fill_value=0)

prediction_y2 = xgb_best.predict(X_pred)

# The model was trained on log1p(price): map the predictions back with expm1,
# then index the result by listing id.
annonces_pred = annonces_pred.assign(price=np.expm1(prediction_y2)).set_index("id_annonce")

annonces_pred.to_csv("C:/Users/maila/Documents/Centrale Nantes/EI2/INFOIA/STASC/Data_Challenge/STASC/Test_ML/y_predictions.csv")