# DATA CHALLENGE 2022 : House Price Forecasting

In [None]:
#!python -m pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import scipy as sp
#from metrics.custom_metric_ilb import custom_metric_function
from pycaret.regression import compare_models, setup
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Limit how much of a dataframe pandas prints: at most 28 rows and 28 columns.
pd.set_option("display.max_rows", 28)
pd.set_option("display.max_columns", 28)

## 1. DATA PREPROCESSING :

In [None]:
# Folder (with trailing slash) that holds the challenge CSV files.
filepath = 'data/'

# Training features and training targets.
X_train_raw = pd.read_csv(f"{filepath}X_train_J01Z4CN.csv")
Y_train_raw = pd.read_csv(f"{filepath}y_train_OXxrJt1.csv")

# Test features and the 'y_random' file.
# NOTE(review): 'y_random' is presumably a sample submission rather than real
# test targets — confirm.
X_test_raw = pd.read_csv(f"{filepath}X_test_BEhvxAN.csv")
Y_test_raw = pd.read_csv(f"{filepath}y_random_MhJDhKK.csv")


In [None]:
# Remove the listing identifier from both feature tables; the frames built here
# are the ones every later preprocessing step starts from.
X_train_0=X_train_raw.drop(columns="id_annonce")
X_test_0=X_test_raw.drop(columns="id_annonce")
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not display the preview.
X_train_0.head()
# Keep the test identifiers (and persist them) so they can be re-attached to
# the predictions when the submission files are built.
X_test_ids=X_test_raw["id_annonce"]
X_test_ids.to_pickle("data/X_test_ids.pkl")


In [None]:
# Largest row label of the training features; the same value is used later to
# split the stacked train/test frame back into its two parts.
X_train_0.index.max()

In [None]:
# Number of rows in the test features.
len(X_test_0.index)

In [None]:
# Training target without the listing identifier (still a DataFrame, not a Series).
Y_train_0=Y_train_raw.drop(columns="id_annonce")
Y_train_0.head()

In [None]:
# Summary statistics of the raw training features (identifier column included).
X_train_raw.describe()

In [None]:
# L=["property_type","city","energy_performance_category","ghg_category","exposition"]
# for x in L:
#     X_train[x]=X_train[x].astype("category")
#     X_test[x]=X_test[x].astype("category")


In [None]:
# Percentage of missing values per training column.
X_train_0.isna().sum()/len(X_train_0.index)*100


In [None]:
# Heatmap of the missing-value mask (rows x columns) of the training features.
sb.heatmap(X_train_0.isnull(),cmap='viridis')

## 1. Cleaning the data
### a. DataTypes :
First let's make sure that all the data has the proper type (especially that categorical data isn't set as numerical)

In [None]:
# Columns pandas parsed with a numeric dtype.
X_train_0.select_dtypes(np.number)


In [None]:
# Names of the numeric columns.
X_train_0.select_dtypes(np.number).columns

In [None]:
# Columns pandas parsed with the generic object dtype (the non-numeric ones).
X_train_0.select_dtypes(object)

In [None]:
# Names of the object-dtype columns.
X_train_0.select_dtypes(object).columns

### b. Dealing with NaN Values :

Missing values :

```
size                             512 # Numerical
floor                          27625 # Numerical
land_size                      21787 # Numerical
energy_performance_value       18300 # Numerical
energy_performance_category    18300 # Categorical
ghg_value                      18838 # Numerical
ghg_category                   18838 # Categorical
exposition                     28274 # Categorical
nb_rooms                        1566 # Numerical
nb_bedrooms                     2733 # Numerical
nb_bathrooms                   13273 # Numerical
```

- For houses, we'll set the `floor` value to 0
- For apartments, we'll set the `land_size` value to 0

Based on the missing-values heatmap, we'll drop:
- `energy_performance_category` (derived from `energy_performance_value`)
- `ghg_category` (derived from `ghg_value`)
- `exposition` (75.66% missing data)


In [None]:
# Stack train and test rows (train first) under a fresh 0..n-1 index so that
# imputation, encoding and scaling are applied to both sets at once.
# NOTE(review): preprocessing fitted on train+test together lets test rows
# influence the training features — confirm this is acceptable here.
data = pd.concat([X_train_0, X_test_0], axis=0).reset_index(drop=True)
# Drop the sparse "exposition" column and the two derived category columns;
# "city" is dropped as well.
data_1=data.drop(columns=["exposition","city", "energy_performance_category", "ghg_category"])

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# KNN imputation / TODO: try and experiment with different imputation strategies
def knn_impute(df0, column):
    """Fill the missing values of one numeric column with KNN regression.

    A ``KNeighborsRegressor`` with default settings is fitted on the rows
    where ``column`` is known, using as features every numeric column that
    has no missing value at all. It then predicts ``column`` for the rows
    where it is missing.

    Args:
        df0: Input dataframe. It is never modified.
        column: Name of the numeric column to impute.

    Returns:
        A copy of ``df0`` in which the missing entries of ``column`` are
        replaced by the KNN predictions. When ``column`` has no missing
        value the copy is returned as is.
    """
    # Work on a copy so the caller's dataframe is left untouched.
    df = df0.copy()

    # Nothing to impute: return early, otherwise the regressor would be asked
    # to predict on an empty frame and raise.
    missing = df[column].isna()
    if not missing.any():
        return df

    # numeric_df: subset of df made only of numerical columns.
    numeric_df = df.select_dtypes(np.number)

    # full_columns: numerical columns without any missing value (the features).
    full_columns = numeric_df.loc[:, numeric_df.notna().all()].columns

    # Rows where the target is known are the training set ...
    knn_x_train = numeric_df.loc[~missing, full_columns]
    knn_y_train = numeric_df.loc[~missing, column]

    # ... and rows where it is missing are the ones to predict.
    knn_x_test = numeric_df.loc[missing, full_columns]

    knn = KNeighborsRegressor()
    knn.fit(knn_x_train, knn_y_train)

    df.loc[missing, column] = knn.predict(knn_x_test)

    return df

In [None]:
def knn_impute_all(df, list_columns):
    """Impute several columns one after another with ``knn_impute``.

    Args:
        df: Dataframe to impute.
        list_columns: Column names, processed in the given order; each call
            receives the dataframe produced by the previous one.

    Returns:
        The dataframe obtained after chaining ``knn_impute`` over every name
        in ``list_columns``; ``df`` itself when the list is empty.
    """
    imputed = df
    for target in list_columns:
        imputed = knn_impute(imputed, target)
    return imputed

In [None]:
# Numeric columns to impute, in this order. Each call recomputes the set of
# complete numeric columns, so a column imputed earlier becomes a feature for
# the columns imputed after it. "floor" is handled separately further down.
list_columns = ["size", "land_size","energy_performance_value","ghg_value", "nb_rooms","nb_bathrooms", "nb_bedrooms"]
data_2 = knn_impute_all(data_1, list_columns=list_columns)
# Remaining missing values per column.
data_2.isna().sum()

In [None]:
# Every row whose property type is not "appartement" gets floor 0; this also
# overwrites any floor value already present on those rows.
# NOTE(review): the notes above also announce land_size = 0 for apartments, but
# land_size was KNN-imputed in the previous step instead — confirm which one
# is intended.
data_2.loc[(data_2['property_type']!="appartement"), 'floor'] = 0


In [None]:
# Missing values per column after the floor fill for non-apartment rows.
data_2.isna().sum()

In [None]:
# Impute the floors that are still missing; after the previous step these can
# only belong to "appartement" rows.
data_3=knn_impute(data_2, "floor")

In [None]:
# Final check of the missing values left per column.
data_3.isna().sum()

## 2. Feature Engineering 


## 3. Feature Transformations 


In [None]:
from scipy.stats import skew

In [None]:
# Print the sample skewness of every numeric column of the imputed data.
for column in data_3.select_dtypes(np.number).columns :
    print(f"{column} : {skew(data_3[column])}")

## 4. Encoding
Issue 1: One-hot encoding the `city` column causes an explosion in dimensionality (for now I dropped it).
Possible solution: frequency encoding / target encoding.

In [None]:
# One-hot encode with pandas' defaults; numeric columns pass through unchanged.
# NOTE(review): presumably "property_type" is the only non-numeric column left
# at this point — confirm.
data_4 = pd.get_dummies(data_3)

In [None]:
# Preview of the encoded features.
data_4.head()

## 5. Scaling


In [None]:
# Standardise every column (dummy columns included) and rebuild a dataframe
# that keeps the original row labels and column names.
scaler = StandardScaler()
data_5 = pd.DataFrame(
    scaler.fit_transform(data_4),
    index=data_4.index,
    columns=data_4.columns,
)

In [None]:
# Preview of the scaled features.
data_5.head()

## 6. Target Transformation


In [None]:
# Histogram of the raw training target.
Y_train_0.hist()

In [None]:
# Side-by-side comparison of the target distribution before and after a log
# transform, each with a KDE and a fitted normal curve overlaid.
# NOTE(review): seaborn's distplot is deprecated in recent releases — consider
# migrating when the seaborn version is bumped.
plt.figure(figsize=(20, 10))

# Left panel: raw target.
plt.subplot(1, 2, 1)
sb.distplot(Y_train_0, kde=True, fit=sp.stats.norm)
plt.title("Without Log Transform")

# Right panel: natural log of the target.
plt.subplot(1, 2, 2)
sb.distplot(np.log(Y_train_0), kde=True, fit=sp.stats.norm)
plt.xlabel("Log SalePrice")
plt.title("With Log Transform")

plt.show()


In [None]:
# Models are trained on the natural log of the target; predictions are mapped
# back with np.exp before being written to the submission files.
Y_train_1=np.log(Y_train_0)

## 7. Model Selection


In [None]:
# Recover the training rows from the stacked frame. .loc slicing includes the
# end label, so this keeps labels 0..X_train_0.index.max().
# Assumes X_train_0 still carries the default 0..n-1 index — TODO confirm.
X_train_1=data_5.loc[:X_train_0.index.max(),:]
X_train_1.head()

In [None]:
# Sanity check: should match X_train_0.index.max().
X_train_1.index.max()

In [None]:
# The remaining rows of the stacked frame are the test rows; they keep the
# stacked labels (starting right after the last training label).
X_test_1=data_5.loc[X_train_0.index.max()+1:,:]
# NOTE(review): not the last expression of the cell, so a notebook will not
# display this preview.
X_test_1.head()
# Persist the preprocessed test features.
X_test_1.to_pickle("data/X_test_1.pkl")

In [None]:
# Sanity check: should match the number of rows of X_test_0.
len(X_test_1.index)

In [None]:
# Training table with the log target appended as extra column(s); concat with
# axis=1 aligns the two frames on their row labels.
data_6=pd.concat([X_train_1, Y_train_1], axis=1)
# Persist the table for reuse outside this notebook.
data_6.to_pickle("data/data.pkl") 

In [None]:
# data6.head()

In [None]:
#from metrics.custom_metric_ilb import custom_metric_function
from pycaret.regression import compare_models, setup
import pycaret


In [None]:
#setup(feature_interaction=True,polynomial_features=True,use_gpu = True, silent = True, data = data_6, target="price")

In [None]:
#compare_models()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor


In [None]:
# Baseline model: an XGBoost regressor with every hyperparameter spelled out,
# trained on the scaled features against the log target.
# NOTE(review): the values look like a dump of an estimator's printed
# parameters; some keywords (e.g. gpu_id, predictor) may not be accepted by
# every xgboost version — confirm against the pinned version.
baseline_model=XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=0,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1,
             monotone_constraints='()', n_estimators=100, n_jobs=-1,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto')
baseline_model.fit(X_train_1, Y_train_1)

In [None]:
# List the scorer names accepted by the ``scoring`` argument of
# cross_val_score. The helper is imported here because the name ``sklearn`` is
# not bound when this cell runs, and the ``SCORERS`` dictionary is no longer
# available in recent scikit-learn releases.
from sklearn.metrics import get_scorer_names

print(sorted(get_scorer_names()))

In [None]:
# Baseline model evaluation: 10-fold cross-validation (no shuffle argument is
# passed to KFold), scored with the negated mean absolute error on the log
# target.
kf = KFold(n_splits=10)
import sklearn
baseline_result = cross_val_score(baseline_model, X_train_1, Y_train_1, scoring="neg_mean_absolute_error", cv=kf)

In [None]:

# The scorer returns negated MAEs; flip the sign to report the mean MAE over
# the folds (still measured on the log target).
mean_baseline_result = -np.mean(baseline_result)
print(mean_baseline_result)

## 8. Hyperparameter Optimization


## 9. Ensembling

In [None]:
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Candidate ensemble members, all with library-default hyperparameters.
# The commented-out entries are currently disabled.
models = {
    "xgboost":XGBRegressor(),
    # "lgbm":LGBMRegressor(),
    "rfr":RandomForestRegressor(),
    "etr":ExtraTreesRegressor(),
    # "gbr":GradientBoostingRegressor()
}

# Fit every member on the full training set (log target); these fitted
# instances are the ones used later to predict on the test set.
for model_name, model in models.items():
    model.fit(X_train_1, Y_train_1)
    print(f"Finished Training {model_name}.")


In [None]:
# Cross-validate every ensemble member with the same folds and metric as the
# baseline; maps model name -> array of negated MAE scores (one per fold).
results = {
    model_name: cross_val_score(
        model, X_train_1, Y_train_1, scoring="neg_mean_absolute_error", cv=kf
    )
    for model_name, model in models.items()
}

In [None]:
# Report the cross-validated score of every model. The scores are negated
# MAEs, so the mean is sign-flipped back to a positive error; the spread is a
# standard deviation, which is non-negative by definition and must not be
# negated (nor labelled as a variance).
for model_name, result in results.items():
    print(f"{model_name}. Mean : {-np.mean(result)}  & Std : {np.std(result)} ")


In [None]:
# Equal-weight (1/3 each) blend of the three fitted models. Each model
# predicts the log target, so its output is exponentiated before averaging.
# The "lgbm" and "gbr" entries are disabled in `models` and take no part in
# the blend.
final_predictions_raw = sum(
    1/3*np.exp(models[model_key].predict(X_test_1))
    for model_key in ("xgboost", "rfr", "etr")
)


## 10. Feature selection


## 11. Submission

In [None]:
# Preview of the file loaded as Y_test_raw, to check the expected layout.
Y_test_raw.head()

In [None]:
# Baseline-only predictions, mapped back from the log scale with np.exp.
predictions = pd.Series(np.exp(baseline_model.predict(X_test_1)), name="price")
# Blended predictions (already exponentiated when the blend was computed).
final_predictions=pd.Series(final_predictions_raw,name="price")

In [None]:
# Preview of the baseline predictions.
predictions.head()

In [None]:
# Attach the test identifiers to the baseline predictions; concat with axis=1
# aligns on row labels.
# Assumes both series carry the same default 0..m-1 index — TODO confirm.
submission=pd.concat([X_test_ids, predictions], axis=1)

In [None]:
# Write the baseline submission (header row kept, index column omitted).
submission.to_csv("data/submission.csv", index=False, header=True)

In [None]:
# Attach the test identifiers to the blended predictions; concat with axis=1
# aligns on row labels.
# Assumes both series carry the same default 0..m-1 index — TODO confirm.
final_submission = pd.concat([X_test_ids, final_predictions], axis=1)


In [None]:
# Write the blended submission (header row kept, index column omitted).
final_submission.to_csv("data/final_submission.csv", index=False, header=True)