# Projet data science - analyse initiale

Ce projet est construit selon l'approche CRISP-ML.

## Notre use case

Nous sommes Banklytics, pour valider les hypothèques que nous proposons à nos clients, nous souhaitons estimer le prix de vente des maisons afin de nous assurer de la sureté de nos prêts. Nous devons donc développer un modèle permettant de prédire la valeur du bien immobilier se basant sur les critères fournis afin de vérifier si le prix de vente se rapproche de la valeur du bien.

## Data & business understanding

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from statsmodels.graphics.gofplots import qqplot

In [37]:
# Data loading
train_data = pd.read_csv('train.csv', index_col='Id')
validation_data = pd.read_csv('test.csv', index_col='Id')




### Variables choices

## Baseline model engineering

## Model enginering

### Base pipeline


In [38]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Define the column types to apply the transformer
train = train_data.drop(columns=['SalePrice'])
numerical_cols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()
train[numerical_cols] = train[numerical_cols].fillna(0)
train[categorical_cols] = train[categorical_cols].fillna('None')

#Define the transformer for each type of column
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean"))]
)

categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder())
])

#Create the ColumnTransformer to apply the transformers to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'  # Drop any columns not specified in transformers
)

preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline


In [39]:
# Converting back to Pandas DataFrame
onehot_encoder_feature_names = list(preprocessing_pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_cols))
column_order = numerical_cols + onehot_encoder_feature_names

# Ensure the shape matches
pd.DataFrame(X_train.toarray(), columns=column_order, index=train.index)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60.0,62.0,7917.0,6.0,5.0,1999.0,2000.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1457,20.0,85.0,13175.0,6.0,6.0,1978.0,1988.0,119.0,790.0,163.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1458,70.0,66.0,9042.0,7.0,9.0,1941.0,2006.0,0.0,275.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1459,20.0,68.0,9717.0,5.0,6.0,1950.0,1996.0,0.0,49.0,1029.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### Model 1
Decision Tree Classifier
All Columns without sorting

In [None]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Define the hyperparameters for the DecisionTreeClassifier
hyperparams = {
    'criterion': 'entropy',     # Function to measure the quality of a split
    'max_depth': 3,             # Limits the depth of the tree to prevent overfitting
    'min_samples_split': 20,    # The minimum number of samples required to split an internal node
    'min_samples_leaf': 10,     # The minimum number of samples required to be at a leaf node
    'random_state': 42          # Ensures reproducibility of the results
}

# Update the model pipeline with the new DecisionTreeClassifier parameters
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(**hyperparams))
])

# Fit the model
model_pipeline.fit(X, y)

model_pipeline

ValueError: Specifying the columns using strings is only supported for dataframes.

In [None]:
validation_data[numerical_cols] = validation_data[numerical_cols].fillna(0)
validation_data[categorical_cols] = validation_data[categorical_cols].fillna('None')


X_test = validation_data

y_pred = model_pipeline.predict(X_test)

kaggle_submission = pd.DataFrame(y_pred, columns=['Saleprice'], index=X_test.index)
kaggle_submission

ValueError: Found unknown categories ['None'] in column 0 during transform


### Model 2



### Model 3



### Model 4

