In [26]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [27]:
# Read the training set from disk and preview its first rows
# (DataFrame.head() shows five rows by default).
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [28]:
# Find the features that are missing in at least half of the rows.
# This cell only reports them; the drop itself is done in the next cell
# with an explicit column list.
threshold = 0.5 * len(df)
columns_to_drop = df.columns[df.isna().sum().ge(threshold)]
print(list(columns_to_drop))

['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']


In [29]:
# Remove the five mostly-missing features reported by the previous cell,
# together with Id, MSSubClass and MSZoning.
df = df.drop(
    columns=[
        'Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature',
        'Id', 'MSSubClass', 'MSZoning',
    ]
)

In [30]:
# Preview the frame after the column drop (head() defaults to five rows).
df.head()

Unnamed: 0,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [31]:
def preprocess(df):
    """Encode ordinal quality columns as integers and fill missing values.

    The input frame is left untouched; a processed copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing data. Every column handled here is optional: columns
        that are absent are skipped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which
        * the basement / fireplace / garage quality columns are integers,
          with 0 standing for a missing value, and
        * the columns listed in ``fill_values`` have their missing entries
          replaced by the configured default.

    Raises
    ------
    ValueError
        If an ordinal column contains a label that is neither in its mapping
        nor numeric.
    """
    # Work on a copy so the caller's frame is never modified in place.
    df = df.copy()

    # Mapping used to replace the textual grades with numbers.
    mappings = {
        "BsmtQual": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "BsmtCond": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "BsmtExposure": {"No": 1, "Mn": 2, "Av": 3, "Gd": 4},
        "BsmtFinType1": {"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6},
        "BsmtFinType2": {"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6},
        "FireplaceQu": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "GarageQual": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "GarageCond": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    }
    for col, mapping in mappings.items():
        if col in df.columns:
            # Translate known labels and keep everything else as-is, so that
            # already-encoded integers survive a second call. This avoids
            # Series.replace, whose implicit object->int downcast is
            # deprecated and emitted a FutureWarning for every column.
            recoded = df[col].map(lambda value, m=mapping: m.get(value, value))
            # to_numeric raises on unknown text labels instead of silently
            # keeping them; missing values become 0 before the int cast.
            df[col] = pd.to_numeric(recoded).fillna(0).astype(int)

    # Most frequent Electrical value, with a fallback when the column is
    # absent or entirely missing (mode() is empty in that case).
    electrical_fill = "Unknown"
    if "Electrical" in df.columns:
        electrical_modes = df["Electrical"].mode()
        if not electrical_modes.empty:
            electrical_fill = electrical_modes.iloc[0]

    # Fill the missing values of the remaining columns.
    fill_values = {
        "LotFrontage": df["LotFrontage"].mean() if "LotFrontage" in df.columns else 0,
        "MasVnrArea": 0,
        "Electrical": electrical_fill,
        "GarageType": "No Garage",
        "GarageYrBlt": 0,
        "GarageFinish": "No Garage",
        "Utilities": "AllPub",
        "Exterior1st": "VinylSd",
        "Exterior2nd": "VinylSd",
        "BsmtFinSF2": 0,
        "BsmtUnfSF": 0,
        "TotalBsmtSF": 0,
        "BsmtFinSF1": 0,
        "BsmtFullBath": 0,
        "BsmtHalfBath": 0,
        "KitchenQual": "TA",
        "Functional": "Typ",
        "GarageCars": 2,
        "GarageArea": 0,
        "SaleType": "WD"
    }
    for col, val in fill_values.items():
        if col in df.columns:
            df[col] = df[col].fillna(val)
    return df

In [32]:
# Run preprocess() on the training frame, then print each column's dtype and
# non-null count to check the result.
df = preprocess(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 73 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotFrontage    1460 non-null   float64
 1   LotArea        1460 non-null   int64  
 2   Street         1460 non-null   object 
 3   LotShape       1460 non-null   object 
 4   LandContour    1460 non-null   object 
 5   Utilities      1460 non-null   object 
 6   LotConfig      1460 non-null   object 
 7   LandSlope      1460 non-null   object 
 8   Neighborhood   1460 non-null   object 
 9   Condition1     1460 non-null   object 
 10  Condition2     1460 non-null   object 
 11  BldgType       1460 non-null   object 
 12  HouseStyle     1460 non-null   object 
 13  OverallQual    1460 non-null   int64  
 14  OverallCond    1460 non-null   int64  
 15  YearBuilt      1460 non-null   int64  
 16  YearRemodAdd   1460 non-null   int64  
 17  RoofStyle      1460 non-null   object 
 18  RoofMatl

  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)


In [33]:
# Show the last rows of the preprocessed frame (tail() defaults to five rows).
df.tail()

Unnamed: 0,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,112,0,0,0,0,4,2010,WD,Normal,142125
1459,75.0,9937,Pave,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,...,0,0,0,0,0,6,2008,WD,Normal,147500


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# Separate the target (SalePrice) from the predictor columns.
Y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

In [36]:
from sklearn.preprocessing import OneHotEncoder

def create_consistent_encoding_sklearn(train_df, test_df, categorical_columns):
    """One-hot encode ``categorical_columns`` identically for two frames.

    The encoder is fitted on ``train_df`` only. Categories that occur only in
    ``test_df`` are encoded as all zeros (``handle_unknown='ignore'``).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Both must contain every column in ``categorical_columns``.
        ``test_df`` must also contain every non-object column of
        ``train_df``; any other column of ``test_df`` is left out of the
        result.
    categorical_columns : list of str
        Columns to one-hot encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_final, test_final)``. Both frames have the same columns in
        the same order: the non-object columns of ``train_df`` followed by
        the one-hot columns. Each keeps the row index of its input.

    Raises
    ------
    KeyError
        If ``test_df`` lacks a non-object column of ``train_df``.

    Notes
    -----
    A non-object column that is also listed in ``categorical_columns`` is
    kept in raw form *and* one-hot encoded.
    """
    # Fit on the training data only.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(train_df[categorical_columns])
    feature_names = encoder.get_feature_names_out(categorical_columns)

    # The training frame decides which raw columns are carried over, so the
    # test frame cannot end up with columns the training frame does not have
    # (for example identifiers that were dropped from the training data only).
    passthrough_columns = train_df.select_dtypes(exclude=['object']).columns.tolist()
    missing = [col for col in passthrough_columns if col not in test_df.columns]
    if missing:
        raise KeyError(f"test_df is missing columns present in train_df: {missing}")

    def _assemble(frame):
        # Reuse the source index: concat(axis=1) aligns on index labels, so
        # a fresh 0..n-1 index would scatter rows (and introduce NaNs)
        # whenever the input is not indexed 0..n-1.
        encoded = pd.DataFrame(
            encoder.transform(frame[categorical_columns]),
            columns=feature_names,
            index=frame.index,
        )
        return pd.concat([frame[passthrough_columns], encoded], axis=1)

    return _assemble(train_df), _assemble(test_df)

In [37]:
def get_categorical_columns(df, include_bool=True):
    """Return the names of the columns of ``df`` that are treated as categorical.

    Columns whose lower-cased name is one of ``id``, ``salesid``, ``saleid``
    or ``sale_id`` are never returned. Any other column is returned when at
    least one of these holds:

    * its dtype is ``object`` or ``category``;
    * its dtype is ``bool`` and ``include_bool`` is true;
    * its dtype is ``int64`` or ``int32`` and it has fewer than 20 distinct
      values;
    * its lower-cased name contains one of ``type``, ``category``, ``code``,
      ``quality``, ``condition`` or ``class`` (whatever its dtype).

    The result keeps the column order of ``df``.
    """
    id_like_names = ('id', 'salesid', 'saleid', 'sale_id')
    name_hints = ('type', 'category', 'code', 'quality', 'condition', 'class')

    def _looks_categorical(name):
        series = df[name]
        kind = series.dtype
        distinct = series.nunique()

        if kind == 'object' or kind == 'category':
            return True
        if include_bool and kind == 'bool':
            return True
        # Low-cardinality integers are assumed to be encoded categories.
        if kind in ['int64', 'int32'] and distinct < 20:
            return True
        lowered = name.lower()
        return any(hint in lowered for hint in name_hints)

    return [
        name
        for name in df.columns
        if name.lower() not in id_like_names and _looks_categorical(name)
    ]

In [38]:
# Load the submission set, keep its Id column aside, then run preprocess()
# on it.
subm_data = pd.read_csv("test.csv")
ids = subm_data.loc[:, "Id"]
subm_data = preprocess(subm_data)

  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)


In [39]:
# Pick the categorical columns from the training features, then one-hot
# encode the training and submission frames with the same encoder.
# Note: X and subm_data are rebound to their encoded versions here.
categorical_columns = get_categorical_columns(X)
X, subm_data = create_consistent_encoding_sklearn(
    train_df=X,
    test_df=subm_data,
    categorical_columns=categorical_columns,
)

Random Forest Regressor

In [40]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=7
)

# Fit a 200-tree random forest on the training split.
model = RandomForestRegressor(random_state=7, n_estimators=200)
model.fit(X_train, y_train)

In [41]:
# Pair each feature name with the fitted model's importance score and list
# the features from most to least important.
feature_importance = pd.DataFrame(
    {"feature": X.columns, "importance": model.feature_importances_}
).sort_values(by="importance", ascending=False)

print("Feature Importance:")
print(feature_importance)

Feature Importance:
            feature  importance
2       OverallQual    0.521588
19        GrLivArea    0.117883
15      TotalBsmtSF    0.033388
124   OverallQual_7    0.031396
11       BsmtFinSF1    0.025622
..              ...         ...
269  KitchenAbvGr_3    0.000000
235    HeatingQC_Po    0.000000
291  Functional_Sev    0.000000
339    PoolArea_576    0.000000
147   RoofMatl_Roll    0.000000

[374 rows x 2 columns]


In [42]:
# Report MSE and R² on the training split, then on the held-out split.
# The gap between the two shows how much the model overfits.
for header, features, target in (
    ("\nModel Performance on train set:", X_train, y_train),
    ("\nModel Performance on test set:", X_test, y_test),
):
    y_pred = model.predict(features)
    mse = mean_squared_error(target, y_pred)
    r2 = r2_score(target, y_pred)

    print(header)
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R² Score: {r2:.2f}")


Model Performance on train set:
Mean Squared Error: 136342454.27
R² Score: 0.98

Model Performance on test set:
Mean Squared Error: 671198114.83
R² Score: 0.91


In [50]:
import joblib
# Persist whatever estimator is currently bound to `model`. In notebook order
# that is the random forest fitted above.
# NOTE(review): this cell's execution counter is out of sequence with its
# neighbours; confirm on a fresh Restart & Run All that the intended model
# is the one saved.
joblib.dump(model, "notre_model.joblib")

['notre_model.joblib']

In [47]:
# Build the submission from the Ids captured from test.csv (`ids`), so every
# prediction is paired with the row it was computed from. Writing the
# predictions positionally into sample_submission.csv only works if that
# file happens to list the same rows in the same order.
# The DataFrame constructor raises if `ids` and the predictions differ in length.
sample_submission_df = pd.DataFrame({
    'Id': ids,
    'SalePrice': model.predict(subm_data),
})
sample_submission_df.to_csv('submission.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,126350.83
1,1462,151208.075
2,1463,181915.03
3,1464,182341.56
4,1465,201211.215


XGB Regressor

In [69]:
from xgboost import XGBRegressor

# Fit a gradient-boosted tree model on the same training split.
# Note: this rebinds `model`, so the random forest is no longer reachable
# under that name after this cell.
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=2,
    subsample=0.8,
)
model.fit(X_train, y_train)

In [70]:
# Report MSE and R² on the training split, then on the held-out split.
for header, features, target in (
    ("\nModel Performance on train set:", X_train, y_train),
    ("\nModel Performance on test set:", X_test, y_test),
):
    y_pred = model.predict(features)
    mse = mean_squared_error(target, y_pred)
    r2 = r2_score(target, y_pred)

    print(header)
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R² Score: {r2:.3f}")


Model Performance on train set:
Mean Squared Error: 21719078.00
R² Score: 0.996

Model Performance on test set:
Mean Squared Error: 515925024.00
R² Score: 0.929


In [None]:
# Build the second submission from the Ids captured from test.csv (`ids`),
# so every prediction is paired with the row it was computed from rather
# than with the row order of a separate file.
# The DataFrame constructor raises if `ids` and the predictions differ in length.
sample_submission_df = pd.DataFrame({
    'Id': ids,
    'SalePrice': model.predict(subm_data),
})
sample_submission_df.to_csv('submission2.csv', index=False)
sample_submission_df.head()

In [20]:
import joblib
# Persist whatever estimator is currently bound to `model`. In notebook order
# that is the XGBRegressor fitted above.
# NOTE(review): this cell's execution counter is lower than the XGB cells
# above it; confirm on a fresh Restart & Run All that the intended model is
# the one saved.
joblib.dump(model, "mon_deuxieme_model.joblib")

['mon_deuxieme_model.joblib']

In [21]:
import pickle
# Record the training feature names in the order the model was fitted on.
feature_columns = list(X_train.columns)

In [22]:
# Pickle the feature-name list next to the saved models. Presumably this lets
# inference code rebuild the same column order; confirm against the consumer.
with open("features_list.pkl", mode="wb") as pkl_file:
    pickle.dump(feature_columns, pkl_file)