# Import

In [150]:
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer
import pandas as pd

# Read Data

In [133]:
train_data = pd.read_csv("data/train.csv").drop(columns=['Unnamed: 0', 'N°DPE'])
test_data = pd.read_csv("data/test.csv").drop(columns=['Unnamed: 0', 'N°DPE'])
# val_data = pd.read_csv("data/val.csv").drop(columns=['N°DPE'])

# Data preprocessing

In [151]:
def train_test_split():
    X_train  = train_data.drop(columns=['Etiquette_DPE'])
    y_train  = train_data['Etiquette_DPE']

    X_test = test_data.drop(columns=["Etiquette_DPE"])
    y_test = test_data["Etiquette_DPE"]

    return X_train, y_train, X_test, y_test

In [152]:
def to_categorical():
    X_train, y_train, X_test, y_test = train_test_split()

    for dataset in [X_train, X_test]:
        dataset[dataset.select_dtypes(['object']).columns] = dataset.select_dtypes(['object']).apply(lambda x: x.astype('category').cat.codes)

    return X_train, y_train, X_test, y_test

In [153]:
def fill_nan():
    X_train, y_train, X_test, y_test = to_categorical()

    for col in X_train.select_dtypes(include=['float64', 'int64']).columns:
        median = X_train[col].median()
        X_train[col].fillna(median, inplace=True)
        X_test[col].fillna(median, inplace=True)
    
    for col in X_train.select_dtypes(include=['object', 'category']).columns:
        mode = X_train[col].mode()[0]
        X_train[col].fillna(mode, inplace=True)
        X_test[col].fillna(mode, inplace=True)
    
    return X_train, y_train, X_test, y_test

In [154]:
def knn_imputer(n_neighbors: int = 2):
    X_train, y_train, X_test, y_test = to_categorical()

    imputer = KNNImputer(n_neighbors=n_neighbors)
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    return X_train, y_train, X_test, y_test

pipelines

In [None]:
def build_pipeline_preprocessing(onehot: bool = True):

    numerical_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = train_data.select_dtypes(include=['object', 'category']).columns

    transformers = [
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder() if onehot else OrdinalEncoder(), categorical_cols)
    ]
    preprocessor = ColumnTransformer(transformers)

    steps = [
        ("preprocessor", preprocessor),
        ('imputer', KNNImputer(n_neighbors=2)),
    ]

    return steps
    

# HistGradientBoostingClassifier
no pre-processing

In [155]:
X_train, y_train, X_test, y_test = to_categorical()

gradient_boosting_clf = HistGradientBoostingClassifier()
gradient_boosting_clf.fit(X_train, y_train)

y_pred_val = gradient_boosting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9800


with fill nan

In [156]:
X_train, y_train, X_test, y_test = fill_nan()

gradient_boosting_clf = HistGradientBoostingClassifier()
gradient_boosting_clf.fit(X_train, y_train)

y_pred_val = gradient_boosting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9794


In [157]:
X_train, y_train, X_test, y_test = fill_nan()

gradient_boosting_clf = HistGradientBoostingClassifier(learning_rate=0.01, max_iter=1000)
gradient_boosting_clf.fit(X_train, y_train)

y_pred_val = gradient_boosting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9868


with KNNImputer

In [158]:
X_train, y_train, X_test, y_test = knn_imputer()

gradient_boosting_clf = HistGradientBoostingClassifier()
gradient_boosting_clf.fit(X_train, y_train)

y_pred_val = gradient_boosting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

KeyboardInterrupt: 

with pipeline

In [None]:
steps = build_pipeline_preprocessing()
steps.append(('classifier', HistGradientBoostingClassifier()))
pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)
y_pred_val = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

# RadomForest
with fill nan

In [149]:
X_train, y_train, X_test, y_test = fill_nan()

clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)

y_pred_val = clf_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9921


with KNNImputer

In [None]:
X_train, y_train, X_test, y_test = knn_imputer()

clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)

y_pred_val = clf_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
# Custom transformer for numerical columns
class NumericalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.medians = X.median()
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in X_copy.columns:
            X_copy[col].fillna(self.medians[col], inplace=True)
        return X_copy

# Custom transformer for categorical columns
class CategoricalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.modes = X.mode().iloc[0]
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in X_copy.columns:
            X_copy[col].fillna(self.modes[col], inplace=True)
        return X_copy

# SimpleImputer