In [7]:
from pathlib import Path

import kagglehub
import pandas as pd


# Download the latest version of the dataset; the returned value is the local
# directory that holds the downloaded files (it is printed just below).
path = kagglehub.dataset_download("vjchoudhary7/customer-segmentation-tutorial-in-python")

print("Path to dataset files:", path)

# Join directory and file name with pathlib rather than string concatenation
# so the path separator is always right for the current operating system.
df = pd.read_csv(Path(path) / "Mall_Customers.csv")
df.head()



Path to dataset files: C:\Users\Fitia\.cache\kagglehub\datasets\vjchoudhary7\customer-segmentation-tutorial-in-python\versions\1


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


# 1. Chargement des données

In [13]:
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed
# to load `sklearn.datasets`, so `sklearn.datasets.<name>` could fail with an
# AttributeError depending on the installed version and prior imports.
import sklearn.datasets
import pandas as pd

# NOTE: this rebinds `df`, replacing the Mall Customers frame loaded earlier.
# The following sections reference the 'MedHouseVal' column created here, so
# this cell must complete successfully before they are run.
# NOTE(review): the loader downloads the data when it is not cached locally,
# so it needs working network/SSL access on first use.
ds = sklearn.datasets.fetch_california_housing()
df = pd.DataFrame(ds.data, columns=ds.feature_names)
df['MedHouseVal'] = ds.target  # append the target variable
df.head()


URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

# 2. Nettoyage et prétraitement

In [9]:
# Report the number of missing values per column, followed by each column's
# dtype (two separate reports, one after the other).
print(df.isna().sum(), df.dtypes, sep="\n")

# Template for a dtype conversion, should one be required:
# df['col'] = df['col'].astype('type')

# Drop fully duplicated rows, if any
df = df.drop_duplicates()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64
CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object


# 3. Exploration des données

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive statistics
print(df.describe())

# Histograms of every column
df.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Box plots (the geographic coordinates are left out)
plt.figure(figsize=(12, 6))
sns.boxplot(data=df.drop(['Latitude', 'Longitude'], axis=1))
plt.xticks(rotation=45)
plt.show()

# Scatter plots: target against each feature.
# The grid is sized from the number of features instead of a fixed 2x3 layout:
# the California housing data has 8 feature columns (per the scikit-learn
# dataset description), which overflowed the 6 available axes and raised an
# IndexError on the seventh feature.
feature_cols = df.columns.drop('MedHouseVal')  # every column except the target
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
flat_axes = axes.ravel()  # squeeze=False keeps `axes` 2-D, so ravel is always valid
for ax, col in zip(flat_axes, feature_cols):
    sns.scatterplot(x=col, y='MedHouseVal', data=df, ax=ax)
# Hide the trailing axes that received no plot
for ax in flat_axes[len(feature_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

# Correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Matrice de corrélation')
plt.show()

# 4. Split train/validation/test

In [17]:
from sklearn.model_selection import train_test_split

# Predictors are every column but the target; the target is 'MedHouseVal'
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']

# Step 1: hold out 20% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: a quarter of the remaining 80% becomes the validation set,
# giving a 60/20/20 train/validation/test split overall
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

KeyError: "['MedHouseVal'] not found in axis"

# 5. Feature engineering

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardisation: the scaling statistics are learned from the training split
# only, then the same fitted scaler is applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# 6. Modélisation initiale

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit the baseline linear regression on the scaled training data
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the validation split
y_pred = lr.predict(X_val_scaled)

# Validation metrics, computed in the order MSE, MAE, R²
mse, mae, r2 = (
    metric(y_val, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Régression Linéaire - Validation Set:")
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

# 7. Régularisation

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths, shared by both searches
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Ridge: 5-fold grid search over alpha, scored by negative MSE
ridge = Ridge()
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train_scaled, y_train)

# Best Ridge estimator and its validation predictions
best_ridge = ridge_grid.best_estimator_
y_pred_ridge = best_ridge.predict(X_val_scaled)

# Lasso: same search settings
lasso = Lasso()
lasso_grid = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train_scaled, y_train)

# Best Lasso estimator and its validation predictions
best_lasso = lasso_grid.best_estimator_
y_pred_lasso = best_lasso.predict(X_val_scaled)

# Comparaison des modèles
def evaluate_model(y_true, y_pred, model_name, split_name="Validation Set"):
    """Print MSE, MAE and R² for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label printed at the start of the header line.
        split_name: Name of the data split being evaluated, printed after the
            model label. Defaults to "Validation Set" so existing three-argument
            calls print exactly what they did before; pass e.g. "Test Set" when
            scoring another split so the header is not mislabeled.

    Returns:
        None. The metrics are only printed.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} - {split_name}:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

# Report validation metrics for the three models, in the same order as before
for val_predictions, model_label in (
    (y_pred, "Régression Linéaire"),
    (y_pred_ridge, "Ridge Regression"),
    (y_pred_lasso, "Lasso Regression"),
):
    evaluate_model(y_val, val_predictions, model_label)

# 8. Évaluation finale sur le test set

In [16]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import resample

# Selection of the final model (example: Ridge)
final_model = best_ridge

# Predictions on the held-out test set
y_test_pred = final_model.predict(X_test_scaled)

# Final metrics. They are printed here directly: a three-argument call to
# evaluate_model labels its header "Validation Set", which produced the
# misleading header "... - Test Set - Validation Set:" for test-set results.
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print("Meilleur modèle (Ridge) - Test Set:")
print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, R²: {test_r2:.4f}")

# Bootstrap confidence interval for R² on the test set.
# A dedicated RandomState is passed to `resample` so the result is
# reproducible without reseeding NumPy's global random generator.
rng = np.random.RandomState(42)
bootstrap_r2 = []
for _ in range(1000):
    # Draw a same-size sample of test rows with replacement and rescore it
    X_bs, y_bs = resample(X_test_scaled, y_test, random_state=rng)
    y_pred_bs = final_model.predict(X_bs)
    bootstrap_r2.append(r2_score(y_bs, y_pred_bs))

# 2.5th and 97.5th percentiles bound the central 95% of the bootstrap scores
ci_low, ci_high = np.percentile(bootstrap_r2, [2.5, 97.5])
print(f"\nIntervalle de confiance à 95% pour R²:")
print(f"({ci_low:.4f}, {ci_high:.4f})")

NameError: name 'best_ridge' is not defined

# 9. Conclusions et interprétation

In [14]:
# Pair each feature name with its fitted coefficient and order the rows by
# absolute coefficient value, largest first.
coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': final_model.coef_})
    .sort_values(by='Coefficient', key=lambda values: values.abs(), ascending=False)
)

print("\nImportance des variables (coefficients standardisés):")
print(coefficients)

# Horizontal bar chart of the coefficients
plt.figure(figsize=(10, 6))
sns.barplot(data=coefficients, x='Coefficient', y='Feature')
plt.title('Importance des variables dans le modèle Ridge')
plt.show()

# Closing summary: the three features with the largest absolute coefficients,
# followed by fixed remarks and suggested next steps.
print("\nConclusions:")
print("- Variables les plus importantes: ", list(coefficients['Feature'].head(3)))
print(
    "- Performance acceptable mais pourrait être améliorée",
    "- Pistes d'amélioration:",
    "  * Feature engineering supplémentaire",
    "  * Essayer d'autres algorithmes (Random Forest, Gradient Boosting)",
    "  * Collecte de données supplémentaires si possible",
    sep="\n",
)

NameError: name 'X' is not defined