## 2. Feature Engineering: The 'KNN Region Imputation' Strategy

**Innovation**: Instead of relying on unsupervised clustering, we use KNN (K-Nearest Neighbors) to impute the actual region for properties labeled as 'Autres villes' based on similar properties in the same city that have known regions.

**Method**: For each city, we train a KNN classifier on properties with known regions using physical attributes (Size, Rooms, Bathrooms, Price per m²). The model then predicts the most likely region for properties with 'Autres villes' based on their similarity to properties with known regions. This preserves the actual neighborhood information while filling in missing region data intelligently.

In [None]:
# --- 2.1 KNN Region Imputation for 'Autres villes' ---
from sklearn.model_selection import cross_val_score

# Module-level registries filled in by impute_region_with_knn so the fitted
# per-city objects can be reused later for inference on new data.
# city -> {'scaler', 'knn', 'label_encoder', 'best_k'}
CITY_KNN_MODELS: dict = {}
# city -> k selected by cross-validation for that city
BEST_K_PER_CITY: dict = {}

def find_best_k_for_city(X_train_scaled, y_train_encoded, k_range=(2, 5)):
    """
    Choose the number of neighbours for a KNN classifier by cross-validation.

    Each k in the inclusive range ``k_range`` is scored with the mean
    cross-validated accuracy of a distance-weighted KNeighborsClassifier
    (at most 5 folds). The k with the highest mean accuracy wins; on a tie
    the smallest k is kept.

    Args:
        X_train_scaled: Feature matrix (one row per sample), already scaled.
        y_train_encoded: Class label for each row of ``X_train_scaled``.
        k_range: ``(low, high)`` bounds, both inclusive, of the k values to try.

    Returns:
        A tuple ``(best_k, best_score, results)`` where ``results`` maps each
        evaluated k to its mean CV accuracy. When no k can be evaluated
        (too few samples, or no class with at least 2 members) the result is
        ``(k_range[0], -1, {})``.
    """
    from collections import Counter

    best_k = k_range[0]
    best_score = -1
    results = {}

    n_samples = len(X_train_scaled)
    largest_class = max(Counter(y_train_encoded).values(), default=0)

    # cross_val_score stratifies the folds for classifiers; the splitter
    # raises ValueError when n_splits exceeds the size of *every* class, so
    # the fold count is capped by the largest class as well as by n_samples.
    cv_folds = min(5, n_samples, largest_class)
    if cv_folds < 2:
        # Cross-validation needs at least 2 folds; nothing can be scored.
        return best_k, best_score, results

    # Each model is fit on n_samples minus one test fold. The largest test
    # fold holds ceil(n_samples / cv_folds) rows, and k may not exceed the
    # number of rows the model was fit on or prediction fails.
    largest_test_fold = -(-n_samples // cv_folds)
    min_train_size = n_samples - largest_test_fold

    for k in range(k_range[0], k_range[1] + 1):
        if k > min_train_size:
            continue
        knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
        scores = cross_val_score(
            knn, X_train_scaled, y_train_encoded, cv=cv_folds, scoring='accuracy'
        )
        mean_score = scores.mean()
        results[k] = mean_score

        # Strict '>' keeps the smallest k on ties (and ignores NaN scores).
        if mean_score > best_score:
            best_score = mean_score
            best_k = k

    return best_k, best_score, results

def impute_region_with_knn(df):
    """
    Impute the region of 'autres villes' rows with a per-city KNN classifier.

    For each city, rows with a real region value train a distance-weighted
    KNN classifier on size, room count, bathroom count and price per m2
    (k chosen from 2 to 5 by cross-validation). The classifier then predicts
    a region for that city's rows labelled 'autres villes'. Cities with 10 or
    fewer known rows, or with no 'autres villes' rows, are skipped.

    Args:
        df: DataFrame with columns 'city', 'region', 'size', 'room_count',
            'bathroom_count' and 'price_per_m2'. It is not modified.

    Returns:
        A copy of ``df`` with an extra 'imputed_region' column: the predicted
        region where one was imputed, otherwise the original 'region' value.

    Side effects:
        Prints a progress report per city, and records the fitted scaler,
        classifier, label encoder, chosen k and training feature medians in
        ``CITY_KNN_MODELS`` and the chosen k in ``BEST_K_PER_CITY``.
    """
    df = df.copy()
    df['imputed_region'] = df['region'].copy()

    knn_features = ['size', 'room_count', 'bathroom_count', 'price_per_m2']
    region = df['region']
    is_placeholder = region == 'autres villes'
    # A missing region is not a usable training label, so it is not "known".
    has_real_region = region.notna() & ~is_placeholder

    # A NaN city matches no rows and has no .upper(); leave it out of the loop.
    for city in df['city'].dropna().unique():
        city_mask = df['city'] == city
        known_mask = city_mask & has_real_region
        unknown_mask = city_mask & is_placeholder

        known_data = df[known_mask]
        unknown_data = df[unknown_mask]

        print(f"\n{str(city).upper()}: {len(known_data)} with known region, {len(unknown_data)} with 'autres villes'")

        if len(known_data) <= 10 or len(unknown_data) == 0:
            print("  Skipped (insufficient data)")
            continue

        # Fill gaps with the training medians, and reuse the same medians for
        # the rows being predicted so both sides are preprocessed identically.
        feature_medians = known_data[knn_features].median()
        X_train = known_data[knn_features].fillna(feature_medians)
        y_train = known_data['region']

        # Encode labels
        label_encoder = LabelEncoder()
        y_train_encoded = label_encoder.fit_transform(y_train)

        # Scale features so no single attribute dominates the distance
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        # Find best k using cross-validation (try k from 2 to 5)
        best_k, best_score, k_results = find_best_k_for_city(
            X_train_scaled, y_train_encoded, k_range=(2, 5)
        )
        BEST_K_PER_CITY[city] = best_k

        print(f"  K evaluation: {k_results}")
        print(f"  Best k = {best_k} (CV accuracy: {best_score:.3f})")

        # Train final KNN classifier with best k on all known rows
        knn = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
        knn.fit(X_train_scaled, y_train_encoded)

        # Save everything needed to repeat the preprocessing at inference time
        CITY_KNN_MODELS[city] = {
            'scaler': scaler,
            'knn': knn,
            'label_encoder': label_encoder,
            'best_k': best_k,
            'feature_medians': feature_medians,
        }

        # Predict regions for 'autres villes'
        X_unknown = unknown_data[knn_features].fillna(feature_medians)
        X_unknown_scaled = scaler.transform(X_unknown)
        predicted_encoded = knn.predict(X_unknown_scaled)
        predicted_regions = label_encoder.inverse_transform(predicted_encoded)

        # Assign through the boolean mask (row order matches unknown_data) so
        # the write stays correct even if the index has duplicate labels.
        df.loc[unknown_mask, 'imputed_region'] = predicted_regions

        print(f"  Predicted regions distribution: {pd.Series(predicted_regions).value_counts().to_dict()}")

    return df

# Run the imputation on the working DataFrame, then summarise the outcome:
# the k chosen per city, which cities got a model, and the final region counts.
print("Imputing regions for 'Autres villes' using KNN with automatic k selection...\n")
df = impute_region_with_knn(df)

separator = "=" * 50
fitted_cities = list(CITY_KNN_MODELS)
region_counts = df['imputed_region'].value_counts()

print("\n" + separator)
print("Best k per city:", BEST_K_PER_CITY)
print("Models saved for cities:", fitted_cities)
print("\nRegion distribution after imputation:")
print(region_counts)