# Model Training and Evaluation

This notebook handles:
- XGBoost model training
- Model evaluation
- Feature importance analysis
- Visualization of results

In [None]:
# ============================================================================
# XGBOOST MACHINE LEARNING MODEL FOR APARTMENT PRICE PREDICTION
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [None]:
# Load the preprocessed data
# NOTE(review): the path is absolute ('/content/...'), which looks like a
# Colab-style location — confirm the file exists there, or adjust the path
# before running this notebook in another environment.
df = pd.read_csv('/content/data_prices_cleaned.csv')

# Apply the same preprocessing steps
def clean_numeric_column(series):
    """Convert a column of number-like text into a numeric Series.

    Every value is first rendered as a string, all spaces are removed and
    commas are turned into dots. A fixed list of known non-numeric
    placeholders is then blanked out, and whatever still cannot be parsed
    as a number is coerced to NaN.

    Parameters
    ----------
    series : pandas.Series
        Column to convert.

    Returns
    -------
    pandas.Series
        Numeric values, with missing entries wherever parsing failed.
    """
    # NOTE: spaces are stripped before the placeholder replacement, so the
    # placeholders that themselves contain a space cannot match any more;
    # those values are still turned into NaN by errors='coerce' below.
    placeholders = ['√Ä Vendre', '√Ä Louer', 'Location', '+', 'Ref924a', 'IFC Marsa', 'sale', 'nan', 'None']

    as_text = series.astype(str)
    for old, new in ((' ', ''), (',', '.')):
        as_text = as_text.str.replace(old, new, regex=False)

    without_placeholders = as_text.replace(placeholders, pd.NA)
    return pd.to_numeric(without_placeholders, errors='coerce')

# Parse the raw text columns into numbers; unparseable entries become NaN.
df['superficie'] = clean_numeric_column(df['superficie'])
df['chambres'] = clean_numeric_column(df['chambres'])
df['salles_de_bains'] = clean_numeric_column(df['salles_de_bains'])
df['price'] = clean_numeric_column(df['price'])

# Switch to the English column names used by the modelling cells.
df = df.rename(columns={'superficie': 'size', 'chambres': 'room_count', 'salles_de_bains': 'bathroom_count'})

# Keep only apartment sales located in the four listed states.
grand_tunis_states = ['Ben Arous', 'Tunis', 'La Manouba', 'Ariana']
df = df[(df['transaction'] == 'sale') & (df['category'] == 'Appartements') & (df['state'].isin(grand_tunis_states))].copy()

# Rescale the price by 1/1000. Every price threshold below, and every
# downstream metric, is expressed in this rescaled unit.
df['price'] = df['price']/1000

# Sanity / outlier filters. Comparisons against NaN are False, so the
# positive filters also discard rows whose value could not be parsed.
df = df[(df['size'] < 500) & (df['size'] >= 24)]
df = df[df['price']>20]
df = df[~(df['price']/df['size']>6)]
df = df[~((df['size'] > 70) & (df['price'] < 70))]
# Fixed: `> 1000` must be applied to the price itself. The closing
# parenthesis used to sit right after df['price'], so the comparison was
# applied to the result of the `&` instead of to the price.
df = df[~((df['size'] < 90) & (df['price'] > 1000))]
df = df[(df['room_count']>0) & (df['room_count']<10)]
df = df[df['bathroom_count']>=0]

# Drop the columns that are not used as features or target.
df = df.drop(columns=['contact', 'category', 'location', 'descriptions', 'currency' , 'date','transaction','titles','shops','profiles'])

# Final guard against missing values in the modelling columns
# (assignment instead of inplace=True so the cell reads as a plain pipeline).
df = df.dropna(subset=['price', 'size', 'room_count', 'bathroom_count'])

print(f"Data loaded: {df.shape}")

## Step 1: Data Preparation

In [None]:
print("\nüìä STEP 1: DATA PREPARATION")
print("-" * 80)

# Predictors are the three numeric listing attributes; the target is the price.
X = df.loc[:, ['room_count', 'bathroom_count', 'size']]
y = df.loc[:, 'price']

# Report the shapes and names of what goes into the model.
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")
print(f"Target column: price")

# Count any NaN values still present in the predictors and the target.
missing_per_feature = X.isnull().sum()
missing_in_target = y.isnull().sum()
print(f"\nMissing values in features:\n{missing_per_feature}")
print(f"Missing values in target: {missing_in_target}")

## Step 2: Train-Test Split

In [None]:
print("\nüîÄ STEP 2: SPLITTING DATA")
print("-" * 80)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report each partition's size and its share of the full dataset.
train_share = len(X_train)/len(X)*100
test_share = len(X_test)/len(X)*100
print(f"Training set size: {len(X_train)} ({train_share:.1f}%)")
print(f"Test set size: {len(X_test)} ({test_share:.1f}%)")

## Step 3: Feature Scaling

In [None]:
print("\n‚öñÔ∏è STEP 3: FEATURE SCALING")
print("-" * 80)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úì Features scaled using StandardScaler")

## Step 4: Build XGBoost Model

In [None]:
print("\nü§ñ STEP 4: BUILDING XGBOOST MODEL")
print("-" * 80)

# Hyperparameters collected in one place so they are easy to read and tweak.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'objective': 'reg:squarederror',
}

# Build the regressor from the settings above.
model = xgb.XGBRegressor(**xgb_params)

# Fit on the scaled training features.
print("Training model...")
model.fit(X_train_scaled, y_train)
print("‚úì Model trained successfully!")

## Step 5: Make Predictions

In [None]:
print("\nüéØ STEP 5: MAKING PREDICTIONS")
print("-" * 80)

# Predict on both partitions (train first, then test) with the fitted model.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

print("‚úì Predictions generated")

## Step 6: Model Evaluation

In [None]:
print("\nüìà STEP 6: MODEL EVALUATION")
print("="*80)

# Unit note: `price` was divided by 1000 in the data-loading cell, so MAE and
# RMSE are on that rescaled target. The labels say "thousand TND" (assuming
# the raw prices are in TND) instead of the previous, misleading "TND".

# Training metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

# Test metrics
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print("\nüìä TRAINING SET PERFORMANCE:")
print(f"  MAE (Mean Absolute Error): {train_mae:.2f} thousand TND")
print(f"  RMSE (Root Mean Squared Error): {train_rmse:.2f} thousand TND")
print(f"  R¬≤ Score: {train_r2:.4f}")

print("\nüìä TEST SET PERFORMANCE:")
print(f"  MAE (Mean Absolute Error): {test_mae:.2f} thousand TND")
print(f"  RMSE (Root Mean Squared Error): {test_rmse:.2f} thousand TND")
print(f"  R¬≤ Score: {test_r2:.4f}")

# Cross-validation on the training partition only.
# NOTE: X_train_scaled was produced by a scaler fitted on the whole training
# set, so the folds share the same scaling statistics.
print("\nüîÑ CROSS-VALIDATION (5-fold):")
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5,
                            scoring='r2')
print(f"  R¬≤ Scores: {cv_scores}")
# The spread is reported as two standard deviations around the mean.
print(f"  Mean R¬≤: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## Step 7: Feature Importance

In [None]:
print("\nüéØ STEP 7: FEATURE IMPORTANCE")
print("-" * 80)

# Pair each feature name with the importance reported by the fitted model,
# then rank from most to least important.
feature_importance = (
    pd.DataFrame({'Feature': X.columns})
    .assign(Importance=model.feature_importances_)
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance)

## Step 8: Visualizations

In [None]:
print("\nüìä STEP 8: GENERATING VISUALIZATIONS")
print("-"*80)

# `price` was divided by 1000 in the data-loading cell, so every price-like
# axis is on that rescaled target. The axis labels now say so (assuming the
# raw prices are in TND) instead of the previous, misleading "TND".
price_unit = 'thousand TND'

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('XGBoost Model Performance Analysis', fontsize=16, fontweight='bold')

# Residual = actual - predicted, for each partition.
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

# Column 0 shows the training partition, column 1 the test partition.
# Row 0 is actual-vs-predicted, row 1 is the residual plot.
panels = [
    ('Training Set', 'blue', y_train, y_train_pred, train_residuals, train_r2),
    ('Test Set', 'green', y_test, y_test_pred, test_residuals, test_r2),
]
for col, (panel_name, color, actual, predicted, residuals, r2_value) in enumerate(panels):
    # Actual vs predicted, with the y = x reference line spanning the actual range.
    ax = axes[0, col]
    ax.scatter(actual, predicted, alpha=0.5, color=color, s=20)
    lo, hi = actual.min(), actual.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax.set_xlabel(f'Actual Price ({price_unit})', fontweight='bold')
    ax.set_ylabel(f'Predicted Price ({price_unit})', fontweight='bold')
    ax.set_title(f'{panel_name}: Actual vs Predicted\nR¬≤ = {r2_value:.4f}',
                 fontweight='bold')
    ax.grid(alpha=0.3)

    # Residuals against predictions, with a zero-error reference line.
    ax = axes[1, col]
    ax.scatter(predicted, residuals, alpha=0.5, color=color, s=20)
    ax.axhline(y=0, color='r', linestyle='--', lw=2)
    ax.set_xlabel(f'Predicted Price ({price_unit})', fontweight='bold')
    ax.set_ylabel(f'Residuals ({price_unit})', fontweight='bold')
    ax.set_title(f'{panel_name}: Residual Plot', fontweight='bold')
    ax.grid(alpha=0.3)

# Top-right: feature importance as horizontal bars.
axes[0, 2].barh(feature_importance['Feature'], feature_importance['Importance'],
                color='coral')
axes[0, 2].set_xlabel('Importance', fontweight='bold')
axes[0, 2].set_title('Feature Importance', fontweight='bold')
axes[0, 2].grid(axis='x', alpha=0.3)

# Bottom-right: distribution of the test-set prediction errors.
axes[1, 2].hist(test_residuals, bins=30, color='purple', alpha=0.7, edgecolor='black')
axes[1, 2].set_xlabel(f'Prediction Error ({price_unit})', fontweight='bold')
axes[1, 2].set_ylabel('Frequency', fontweight='bold')
axes[1, 2].set_title('Distribution of Prediction Errors (Test Set)', fontweight='bold')
axes[1, 2].axvline(x=0, color='r', linestyle='--', lw=2)
axes[1, 2].grid(alpha=0.3)

plt.tight_layout()
plt.show()