# New Section

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
import os

# Silence all warnings and XGBoost messages
warnings.filterwarnings('ignore')
os.environ['XGBOOST_SILENT'] = '1'

# Load data
try:
    df = pd.read_csv('aaa.csv')
    print("Data loaded successfully. Shape:", df.shape)
except Exception as e:
    print(f"Error loading data: {e}")
    exit()

# Data Preparation - only using required features
required_columns = ['brand', 'model', 'price', 'year', 'mileage', 'brand_classification']
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    print(f"Missing columns: {missing_cols}")
    exit()

# Enhanced Feature Engineering
current_year = pd.Timestamp.now().year
df['car_age'] = current_year - df['year']

# Advanced mileage transformations
df['log_mileage'] = np.log1p(df['mileage'])
df['mileage_per_year'] = df['mileage'] / (df['car_age'] + 1)
df['mileage_penalty'] = df['mileage'] * df['brand_classification'].map({'luxury': 2.2, 'economy': 1.0})
df['inverse_mileage'] = 1 / (df['mileage'] + 1)
df['mileage_squared'] = df['mileage'] ** 2

# Create brand-model interaction feature
df['brand_model'] = df['brand'] + '_' + df['model']

# Encode categorical variables
label_encoders = {}
for col in ['brand', 'model', 'brand_model', 'brand_classification']:
    le = LabelEncoder()
    df[f'{col}_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le

# Prepare features (no engine/fuel/gearbox)
features = [
    'car_age',
    'inverse_mileage',
    'mileage_penalty',
    'log_mileage',
    'mileage_per_year',
    'mileage_squared',
    'brand_encoded',
    'model_encoded',
    'brand_model_encoded',
    'brand_classification_encoded'
]

X = df[features]
y = df['price']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=features)

# Feature selection
selector = SelectKBest(f_regression, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
X = pd.DataFrame(X_selected, columns=selected_features)

# Train-test split with stratification by price
df['price_strata'] = pd.qcut(df['price'], q=4, labels=False)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['price_strata'])

# Optimized XGBoost model (silent mode)
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=7,
    min_child_weight=2,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    monotone_constraints={
        'inverse_mileage': 1,
        'mileage_penalty': -1,
        'log_mileage': -1
    },
    eval_metric=['mae', 'rmse'],
    early_stopping_rounds=30,
    n_jobs=-1,
    verbosity=0  # Silences all messages
)

# Silent training
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False
)

# Evaluation
def evaluate_model(model, X, y, set_name):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    r2 = r2_score(y, preds)
    print(f"\n{set_name} Evaluation:")
    print(f"R²: {r2:.4f}")
    print(f"MAE: €{mae:,.2f}")
    print(f"Avg Price: €{y.mean():,.2f}")
    print(f"MAE % of Avg: {mae/y.mean()*100:.1f}%")
    return preds

print("\nModel Performance:")
train_preds = evaluate_model(model, X_train, y_train, "Training")
test_preds = evaluate_model(model, X_test, y_test, "Test")

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop Predictive Features:")
print(feature_importance)

# Prediction function (no engine/fuel/gearbox)
def predict_car_price(car_data):
    car = car_data.copy()

    # Calculate numeric features
    car['car_age'] = current_year - car['year']
    car['log_mileage'] = np.log1p(car['mileage'])
    car['mileage_per_year'] = car['mileage'] / (car['car_age'] + 1)
    car['mileage_penalty'] = car['mileage'] * (2.2 if car['brand_classification'] == 'luxury' else 1.0)
    car['inverse_mileage'] = 1 / (car['mileage'] + 1)
    car['mileage_squared'] = car['mileage'] ** 2
    car['brand_model'] = car['brand'] + '_' + car['model']

    # Encode categorical variables
    for col in ['brand', 'model', 'brand_model', 'brand_classification']:
        try:
            car[f'{col}_encoded'] = label_encoders[col].transform([car[col]])[0]
        except ValueError:
            car[f'{col}_encoded'] = len(label_encoders[col].classes_)

    # Create and scale feature vector
    feature_vector = [car[col] for col in features if col in selected_features]
    feature_vector_scaled = scaler.transform([feature_vector])[0]

    return model.predict([feature_vector_scaled])[0]

# Test predictions (without engine/fuel/gearbox)
test_cases = [
    {'brand': 'BMW', 'model': 'X5', 'year': 2011, 'mileage': 137566, 'brand_classification': 'luxury'},
    {'brand': 'SEAT', 'model': 'Leon', 'year': 2018, 'mileage': 40250, 'brand_classification': 'economy'}
]
real_prices = [13490,21990]
print("\nPrice Prediction Examples:")
i=0
for case in test_cases:
    price = predict_car_price(case)
    print(f"\n{case['brand']} {case['model']} ({case['year']}, {case['mileage']}km) : Prediction : €{price:,.2f} Real Price: €{real_prices[i]:,.2f}")
    i = i +1
    # Show price sensitivity to mileage
    print("Mileage impact:")
    for mileage in [10000, 30000, 60000, 90000]:
        temp_case = case.copy()
        temp_case['mileage'] = mileage
        price = predict_car_price(temp_case)
        print(f"{mileage:,} km → €{price:,.2f} (Δ: €{price - predict_car_price({**temp_case, 'mileage':10000}):,.0f})")

Data loaded successfully. Shape: (531, 15)

Model Performance:

Training Evaluation:
R²: 0.8855
MAE: €1,505.93
Avg Price: €15,940.52
MAE % of Avg: 9.4%

Test Evaluation:
R²: 0.3976
MAE: €3,222.20
Avg Price: €15,788.88
MAE % of Avg: 20.4%

Top Predictive Features:
                        Feature  Importance
9  brand_classification_encoded    0.435315
0                       car_age    0.117229
4              mileage_per_year    0.081152
6                 brand_encoded    0.065932
8           brand_model_encoded    0.065601
7                 model_encoded    0.060948
2               mileage_penalty    0.058283
1               inverse_mileage    0.045831
3                   log_mileage    0.041456
5               mileage_squared    0.028254

Price Prediction Examples:

BMW X5 (2011, 137566km) : Prediction : €15,704.47 Real Price: €13,490.00
Mileage impact:
10,000 km → €37,970.42 (Δ: €0)
30,000 km → €32,131.71 (Δ: €-5,839)
60,000 km → €18,070.17 (Δ: €-19,900)
90,000 km → €17,656.32 (Δ: €-2