In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Notebook-wide plotting defaults: seaborn's white grid theme, then larger
# default figure size and title/label fonts for every matplotlib figure.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.titlesize': 16,
    'axes.labelsize': 14,
})

In [None]:
# Load the raw listings and append `year_sold`, the calendar year parsed from
# each row's `date` value.
df = pd.read_csv("/kaggle/input/housedata/data.csv").assign(
    year_sold=lambda raw: pd.to_datetime(raw['date']).dt.year
)

In [None]:
# Feature engineering. All derived columns are built in one chained expression
# that returns a new frame, instead of mutating `df` column-by-column and then
# dropping columns in place.
df = (
    df.assign(
        # Years between construction and sale.
        house_age=lambda d: d['year_sold'] - d['yr_built'],
        # NOTE(review): if `sqft_living` already includes the basement, this
        # sum double-counts it -- confirm against the dataset's column docs.
        total_area=lambda d: d['sqft_living'] + d['sqft_basement'],
        # The +0.1 keeps the ratio finite for rows with zero bedrooms.
        bathrooms_per_bedroom=lambda d: d['bathrooms'] / (d['bedrooms'] + 0.1),
        # 1 when `yr_renovated` is anything other than 0, else 0. Vectorised
        # equivalent of the row-wise `.apply(lambda x: 0 if x == 0 else 1)`.
        renovated=lambda d: d['yr_renovated'].ne(0).astype('int64'),
        # WARNING: this column is computed from the target `price`. It is fine
        # for exploration and outlier filtering, but using it as a model input
        # leaks the target. The +1 avoids division by zero.
        price_per_sqft=lambda d: d['price'] / (d['total_area'] + 1),
    )
    # `date` is already summarised by `year_sold` and `yr_renovated` by
    # `renovated`, so the raw columns are no longer needed.
    .drop(columns=['date', 'yr_renovated'])
)

In [None]:
# Sanity filters: keep only rows whose price, living area, bedroom count and
# bathroom count are all above the minimum plausible values.
df = df.query(
    "price > 50000 and sqft_living > 200 and bedrooms > 0 and bathrooms > 0"
)

# Trim the upper tail of each listed column at its 99.5th percentile.
# The trimming is sequential on purpose: every threshold is computed on the
# frame left over by the previous column's trim.
for col in ('price', 'sqft_living', 'sqft_lot', 'total_area', 'price_per_sqft'):
    upper = df[col].quantile(0.995)
    df = df.loc[df[col].lt(upper)]

In [None]:
# Histogram of sale prices with a KDE overlay.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='price', bins=50, kde=True, color='green', ax=ax)
ax.set_title("📉 Distribution of House Prices")
ax.set_xlabel("Price")
ax.set_ylabel("Count")
plt.show()


# Scatter of sale price against living area.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='sqft_living', y='price', alpha=0.5, color='blue', ax=ax)
ax.set_title("📈 Price vs Living Area")
ax.set_xlabel("Living Area (sqft)")
ax.set_ylabel("Price")
plt.show()


# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap. Non-numeric columns are excluded before computing correlations.
numeric_cols = df.select_dtypes(include=np.number)
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    numeric_cols.corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("📊 Correlation Heatmap (Numeric Features Only)")
plt.show()


# One price-vs-feature scatter per engineered column, laid out on a 3x2 grid
# (five panels are used; the sixth slot is left empty).
features_to_plot = ['house_age', 'total_area', 'bathrooms_per_bedroom', 'renovated', 'price_per_sqft']
fig = plt.figure(figsize=(16, 10))
for position, feature_name in enumerate(features_to_plot, start=1):
    panel = fig.add_subplot(3, 2, position)
    sns.scatterplot(data=df, x=feature_name, y='price', alpha=0.5, color='purple', ax=panel)
    panel.set_title(f"📊 Price vs {feature_name}")
fig.tight_layout()
plt.show()

In [None]:
# Target vector.
y = df['price']

# Feature matrix. Besides the target itself, `price_per_sqft` must be removed:
# it is computed as price / (total_area + 1), so together with `total_area` it
# lets the model reconstruct `price` almost exactly (target leakage) and makes
# the hold-out metrics meaningless.
X = df.drop(columns=['price', 'price_per_sqft'])

# Text columns are one-hot encoded; every remaining column is treated as
# numeric and standardised.
categorical_features = ['street', 'city', 'statezip', 'country']
numerical_features = [col for col in X.columns if col not in categorical_features]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [None]:
# End-to-end estimator: column preprocessing followed by an XGBoost regressor.
# Hyperparameters are collected in one mapping so they are easy to scan.
xgb_params = {
    'n_estimators': 700,
    'max_depth': 12,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(**xgb_params)),
])

In [None]:
# Fit the whole pipeline on the training split, then predict the hold-out rows.
# `Pipeline.fit` returns the pipeline itself, so the calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Hold-out metrics: R², mean squared error and its square root (RMSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# The header line is Arabic for "Model performance after deeper cleaning".
print(
    "🏁 أداء النموذج بعد تنظيف أعمق:",
    f"RMSE: {rmse:,.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

In [None]:
# Predicted vs actual prices on the hold-out set. The dashed red diagonal marks
# perfect predictions (predicted == actual) across the observed price range.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, color='darkblue', ax=ax)
price_lo, price_hi = y_test.min(), y_test.max()
ax.plot([price_lo, price_hi], [price_lo, price_hi], 'r--', lw=3)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("📊 Actual vs Predicted Prices")
plt.show()

In [None]:
# Residuals (actual minus predicted) and their distribution with a KDE overlay.
errors = y_test - y_pred
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(errors, bins=50, kde=True, color='red', ax=ax)
ax.set_title("📉 Distribution of Prediction Errors")
ax.set_xlabel("Prediction Error")
plt.show()

In [None]:
# Pull both fitted steps out of the pipeline. Feature names are read from the
# pipeline's own fitted preprocessor (looked up by transformer name) rather
# than from the module-level `preprocessor` object or a positional index, so
# this keeps working if the pipeline ever clones its steps or the transformer
# order changes.
fitted_preprocessor = model.named_steps['preprocessor']
regressor = model.named_steps['regressor']
importances = regressor.feature_importances_

# Column names after transformation, in the ColumnTransformer's output order:
# the scaled numeric block first, then the one-hot encoded columns.
num_features_scaled = (
    fitted_preprocessor.named_transformers_['num']
    .get_feature_names_out(numerical_features)
)
cat_features_encoded = (
    fitted_preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_features)
)
all_features = np.concatenate([num_features_scaled, cat_features_encoded])

# Keep the 20 features with the highest importance, most important first.
fi_df = (
    pd.DataFrame({'Feature': all_features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(20)
)

plt.figure(figsize=(12,8))
sns.barplot(x='Importance', y='Feature', data=fi_df, palette="magma")
plt.title("🔥 Top 20 Feature Importances")
plt.show()