# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the training and test splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Report the (rows, columns) dimensions of the training frame.
train_shape = train.shape
print("Train Shape:", train_shape)

In [None]:
# Report the (rows, columns) dimensions of the test frame.
test_shape = test.shape
print("Test Shape:", test_shape)

In [None]:
# Preview five randomly drawn training rows (no seed is set, so they vary per run).
train.sample(n=5)

In [None]:
# Preview five randomly drawn test rows (no seed is set, so they vary per run).
test.sample(n=5)

# Exploratory Data Analysis (EDA)

### Check missing values

In [None]:
# Count nulls per column and show the ten columns with the most gaps.
null_counts = train.isna().sum()
missing = null_counts.sort_values(ascending=False)
print(missing.iloc[:10])

### Correlation with SalePrice

In [None]:
# Keep only the numeric columns; correlation is undefined for the rest
numeric_cols = train.select_dtypes(include=[np.number])

# Correlation of every numeric column with SalePrice, strongest positive first
saleprice_corr = numeric_cols.corr()['SalePrice'].sort_values(ascending=False)

# Draw the single-column correlation table as an annotated heatmap
plt.figure(figsize=(10,6))
sns.heatmap(saleprice_corr.to_frame(), annot=True, cmap="coolwarm")
plt.show()


### Example scatter plot: GrLivArea vs. SalePrice

In [None]:
# Scatter GrLivArea against SalePrice for the training data.
sns.scatterplot(data=train, x="GrLivArea", y="SalePrice")
plt.show()


# Preprocessing

### Drop columns with too many missing values

In [None]:
# Remove the columns judged to have too many missing values
sparse_cols = ["Alley","PoolQC","Fence","MiscFeature"]
train = train.drop(sparse_cols, axis=1)

### Fill remaining missing values (numeric: median, categorical: mode)

In [None]:
# Separate numeric and categorical columns
num_cols = train.select_dtypes(include=[np.number]).columns.drop('SalePrice')
cat_cols = train.select_dtypes(exclude=[np.number]).columns

# Learn the fill values from the training data only and reuse them for test,
# so both frames are imputed with identical values. Test-derived statistics
# would differ from what the models see during fitting, and a test column
# that is entirely NaN would have no median of its own to fall back on.
num_fill = train[num_cols].median()
cat_fill = train[cat_cols].mode().iloc[0]  # first mode row = most frequent value

# Fill numeric columns with the training median
train[num_cols] = train[num_cols].fillna(num_fill)
test[num_cols] = test[num_cols].fillna(num_fill)

# Fill categorical columns with the training mode (most frequent value)
train[cat_cols] = train[cat_cols].fillna(cat_fill)
test[cat_cols] = test[cat_cols].fillna(cat_fill)


### Encode categorical features

In [None]:
# Encode the training frame first. Every column that get_dummies expands is
# absent from its output, which identifies the categorical columns without
# relying on a dtype list.
train_encoded = pd.get_dummies(train, drop_first=True)
encoded_cols = [col for col in train.columns if col not in train_encoded.columns]

# Give the matching test columns the training levels before encoding.
# Encoding each frame independently with drop_first=True drops each frame's
# own first level; when test lacks the first training level, a different
# baseline is removed and those rows end up encoded as the wrong category
# once the frames are aligned. Levels that occur only in test become missing
# and are encoded as all zeros.
for col in encoded_cols:
    if col in test.columns:
        train_levels = pd.Categorical(train[col]).categories
        test[col] = pd.Categorical(test[col], categories=train_levels)

train = train_encoded
test = pd.get_dummies(test, drop_first=True)


In [None]:
# Align train & test: test receives exactly the training columns (columns it
# lacks are added as 0, test-only columns are discarded).
train, test = train.align(test, join="left", axis=1, fill_value=0)

# The left join leaves a SalePrice column in test (zero-filled when test has
# no target). Remove it so test holds features only and matches the columns
# the models are fitted on.
test = test.drop(columns="SalePrice")

In [None]:
# Split the frame into the target series and the feature matrix
y = train["SalePrice"]
X = train.drop(columns="SalePrice")

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


# Model Training

### Linear Regression

In [None]:
# Fit a linear regression on the training split, then predict the held-out rows
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)


### Random Forest

In [None]:
# Fit a 100-tree random forest (seeded) and predict the held-out rows
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

### XGBoost

In [None]:
# Hyperparameters for the gradient-boosted model, gathered in one place
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 4,
    "random_state": 42,
}

# Fit on the training split and predict the held-out rows
xgb = XGBRegressor(**xgb_params).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_val)


# Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and R² of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values to compare against ``y_true``.
        model_name: Label printed in front of the metrics.

    Returns:
        None. The metrics are only printed, each rounded to two decimals.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"{model_name} - RMSE: {rmse:.2f}, R²: {r2:.2f}")

# Report the hold-out metrics of each fitted model, in the order they were trained
for model_name, val_preds in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_val, val_preds, model_name)


In [None]:
# Predict on exactly the columns the model was fitted on, in the same order.
# The aligned test frame can carry columns the model never saw (for example a
# zero-filled SalePrice), and XGBoost rejects input whose feature names differ
# from the training features.
final_preds = xgb.predict(test[X.columns])

# NOTE(review): the Id below assumes test rows keep their file order and that
# test Ids start at 1461 — confirm, or use the Id column if test.csv has one.
submission = pd.DataFrame({
    "Id": test.index + 1461,  # Kaggle test IDs start after train
    "SalePrice": final_preds
})

# Write the two-column submission file without the DataFrame index
submission.to_csv("submission.csv", index=False)
