In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [None]:
# Read the training set (comma is read_csv's default delimiter) and preview it.
data = pd.read_csv('train.csv')
data.head()

In [None]:
# Per-column missing-value report: absolute count plus share of all rows (%),
# ordered from the most to the least affected column.
df_nulls = data.isnull().sum().to_frame(name='nulls')
df_nulls['%'] = (df_nulls['nulls'] / len(data) * 100).round(2)
df_nulls = df_nulls.sort_values(by='%', ascending=False)
df_nulls

In [None]:
# Missing values in these columns are replaced with an explicit placeholder
# label ("No Pool", "No Basement", ...). Keeping one entry per column in a
# single table makes the rule easy to audit and prevents a column from being
# listed twice or under the wrong heading.
absent_feature_labels = {
    'PoolQC': "No Pool",
    'MiscFeature': "None",
    'Alley': "No alley access",
    'Fence': "No Fence",
    'MasVnrType': "None",
    'FireplaceQu': "No Fireplace",
    # Basement descriptors
    'BsmtQual': "No Basement",
    'BsmtCond': "No Basement",
    'BsmtExposure': "No Basement",
    'BsmtFinType1': "No Basement",
    'BsmtFinType2': "No Basement",
    # Garage descriptors
    'GarageType': "No Garage",
    'GarageFinish': "No Garage",
    'GarageQual': "No Garage",
    'GarageCond': "No Garage",
}

for column, label in absent_feature_labels.items():
    data[column] = data[column].fillna(label)

In [None]:
# Remove the 'Id' column before any further processing.
data = data.drop(columns=['Id'])

In [None]:
# Partition the column names by dtype: numeric vs. everything else.
numeric_columns = list(data.select_dtypes(include='number').columns)
categorical_columns = list(data.select_dtypes(exclude='number').columns)

# numeric columns operations

In [None]:
# Summary statistics for every column that currently has a numeric dtype.
data.select_dtypes(include=[np.number]).describe(include='all')

In [None]:
threshold = 10

# Numeric columns with fewer than `threshold` distinct values are treated as
# categorical from here on.
distinct_counts = data.select_dtypes(include="number").nunique()
categorical_like = distinct_counts[distinct_counts < threshold].index.tolist()
print(categorical_like)

for col in categorical_like:
    data[col] = data[col].astype("category")

# Keep both column lists in sync with the new dtypes.
reclassified = set(categorical_like)
numeric_columns = [name for name in numeric_columns if name not in reclassified]
categorical_columns += categorical_like

In [None]:
# Distinct-value count of each column still tracked as numeric.
unique_counts = pd.DataFrame({"n_uniques": data[numeric_columns].nunique()})
unique_counts

In [None]:
# One histogram per numeric column, drawn on a shared grid of subplots.
data.hist(column=numeric_columns, figsize=(15, 12), bins=30, edgecolor="black")
plt.tight_layout()
plt.show()

In [None]:
# NaN count per numeric column, keeping only the columns that have any.
numeric_nan_frame = (
    data[numeric_columns]
    .isna()
    .sum()
    .to_frame(name='num_Nan')
    .loc[lambda frame: frame["num_Nan"] > 0]
)
numeric_nan_frame

In [None]:
# data[numeric_nan_frame.index].hist(figsize=(8, 5), bins=30, edgecolor="black")
# plt.tight_layout()
# plt.show()

In [None]:
# Fill the remaining numeric gaps with each column's median.
# The filled Series is assigned back to the frame: calling an in-place method
# on `data[col]` operates on a temporary object and is not guaranteed to
# update `data` (it has no effect under pandas Copy-on-Write).
for col in ['GarageYrBlt', 'MasVnrArea', 'LotFrontage']:
    data[col] = data[col].fillna(data[col].median())


In [None]:
# Show which columns are still tracked as numeric at this point.
print(numeric_columns)

# categorical columns operations

In [None]:
# Show which columns are tracked as categorical at this point.
print(categorical_columns)

In [None]:
# Inspect the distinct labels present in BsmtQual.
data['BsmtQual'].unique()

In [None]:

# Encode label-valued columns as integers.
#
# `ordinal_mappings` maps each column to its {label: integer code} table.
# Code 0 is reserved for the placeholder labels ("No Basement", "No Garage",
# "No Fireplace", "No Pool", "No Fence") that stand in for missing values.

# Five-step grade shared by several columns: Po < Fa < TA < Gd < Ex.
quality_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# BsmtFinType1 and BsmtFinType2 use the same labels and codes.
basement_finish_scale = {
    "No Basement": 0,
    "Unf": 1,
    "LwQ": 2,
    "Rec": 3,
    "BLQ": 4,
    "ALQ": 5,
    "GLQ": 6,
}

ordinal_mappings = {
    "ExterQual": quality_scale,
    "ExterCond": quality_scale,
    "BsmtQual": {"No Basement": 0, **quality_scale},
    "BsmtCond": {"No Basement": 0, **quality_scale},
    "BsmtExposure": {
        "No Basement": 0,
        "No": 1,   # No Exposure
        "Mn": 2,   # Minimum Exposure
        "Av": 3,   # Average Exposure
        "Gd": 4,   # Good Exposure
    },
    "BsmtFinType1": basement_finish_scale,
    "BsmtFinType2": basement_finish_scale,
    "HeatingQC": quality_scale,
    "Electrical": {
        "FuseP": 1,   # Poor
        "FuseF": 2,   # Fair
        "FuseA": 3,   # Average
        "SBrkr": 4,   # Standard breakers (best)
        "Mix": 3,     # mixed systems share the "Average" code
    },
    "KitchenQual": quality_scale,
    "Functional": {
        "Sal": 1,
        "Sev": 2,
        "Maj2": 3,
        "Maj1": 4,
        "Mod": 5,
        "Min2": 6,
        "Min1": 7,
        "Typ": 8,
    },
    "FireplaceQu": {"No Fireplace": 0, **quality_scale},
    # NOTE(review): GarageType labels look nominal rather than ranked; confirm
    # that this integer ordering is intended.
    "GarageType": {
        "No Garage": 0,
        "CarPort": 1,
        "Detchd": 2,
        "Basment": 3,
        "Attchd": 4,
        "BuiltIn": 5,
        "2Types": 6,
    },
    "GarageFinish": {"No Garage": 0, "Unf": 1, "RFn": 2, "Fin": 3},
    "GarageQual": {"No Garage": 0, **quality_scale},
    "GarageCond": {"No Garage": 0, **quality_scale},
    # PoolQC has its own scale: there is no "Po" label in this mapping.
    "PoolQC": {"No Pool": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
    "Fence": {"No Fence": 0, "MnWw": 1, "MnPrv": 2, "GdWo": 3, "GdPrv": 4},
}

# Series.map turns every label that is missing from the dict into NaN without
# any warning, so check all columns before touching any of them. This also
# catches an accidental second run of this cell, when the columns already
# hold integer codes instead of labels.
unmapped_labels = {}
for column, mapping in ordinal_mappings.items():
    unknown = set(data[column].dropna().unique()) - set(mapping)
    if unknown:
        unmapped_labels[column] = sorted(unknown, key=str)
if unmapped_labels:
    raise ValueError(
        "No integer code defined for these labels (has this cell already "
        f"been run on `data`?): {unmapped_labels}"
    )

for column, mapping in ordinal_mappings.items():
    data[column] = data[column].map(mapping)

# SaleType, SaleCondition and Utilities are unordered labels; LabelEncoder
# assigns their integer codes. One fitted encoder is kept per column so the
# codes can be translated back to labels later via
# label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in ["SaleType", "SaleCondition", "Utilities"]:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le




In [None]:
# Distinct-value count of every column tracked as categorical.
unique_counts = pd.DataFrame({"n_uniques": data[categorical_columns].nunique()})
unique_counts

In [None]:
# Give every remaining object/category column the pandas 'category' dtype.
cat_cols = data.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    data[col] = data[col].astype('category')

In [None]:
# data.to_csv('test_processed.csv',index=False)

## Correlations

In [None]:

# Pairwise correlations between the numeric columns.
corr_matrix = data[numeric_columns].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 8},
    cmap='coolwarm',
    cbar=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Numeric Features')
plt.show()

In [None]:
# Correlation of each numeric column with the target, SalePrice.
corr_matrix['SalePrice']

In [None]:
# Features whose correlation with SalePrice is weak in either direction
# (|r| < 0.4). The absolute value is taken of the correlations themselves, so
# a strongly negative correlation is not mistaken for a weak one.
saleprice_corr = corr_matrix['SalePrice']
low_correlated_features = saleprice_corr[saleprice_corr.abs() < 0.4].index.to_list()

In [None]:
# Discard the weakly correlated numeric features.
data = data.drop(columns=low_correlated_features)

In [None]:
# Keep only the rows whose SalePrice is below 300000.
data = data.loc[data['SalePrice'] < 300000]

## Train/test split

In [None]:
# Features: every column except the target.
X = data.drop(columns='SalePrice')
# Target: log(1 + SalePrice) as a NumPy array.
Y = np.log1p(data['SalePrice'].to_numpy())


In [None]:
# Display the log-transformed target array.
Y

In [None]:
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

In [None]:
# Hyper-parameter search, evaluation and diagnostics for an XGBoost regressor.
# Uses X_train / X_test / y_train / y_test from the split cell above; every
# library used here is imported in the notebook's first cell.

xgb = XGBRegressor(
    objective="reg:squarederror",   # regression task
    eval_metric="rmse",             # only RMSE
    tree_method="hist",
    enable_categorical=True,        # categorical support
    random_state=42,                # explicit seed for row/column subsampling
)

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators": [200, 500, 800],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.001, 0.01, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 0.5, 5],
    "reg_lambda": [0.5, 1, 5],
    "min_child_weight": [1, 5, 10],
    "gamma": [0, 1, 5],
}

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=40,                              # number of sampled configurations
    scoring="neg_root_mean_squared_error",  # RMSE, negated so higher is better
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
# The scorer returns negative RMSE; flip the sign for reporting.
print("Best CV RMSE:", -random_search.best_score_)

# Score the best configuration on the held-out rows (log1p scale).
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
print("Test RMSE:", rmse_test)

# Fit the best configuration again with an eval_set so that the RMSE after
# each boosting round is recorded for both sets.
best_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=2
)

# validation_0 / validation_1 follow the order of the eval_set entries.
results = best_model.evals_result()
train_rmse = results["validation_0"]["rmse"]
test_rmse = results["validation_1"]["rmse"]

# Learning curve: RMSE per boosting round on train and test.
plt.figure(figsize=(8, 5))
plt.plot(train_rmse, label="Train RMSE")
plt.plot(test_rmse, label="Test RMSE")
plt.xlabel("Boosting Round")
plt.ylabel("RMSE")
plt.title("XGBoost RMSE Learning Curve")
plt.legend()
plt.show()

importance = best_model.feature_importances_

# Pair each feature name with its importance, most important first.
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": importance
}).sort_values(by="Importance", ascending=False)

# Take top 20 features
top_features = feature_importance.head(20)

# Horizontal bar chart of the top features.
plt.figure(figsize=(10, 8))
bars = plt.barh(top_features["Feature"], top_features["Importance"], color='skyblue')
plt.gca().invert_yaxis()  # largest on top
plt.xlabel("Importance")
plt.title("Top 20 XGBoost Feature Importances")

# Annotate bars with importance values, placed just right of each bar's end.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.01, bar.get_y() + bar.get_height()/2,
             f"{width:.3f}", va='center')

plt.show()

In [None]:

# Undo the log1p transform so that values are on the SalePrice scale again.
y_test_original = np.expm1(y_test)        # actual SalePrice
y_pred_original = np.expm1(y_pred)        # predicted SalePrice

# Actual and predicted prices side by side.
df_results = pd.DataFrame(
    {"Actual": y_test_original, "Predicted": y_pred_original}
)
print(df_results.head())

# RMSE on the back-transformed values.
squared_error_mean = mean_squared_error(y_test_original, y_pred_original)
rmse_test = np.sqrt(squared_error_mean)
print("Test RMSE:", rmse_test)

# SalePrice

In [None]:
# Box plot of the SalePrice values currently in `data`. Title and axis label
# let the figure stand on its own; plt.show() keeps the artist dict out of
# the cell output.
plt.boxplot(data['SalePrice'])
plt.title('SalePrice box plot')
plt.ylabel('SalePrice')
plt.show()

In [None]:
# Histogram of the SalePrice values currently in `data`. Title and axis
# labels let the figure stand on its own; plt.show() keeps the bin arrays out
# of the cell output.
plt.hist(data['SalePrice'])
plt.title('SalePrice histogram')
plt.xlabel('SalePrice')
plt.ylabel('Count')
plt.show()