# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

import warnings
warnings.filterwarnings('ignore') # Ignore all warnings

# Read Data

In [None]:
# Load the training set from the working directory.
df_train = pd.read_csv('train.csv')
# Never truncate columns when a frame is displayed.
pd.options.display.max_columns = None

In [None]:
# Preview the first five rows (5 is the default for head()).
df_train.head()

# Data Preprocessing

## Handling Missing Values

In [None]:
# Dtypes and non-null counts per column: a first look at where values are missing.
df_train.info()

`
Show the ratio of missing values per column
`

In [None]:
# Percentage of missing entries per column; columns without any gap are left out
# and the rest are listed from most to least incomplete.
all_data_na = df_train.isnull().sum() / len(df_train) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
print(missing_data)

`
Drop Id and the columns whose missing ratio is greater than 30%
`

In [None]:
# Remove the identifier and the mostly-empty columns listed above.
sparse_or_id_columns = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']
df_train = df_train.drop(columns=sparse_or_id_columns)

In [None]:
# Re-check dtypes and non-null counts after dropping the sparse columns.
df_train.info()

In [None]:
# Lift the row display limit so the per-column null count is shown in full.
pd.options.display.max_rows = None
df_train.isna().sum()

`
split features into categorical and numerical
`

In [None]:
# Object-typed columns are treated as categorical, everything else as numerical.
num_data = df_train.select_dtypes(exclude=['object'])
cat_data = df_train.select_dtypes(include=['object'])

In [None]:
# Non-null counts of the categorical columns.
cat_data.info()

In [None]:
# Non-null counts of the numerical columns.
num_data.info()

In [None]:
# Missing values per categorical column before imputation.
cat_data.isnull().sum()

`
Fill missing garage categories in cat_data with 'None', then fill the remaining missing values with each column's mode
`

In [None]:
# Garage attributes: a missing value becomes the explicit category 'None'
# (the garage scoring dictionaries further down give 'None' a score of 0).
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained form is deprecated and has no effect under pandas
# Copy-on-Write, and the warning is hidden by filterwarnings('ignore').
garage_columns = ['GarageQual', 'GarageFinish', 'GarageCond', 'GarageType']
cat_data[garage_columns] = cat_data[garage_columns].fillna('None')

# Every other categorical column with gaps is filled with its most frequent value.
for column in cat_data:
    if cat_data[column].isna().any():
        cat_data[column] = cat_data[column].fillna(value=cat_data[column].mode()[0])

In [None]:
# Verify that no categorical value is missing any more.
cat_data.isnull().sum()

In [None]:
# Missing values per numerical column before imputation.
num_data.isnull().sum()

In [None]:
# Missing LotFrontage is imputed as 0. Assign the result back: fillna(inplace=True)
# on a column selection is chained assignment, which is deprecated and silently
# does nothing under pandas Copy-on-Write.
num_data['LotFrontage'] = num_data['LotFrontage'].fillna(0)

`
Fill missing values in num_data with a random draw from each column's quartiles (25th, 50th and 75th percentile)
`

In [None]:
def fill_with_quartile(value, quartiles, probabilities):
    """Return ``value`` unchanged unless it is missing.

    A missing value (as judged by ``pd.isna``) is replaced by one element of
    ``quartiles`` drawn with ``np.random.choice`` using ``probabilities`` as
    the selection weights.
    """
    if not pd.isna(value):
        return value
    return np.random.choice(quartiles, p=probabilities)
    
# The imputation draws random values; fix the seed so that the filled values
# (and every score computed from them) are the same on each Restart & Run All.
np.random.seed(42)

for column in num_data:
    if num_data[column].isna().any():
        # 25th, 50th and 75th percentile of the observed values
        quartiles = num_data[column].quantile([0.25, 0.50, 0.75]).tolist()
        probabilities = [0.33, 0.34, 0.33]
        # args forwards the extra positional arguments to the applied function
        num_data[column] = num_data[column].apply(fill_with_quartile, args=(quartiles, probabilities))

In [None]:
# Verify that no numerical value is missing any more.
num_data.isnull().sum()

In [None]:
# Put the imputed numerical and categorical halves back side by side.
df_train = pd.concat([num_data, cat_data], axis='columns')

In [None]:
# Quick look at the recombined frame.
df_train.head(3)

In [None]:
# Same missing-ratio report as before imputation; it should now list no columns.
all_data_na = df_train.isnull().sum() / len(df_train) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
print(missing_data)

In [None]:
# Dtypes and non-null counts after imputation.
df_train.info()

In [None]:
# Number of fully duplicated rows.
df_train.duplicated().sum()

## EDA

In [None]:
# List the distinct levels of every categorical column in alphabetical order.
for column in cat_data:
    levels = sorted(cat_data[column].unique().tolist())
    print({column: levels})
    print('\n')

In [None]:
# One violin plot of SalePrice per categorical feature. Features with more than
# five levels are drawn with SalePrice on the x axis and a taller canvas; the
# others put the feature on the x axis.
for feature in cat_data:
    sale_price = num_data['SalePrice']
    if cat_data[feature].nunique() > 5:
        axes = {'x': sale_price, 'y': feature}
        x_label, y_label = 'SalePrice', feature
        title_y, fig_height = 0.99, 2000
    else:
        axes = {'x': feature, 'y': sale_price}
        x_label, y_label = feature, 'SalePrice'
        title_y, fig_height = 0.9, 500

    fig = px.violin(
        cat_data,
        box=True,
        points='all',
        color=feature,
        color_discrete_sequence=px.colors.qualitative.Set2,
        **axes,
    )
    fig.update_layout(
        title={
            'text': f'Relationship between SalePrice and {feature}',
            'y': title_y,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        },
        xaxis_title=x_label,
        yaxis_title=y_label,
        template='plotly_white',
        width=1000,
        height=fig_height,
    )

    fig.show()

In [None]:
# Scatter of every numerical feature against SalePrice, shaded by SalePrice.
for feature in num_data:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(data=num_data, x=feature, y='SalePrice', hue='SalePrice', palette='Blues', ax=ax)
    plt.show()

## Feature Engineering

`
Create one feature that rates the garage on a 0–10 scale based on ['GarageQual', 'GarageCond', 'GarageFinish', 'GarageType']
`

In [None]:
# Scoring dictionaries: a higher score means a better garage; 'None' scores 0.
qual_cond_scores = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
finish_scores = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}
type_scores = {'2Types': 5, 'BuiltIn': 4, 'Attchd': 3, 'Basment': 2, 'Detchd': 1, 'CarPort': 1, 'None': 0}

# Which dictionary scores which garage column.
garage_score_maps = {
    'GarageQual': qual_cond_scores,
    'GarageCond': qual_cond_scores,
    'GarageFinish': finish_scores,
    'GarageType': type_scores,
}

# Sum the four scores. The source columns are left untouched (they are dropped
# in a later cell), so re-running this cell does not map already-mapped
# numbers to NaN.
cat_data['TotalScore'] = sum(
    cat_data[garage_column].map(scores)
    for garage_column, scores in garage_score_maps.items()
)

# Rescale relative to the best observed total: 0 (no garage) up to 10.
cat_data['GarageRating'] = (cat_data['TotalScore'] / cat_data['TotalScore'].max()) * 10

`
keep GarageRating and drop all other Garage features
`

In [None]:
# Only GarageRating survives; its inputs and the intermediate total are removed.
garage_inputs = ['GarageQual', 'GarageCond', 'GarageFinish', 'GarageType', 'TotalScore']
cat_data = cat_data.drop(columns=garage_inputs)

`
Merge Condition1 and Condition2
`

In [None]:
# One-hot encode Condition1 as Cond_* indicator columns (first level dropped).
condition1_encoded = pd.get_dummies(
    cat_data['Condition1'],
    prefix='Cond',
    dtype=int,
    drop_first=True,
)
cat_data = pd.concat([cat_data, condition1_encoded], axis='columns')

In [None]:
# Also switch on the Cond_* indicator that matches each row's Condition2.
# Iterating over (row label, value) pairs avoids assuming that the index is
# exactly 0..n-1, which range(len(...)) with .at relied on.
for row_label, second_condition in cat_data['Condition2'].items():
    condition_col = f"Cond_{second_condition}"
    # Levels without an indicator column (e.g. the one removed by drop_first) are skipped.
    if condition_col in cat_data.columns:
        cat_data.at[row_label, condition_col] = 1

In [None]:
# The raw condition columns are now represented by the Cond_* indicators.
# Cond_Norm itself is kept: get_dummies(drop_first=True) already removed one
# level, and the extra drop of Cond_Norm is disabled.
cat_data = cat_data.drop(columns=['Condition1', 'Condition2'])

In [None]:
# One-hot encode Exterior1st as Exterior_* indicator columns (first level dropped).
exterior_encoded = pd.get_dummies(
    cat_data['Exterior1st'],
    prefix='Exterior',
    dtype=int,
    drop_first=True,
)
cat_data = pd.concat([cat_data, exterior_encoded], axis='columns')

In [None]:
# Also switch on the Exterior_* indicator that matches each row's Exterior2nd.
# The previous version built `exterior_col` but then tested and assigned the
# stale `condition_col` left over from the Condition2 loop, so Exterior2nd was
# never merged and one Cond_* column was set to 1 on every row.
# NOTE(review): an Exterior2nd label only matches if it is spelled exactly like
# an Exterior1st level; confirm the two columns use the same spellings.
for row_label, second_exterior in cat_data['Exterior2nd'].items():
    exterior_col = f"Exterior_{second_exterior}"
    if exterior_col in cat_data.columns:
        cat_data.at[row_label, exterior_col] = 1

In [None]:
# The raw exterior columns are now represented by the Exterior_* indicators.
# 'Exterior_Wd Sdng' is kept: get_dummies(drop_first=True) already removed one
# level, and the extra drop of that column is disabled.
cat_data = cat_data.drop(columns=['Exterior1st', 'Exterior2nd'])

`
According to the data description, HouseStyle and MSSubClass overlap, but MSSubClass offers more granularity
and more specific details about the construction and style of the dwelling,
so I'll drop the HouseStyle feature.
`

In [None]:
# HouseStyle is dropped in favour of MSSubClass.
cat_data = cat_data.drop(columns='HouseStyle')

`
LandContour is treated as redundant with LandSlope, so it is dropped
`

In [None]:
# Keep LandSlope only.
cat_data = cat_data.drop(columns='LandContour')

`
Drop Weak Features
`

In [None]:
# Only Utilities and Street are removed. A wider drop was tried and disabled;
# it also covered LotConfig, BldgType, ExterCond, BsmtCond, BsmtFinType2,
# Heating, Electrical, RoofStyle, RoofMatl, CentralAir and PavedDrive, which
# are all kept and encoded below.
cat_data = cat_data.drop(columns=['Utilities', 'Street'])

In [None]:
# Numerical features removed as weak.
weak_numeric_columns = [
    'BsmtFinSF2', 'BsmtHalfBath',
    'KitchenAbvGr', 'BsmtFinSF1', 'LowQualFinSF',
    'PoolArea', 'MiscVal',
]
num_data = num_data.drop(columns=weak_numeric_columns)

In [None]:
# Rebuild the full frame so outlier rows are removed from both halves at once.
df_train = pd.concat([num_data, cat_data], axis='columns')

def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : label
        Numeric column used to decide which rows are outliers.
    factor : float, default 1.5
        Multiple of the interquartile range added above Q3 and subtracted
        below Q1 to build the fences.

    Returns
    -------
    pandas.DataFrame
        Rows with ``Q1 - factor*IQR <= df[column] <= Q3 + factor*IQR``.
        Rows where ``column`` is NaN fail both comparisons and are dropped.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    inside_fences = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    return df[inside_fences]

print(f"Original shape: {df_train.shape}")
# Filter one column after another; each pass recomputes its fences on the rows
# that survived the previous passes.
for outlier_column in ('SalePrice', 'LotFrontage', 'TotalBsmtSF', 'LotArea', 'GrLivArea', 'GarageArea'):
    df_train = remove_outliers(df_train, outlier_column)
print(f"Shape after outlier removal: {df_train.shape}")

In [None]:
# Split again so both halves contain only the rows that survived outlier removal.
# Columns that are already numeric (GarageRating, Cond_*, Exterior_*) land in num_data.
num_data = df_train.select_dtypes(exclude=['object'])
cat_data = df_train.select_dtypes(include=['object'])

## Encoding

In [None]:
# LotShape is ordinal: the levels run from regular to increasingly irregular,
# so each level is replaced by its rank (Reg=0 ... IR3=3).
lotshape_mapping = {shape: rank for rank, shape in enumerate(['Reg', 'IR1', 'IR2', 'IR3'])}

cat_data['LotShape'] = cat_data['LotShape'].map(lotshape_mapping)

In [None]:
# cat_data.drop(['LotShape'], axis=1, inplace=True)

In [None]:
# KitchenQual is an ordinal quality grade: Po=1 up to Ex=5.
kitchenqual_mapping = dict(zip(('Po', 'Fa', 'TA', 'Gd', 'Ex'), range(1, 6)))

cat_data['KitchenQual'] = cat_data['KitchenQual'].map(kitchenqual_mapping)

In [None]:
# MSSubClass is a numeric code identifying the dwelling type, not a quantity,
# so it is one-hot encoded and stored with the categorical features.
mssubclass_encoded = pd.get_dummies(
    num_data['MSSubClass'],
    prefix='MSSubClass',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, mssubclass_encoded], axis='columns')

In [None]:
# The raw code is replaced by its MSSubClass_* indicators.
num_data = num_data.drop(columns='MSSubClass')

In [None]:
# One-hot encode SaleCondition as SaleCond_* indicators (first level dropped).
salecondition_encoded = pd.get_dummies(
    cat_data['SaleCondition'],
    prefix='SaleCond',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, salecondition_encoded], axis='columns')

In [None]:
# The raw column is replaced by its SaleCond_* indicators. SaleCond_Normal is
# kept: drop_first=True already removed one level, and the extra drop is disabled.
cat_data = cat_data.drop(columns=['SaleCondition'])

In [None]:
# One-hot encode MSZoning (first level dropped).
mszoning_encoded = pd.get_dummies(
    cat_data['MSZoning'],
    prefix='MSZoning',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, mszoning_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns=['MSZoning'])

In [None]:
# One-hot encode Neighborhood (first level dropped).
neighborhood_encoded = pd.get_dummies(
    cat_data['Neighborhood'],
    prefix='Neighborhood',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, neighborhood_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns=['Neighborhood'])

In [None]:
# ExterQual as an ordinal grade: Po=1 up to Ex=5.
exterqual_mapping = {grade: score for score, grade in enumerate(('Po', 'Fa', 'TA', 'Gd', 'Ex'), start=1)}

cat_data['ExterQual'] = cat_data['ExterQual'].map(exterqual_mapping)

In [None]:
# BsmtQual as an ordinal grade: NA=0, Po=1 up to Ex=5.
# NOTE(review): missing BsmtQual values were filled with the column mode during
# imputation, so the 'NA' key presumably never matches; confirm that is intended.
bsmtqual_mapping = {grade: score for score, grade in enumerate(('NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'))}

cat_data['BsmtQual'] = cat_data['BsmtQual'].map(bsmtqual_mapping)

# cat_data.drop('BsmtQual', axis=1, inplace=True)

In [None]:
# BsmtExposure as an ordinal level: No=0, Mn=1, Av=2, Gd=3.
bsmt_exposure_mapping = {level: rank for rank, level in enumerate(('No', 'Mn', 'Av', 'Gd'))}

cat_data['BsmtExposure'] = cat_data['BsmtExposure'].map(bsmt_exposure_mapping)

# cat_data.drop('BsmtExposure', axis=1, inplace=True)

In [None]:
# Basement finish type as an ordinal level: NA=0, Unf=1 up to GLQ=6.
# The same dictionary is reused for BsmtFinType2 in the next cell.
bsmtfintype_mapping = {
    finish: rank
    for rank, finish in enumerate(('NA', 'Unf', 'LwQ', 'BLQ', 'Rec', 'ALQ', 'GLQ'))
}

cat_data['BsmtFinType1'] = cat_data['BsmtFinType1'].map(bsmtfintype_mapping)

# cat_data.drop('BsmtFinType1', axis=1, inplace=True)

In [None]:
# Reuses bsmtfintype_mapping defined in the BsmtFinType1 cell above.
cat_data['BsmtFinType2'] = cat_data['BsmtFinType2'].map(bsmtfintype_mapping)

# cat_data.drop('BsmtFinType2', axis=1, inplace=True)

In [None]:
# HeatingQC as an ordinal grade: Po=1 up to Ex=5.
heatingqc_mapping = dict(zip(('Po', 'Fa', 'TA', 'Gd', 'Ex'), range(1, 6)))

cat_data['HeatingQC'] = cat_data['HeatingQC'].map(heatingqc_mapping)

# cat_data.drop('HeatingQC', axis=1, inplace=True)

In [None]:
# One-hot encode Foundation (first level dropped).
foundation_encoded = pd.get_dummies(
    cat_data['Foundation'],
    prefix='Foundation',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, foundation_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='Foundation')

In [None]:
# land_contour_mapping = { 'Lvl': 1, 'Bnk': 2, 'HLS': 3, 'Low': 4 }

# cat_data['LandContour'] = cat_data['LandContour'].map(land_contour_mapping)

In [None]:
# LotConfig mapped to integer codes 1..5 in the order listed here.
lot_config_mapping = {
    config: code
    for code, config in enumerate(('Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'), start=1)
}

cat_data['LotConfig'] = cat_data['LotConfig'].map(lot_config_mapping)

In [None]:
# LandSlope as an ordinal level: Gtl=1, Mod=2, Sev=3.
land_slope_mapping = {slope: rank for rank, slope in enumerate(('Gtl', 'Mod', 'Sev'), start=1)}

cat_data['LandSlope'] = cat_data['LandSlope'].map(land_slope_mapping)
# cat_data.drop('LandSlope', axis=1, inplace=True)

In [None]:
# One-hot encode BldgType (first level dropped).
bldgtype_encoded = pd.get_dummies(
    cat_data['BldgType'],
    prefix='BldgType',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, bldgtype_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='BldgType')

In [None]:
# One-hot encode RoofStyle (first level dropped).
roofstyle_encoded = pd.get_dummies(
    cat_data['RoofStyle'],
    prefix='RoofStyle',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, roofstyle_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='RoofStyle')

In [None]:
# ExterCond as an ordinal grade: Po=1 up to Ex=5.
extercond_mapping = dict(zip(('Po', 'Fa', 'TA', 'Gd', 'Ex'), range(1, 6)))

cat_data['ExterCond'] = cat_data['ExterCond'].map(extercond_mapping)

In [None]:
# BsmtCond as an ordinal grade: Po=1 up to Ex=5.
bsmtcond_mapping = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

# Use this cell's own mapping. It previously applied extercond_mapping, which
# only worked because the two dictionaries happen to be identical and the
# ExterCond cell had already been run.
cat_data['BsmtCond'] = cat_data['BsmtCond'].map(bsmtcond_mapping)

In [None]:
# CentralAir as a binary flag: N=0, Y=1.
centralair_mapping = {flag: value for value, flag in enumerate(('N', 'Y'))}

cat_data['CentralAir'] = cat_data['CentralAir'].map(centralair_mapping)

In [None]:
# PavedDrive as an ordinal level: N=0, P=1, Y=2.
paveddrive_mapping = {level: rank for rank, level in enumerate(('N', 'P', 'Y'))}

cat_data['PavedDrive'] = cat_data['PavedDrive'].map(paveddrive_mapping)

In [None]:
# One-hot encode Electrical (first level dropped).
electrical_encoded = pd.get_dummies(
    cat_data['Electrical'],
    prefix='Electrical',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, electrical_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='Electrical')

In [None]:
# One-hot encode SaleType (first level dropped).
saletype_encoded = pd.get_dummies(
    cat_data['SaleType'],
    prefix='SaleType',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, saletype_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='SaleType')

In [None]:
# Functional as an ordinal level, numbered 1..8 in the order listed here
# (Typ=1 ... Sal=8).
functional_levels = ('Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal')
home_functionality_mapping = {
    level: rank for rank, level in enumerate(functional_levels, start=1)
}

cat_data['Functional'] = cat_data['Functional'].map(home_functionality_mapping)

In [None]:
# housestyle_encoded = pd.get_dummies(cat_data['HouseStyle'], prefix='HouseStyle', dtype=int, drop_first=True)

# cat_data = pd.concat([cat_data, housestyle_encoded], axis=1)

In [None]:
# cat_data.drop('HouseStyle', axis=1, inplace=True)

In [None]:
# One-hot encode Heating (first level dropped).
heating_encoded = pd.get_dummies(
    cat_data['Heating'],
    prefix='Heating',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, heating_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='Heating')

In [None]:
# street_encoded = pd.get_dummies(cat_data['Street'], prefix='Street', dtype=int, drop_first=True)

# cat_data = pd.concat([cat_data, street_encoded], axis=1)

In [None]:
# cat_data.drop('Street', axis=1, inplace=True)

In [None]:
# One-hot encode RoofMatl (first level dropped).
roofmatl_encoded = pd.get_dummies(
    cat_data['RoofMatl'],
    prefix='RoofMatl',
    dtype=int,
    drop_first=True,
)

cat_data = pd.concat([cat_data, roofmatl_encoded], axis='columns')

In [None]:
# Raw column no longer needed once encoded.
cat_data = cat_data.drop(columns='RoofMatl')

In [None]:
# Quick look at the encoded categorical frame.
cat_data.head(3)

In [None]:
# Row and column count after encoding.
cat_data.shape

In [None]:
# Sanity check: list any column that is still object-typed (expected: []).
# .sum() on object columns would concatenate every string in the column
# instead of showing which columns were missed.
cat_data.select_dtypes(include='object').columns.tolist()

# Split and Scaling data

In [None]:
# Shape of df_train before the encoded columns are merged back in
# (df_train is rebuilt from num_data and cat_data in the next cell).
df_train.shape

In [None]:
# Final modelling frame: numerical columns followed by the encoded categoricals.
df_train = pd.concat([num_data, cat_data], axis='columns')

In [None]:
# Target is SalePrice; every other column is a feature.
y = df_train['SalePrice']
x = df_train.drop(columns='SalePrice')

In [None]:
# Keep the feature names: the scaler returns bare arrays, and the scaled
# frames are rebuilt with these labels.
columns = x.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# from sklearn.preprocessing import StandardScaler
# st = StandardScaler()
# x = pd.DataFrame(data= st.fit_transform(x), columns = columns)

In [None]:
from sklearn.preprocessing import MinMaxScaler
norm = MinMaxScaler()
# Fit the scaler on the training split only, then apply the same transform to
# the test split. The original row index is passed through so the scaled
# frames stay label-aligned with y_train / y_test; without it the frames get
# a fresh 0..n-1 index that no longer matches the targets.
x_train = pd.DataFrame(data=norm.fit_transform(x_train), columns=columns, index=x_train.index)
x_test = pd.DataFrame(data=norm.transform(x_test), columns=columns, index=x_test.index)

In [None]:
# from sklearn.preprocessing import RobustScaler
# robust = RobustScaler()
# x = pd.DataFrame(data= robust.fit_transform(x), columns = columns)

In [None]:
# x = pd.DataFrame(data= np.log(x), columns = columns)

# Modeling

## Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, r2_score

# L1-regularised linear regression; fit() returns the estimator itself, so
# `model` and `lasso` refer to the same fitted object.
lasso = Lasso(alpha=0.003)
lasso.fit(x_train, y_train)
model = lasso

In [None]:
# R^2 of the Lasso model on the training split.
y_pred = model.predict(x_train)
R2_score = r2_score(y_train, y_pred)
print(f'train score = {R2_score}')

In [None]:
# R^2 of the Lasso model on the held-out split.
y_pred = model.predict(x_test)
R2_score = r2_score(y_test, y_pred)
print(f'test score = {R2_score}')

## Ridge

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression; `model` now refers to the fitted Ridge estimator.
ridge = Ridge(alpha=0.0003)
ridge.fit(x_train, y_train)
model = ridge

In [None]:
# R^2 of the Ridge model on the training split.
y_pred = model.predict(x_train)
R2_score = r2_score(y_train, y_pred)
print(f'train score = {R2_score}')

In [None]:
# R^2 of the Ridge model on the held-out split.
y_pred = model.predict(x_test)
R2_score = r2_score(y_test, y_pred)
print(f'test score = {R2_score}')

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline; `model` now refers to the fitted LinearRegression.
LR = LinearRegression()
LR.fit(x_train, y_train)
model = LR

In [None]:
# R^2 of the linear regression on the training split.
y_pred = model.predict(x_train)
R2_score = r2_score(y_train, y_pred)
print(f'train score = {R2_score}')

In [None]:
# R^2 of the linear regression on the held-out split.
y_pred = model.predict(x_test)
R2_score = r2_score(y_test, y_pred)
print(f'test score = {R2_score}')