In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

<div class="alert alert-block alert-warning">
<b>Notes:</b>
<br> - Each cell has its own notes.
<br> - I tried Multi linear regression, Polynomial regression, and random forest regression.
<br> - For some reason, the result we got locally is better than the one we get on Kaggle!
<br> - Some output couldn't reduce the volume (like data.info()).
<br> - I tried several encoding techniques, (one-hot encoding was painful, 
</div>

### First Step: Data Setup

In [4]:
train_dt_path = "../data/train.csv"
test_dt_path = "../data/test.csv"
from sklearn.model_selection import train_test_split

data = pd.read_csv(train_dt_path)
Kaggle_testing_data = pd.read_csv(test_dt_path)

In [5]:
# data.describe()

In [6]:
# print(data.info())

In [7]:
# Kaggle_testing_data.info()

### Second Step: Feature processing ( remove Empty features, populate Nan Values,Encode, Scale)

- Here I removed the features that have more than 50% values of NaN.  
- I removed Id, Utilities & FireplaceQu, because they have only one value for all the feature and because Id is not needed in the prediction process.  
- I am preparing the train set and the test set together so I could use test_set on Kaggle submission.

In [8]:
Y = data['SalePrice']
X = data.drop(['SalePrice'], axis =1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
X_final_train, X_val, y_final_train, y_val = train_test_split(X_train, y_train, test_size = 0.4, random_state = 0)

perc = 50.0
min_count_Train =  int(((100-perc)/100)*X_final_train.shape[0] + 1)
min_count_Val =  int(((100-perc)/100)*X_val.shape[0] + 1)
min_count_Test =  int(((100-perc)/100)*X_test.shape[0] + 1)
min_count_Kaggle_Test =  int(((100-perc)/100)*Kaggle_testing_data.shape[0] + 1)


X_final_train.dropna( axis=1,
                thresh=min_count_Train, inplace=True)
X_val.dropna( axis=1,
                thresh=min_count_Val, inplace=True)
X_test.dropna( axis=1,
                thresh=min_count_Test, inplace=True)
Kaggle_testing_data.dropna(axis=1,
                thresh=min_count_Kaggle_Test, inplace=True)


X_final_train = X_final_train.drop(['Id','Utilities','FireplaceQu'], axis=1)
X_val = X_val.drop(['Id','Utilities'], axis=1)
X_test = X_test.drop(['Id','Utilities','FireplaceQu'], axis=1)
# ids = pd.DataFrame()
# ids['Id'] = Kaggle_testing_data['Id']
Kaggle_testing_data = Kaggle_testing_data.drop(['Id','Utilities'], axis=1)
# print(X_val.tail(20))

In [9]:
# X_final_train['MSSubClass'].head(5)

### Split the Features to: Numeric, Ordinal, and Categorical

In [10]:
categorical_features_Train = X_final_train.select_dtypes('object')

categorical_features_Train_Ordinal = categorical_features_Train[['MSZoning', 'LandSlope','BldgType','RoofMatl','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','HeatingQC'
,'CentralAir','Electrical','KitchenQual','Functional','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleCondition']]

categorical_features_Train_Not_Ordinal = categorical_features_Train[['Street','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','HouseStyle','RoofStyle','Exterior1st','Exterior2nd'
,'MasVnrType','Foundation','Heating','GarageType','SaleType']]


categorical_features_Val = X_val.select_dtypes('object')

categorical_features_Val_Ordinal = categorical_features_Val[['MSZoning', 'LandSlope','BldgType','RoofMatl','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','HeatingQC'
,'CentralAir','Electrical','KitchenQual','Functional','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleCondition']]

categorical_features_Val_Not_Ordinal = categorical_features_Val[['Street','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','HouseStyle','RoofStyle','Exterior1st','Exterior2nd'
,'MasVnrType','Foundation','Heating','GarageType','SaleType']]



categorical_features_Test = X_test.select_dtypes('object')

categorical_features_Test_Ordinal = categorical_features_Test[['MSZoning', 'LandSlope','BldgType','RoofMatl','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','HeatingQC'
,'CentralAir','Electrical','KitchenQual','Functional','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleCondition']]

categorical_features_Test_Not_Ordinal = categorical_features_Test[['Street','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','HouseStyle','RoofStyle','Exterior1st','Exterior2nd'
,'MasVnrType','Foundation','Heating','GarageType','SaleType']]



# print(categorical_features_Test_Not_Ordinal.head(5))


# ----------------------------------------------------------------- for Kaggle ----------------------------------------------------------------------
categorical_features_Kaggle_Testing = Kaggle_testing_data.select_dtypes('object')

categorical_features_Kaggle_Testing_Ordinal = categorical_features_Kaggle_Testing[['MSZoning', 'LandSlope','BldgType','RoofMatl','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','HeatingQC'
,'CentralAir','Electrical','KitchenQual','Functional','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleCondition']]

categorical_features_Kaggle_Testing_Not_Ordinal = categorical_features_Kaggle_Testing[['Street','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','HouseStyle','RoofStyle','Exterior1st','Exterior2nd'
,'MasVnrType','Foundation','Heating','GarageType','SaleType']]
# ---------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
# categorical_features_Train_Ordinal['MSZoning'].unique()
# categorical_features_Train_Not_Ordinal.info()

#### For Ordinal Features

In [12]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def get_Encoded_OneHot_Encoder(categorical_features_Not_Ord):
    cat_cols = categorical_features_Not_Ord.columns.values
    cols_encoded = []
    for col in cat_cols:
        cols_encoded += [f"{col}_{cat}" for cat in list(categorical_features_Not_Ord[col].unique())]
    oh_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoded_cols = oh_encoder.fit_transform(categorical_features_Not_Ord[cat_cols])
    df_enc = pd.DataFrame(encoded_cols, columns=cols_encoded)

    return df_enc

def get_Encoded_Ordinal(categorical_features_Ord):
    cat_encoded = pd.DataFrame()
    for i in range(len(categorical_features_Ord.columns)):
        ColName = categorical_features_Ord.columns[i]
        enc = OrdinalEncoder()
        temp = enc.fit_transform(categorical_features_Ord[[ColName]])
        finall = pd.concat([
        cat_encoded,
        pd.DataFrame(
            temp,
            columns=[ColName]
        )
        ],axis=1)
        cat_encoded = finall
    return cat_encoded

Train_ordinal = get_Encoded_Ordinal(categorical_features_Train_Ordinal)
Val_ordinal = get_Encoded_Ordinal(categorical_features_Val_Ordinal)
Test_ordinal = get_Encoded_Ordinal(categorical_features_Test_Ordinal)
Kaggle_Testing_ordinal = get_Encoded_Ordinal(categorical_features_Kaggle_Testing_Ordinal)

# print(categorical_features_Train_Not_Ordinal.info())

#1. Function to replace NAN values with mode value
def impute_nan_most_frequent_category(DataFrame,ColName):
    # .mode()[0] - gives first category name
     most_frequent_category=DataFrame[ColName].mode()[0]
    
    # replace nan values with most occured category
     DataFrame[ColName] = DataFrame[ColName]
     DataFrame[ColName].fillna(most_frequent_category,inplace=True)

#2. Call function to impute most occured category
for Columns in categorical_features_Train_Not_Ordinal.columns:
    impute_nan_most_frequent_category(categorical_features_Train_Not_Ordinal,Columns)

for Columns in categorical_features_Val_Not_Ordinal.columns:
    impute_nan_most_frequent_category(categorical_features_Val_Not_Ordinal,Columns)

for Columns in categorical_features_Test_Not_Ordinal.columns:
    impute_nan_most_frequent_category(categorical_features_Test_Not_Ordinal,Columns)
    
for Columns in categorical_features_Kaggle_Testing_Not_Ordinal.columns:
    impute_nan_most_frequent_category(categorical_features_Kaggle_Testing_Not_Ordinal,Columns)    
# Display imputed result
categorical_features_Train_Not_Ordinal.info()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataFrame[ColName] = DataFrame[ColName]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataFrame[ColName].fillna(most_frequent_category,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataFrame[ColName] = DataFrame[ColName]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 1130 to 1306
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Street        700 non-null    object
 1   LotShape      700 non-null    object
 2   LandContour   700 non-null    object
 3   LotConfig     700 non-null    object
 4   Neighborhood  700 non-null    object
 5   Condition1    700 non-null    object
 6   Condition2    700 non-null    object
 7   HouseStyle    700 non-null    object
 8   RoofStyle     700 non-null    object
 9   Exterior1st   700 non-null    object
 10  Exterior2nd   700 non-null    object
 11  MasVnrType    700 non-null    object
 12  Foundation    700 non-null    object
 13  Heating       700 non-null    object
 14  GarageType    700 non-null    object
 15  SaleType      700 non-null    object
dtypes: object(16)
memory usage: 93.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataFrame[ColName] = DataFrame[ColName]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataFrame[ColName].fillna(most_frequent_category,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataFrame[ColName] = DataFrame[ColName]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas

In [13]:
Train_Not_ordinal = get_Encoded_OneHot_Encoder(categorical_features_Train_Not_Ordinal)
Val_Not_ordinal = get_Encoded_OneHot_Encoder(categorical_features_Val_Not_Ordinal)
Test_Not_ordinal = get_Encoded_OneHot_Encoder(categorical_features_Test_Not_Ordinal)
Kaggle_Testing_Not_ordinal = get_Encoded_OneHot_Encoder(categorical_features_Kaggle_Testing_Not_Ordinal)

Train_Not_ordinal, Val_Not_ordinal = Train_Not_ordinal.align(Val_Not_ordinal, join='left', axis=1)
Train_Not_ordinal, Test_Not_ordinal = Train_Not_ordinal.align(Test_Not_ordinal, join='left', axis=1)
# Train_Not_ordinal, Val_Not_ordinal = Train_Not_ordinal.align(Val_Not_ordinal, join='left', axis=1)
print(Val_Not_ordinal.info())
print(Train_Not_ordinal.info())
Val_Not_ordinal.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Columns: 124 entries, Street_Pave to SaleType_Oth
dtypes: float64(124)
memory usage: 453.5 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Columns: 124 entries, Street_Pave to SaleType_Oth
dtypes: float64(124)
memory usage: 678.2 KB
None


Unnamed: 0,Street_Pave,Street_Grvl,LotShape_Reg,LotShape_IR1,LotShape_IR2,LotShape_IR3,LandContour_Lvl,LandContour_Low,LandContour_Bnk,LandContour_HLS,...,GarageType_Basment,SaleType_WD,SaleType_New,SaleType_COD,SaleType_ConLI,SaleType_CWD,SaleType_ConLD,SaleType_ConLw,SaleType_Con,SaleType_Oth
0,1.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,,,0.0,0.0,,1.0
1,1.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,,,0.0,0.0,,1.0
2,1.0,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,,,0.0,0.0,,1.0
3,1.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,,,0.0,0.0,,1.0
4,1.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,,,0.0,0.0,,1.0


In [14]:
Train_Not_ordinal.replace(np.nan, 0, inplace=True)
Val_Not_ordinal.replace(np.nan, 0, inplace=True)
Test_Not_ordinal.replace(np.nan, 0, inplace=True)
Kaggle_Testing_Not_ordinal.replace(np.nan, 0, inplace=True)
# Val_Not_ordinal['SaleType_ConLI'].head()

In [15]:
# Val_Not_ordinal = Val_Not_ordinal.fillna(0)
Train_Not_ordinal = Train_Not_ordinal.fillna(Train_Not_ordinal.median())
Val_Not_ordinal = Val_Not_ordinal.fillna(Val_Not_ordinal.median())
Test_Not_ordinal = Test_Not_ordinal.fillna(Test_Not_ordinal.median())
Kaggle_Testing_Not_ordinal = Kaggle_Testing_Not_ordinal.fillna(Kaggle_Testing_Not_ordinal.median())


# Train_Not_ordinal.dropna( axis=1,
#                 thresh=min_count_Train, inplace=True)
# Val_Not_ordinal.dropna( axis=1,
#                 thresh=min_count_Val, inplace=True)
# Test_Not_ordinal.dropna( axis=1,
#                 thresh=min_count_Test, inplace=True)
# Kaggle_Testing_Not_ordinal.dropna(axis=1,
#                 thresh=min_count_Kaggle_Test, inplace=True)
print(Val_Not_ordinal.info())
print(Train_Not_ordinal.info())
Val_Not_ordinal.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Columns: 124 entries, Street_Pave to SaleType_Oth
dtypes: float64(124)
memory usage: 453.5 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Columns: 124 entries, Street_Pave to SaleType_Oth
dtypes: float64(124)
memory usage: 678.2 KB
None


Unnamed: 0,Street_Pave,Street_Grvl,LotShape_Reg,LotShape_IR1,LotShape_IR2,LotShape_IR3,LandContour_Lvl,LandContour_Low,LandContour_Bnk,LandContour_HLS,...,GarageType_Basment,SaleType_WD,SaleType_New,SaleType_COD,SaleType_ConLI,SaleType_CWD,SaleType_ConLD,SaleType_ConLw,SaleType_Con,SaleType_Oth
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
numeric_features_Train = X_final_train.select_dtypes('number')
numeric_features_Val = X_val.select_dtypes('number')
numeric_features_Test = X_test.select_dtypes('number')
numeric_features_Kaggle_Testing = Kaggle_testing_data.select_dtypes('number')

f_Train = numeric_features_Train.join(Train_ordinal)
f_Val = numeric_features_Val.join(Val_ordinal)
f_Test = numeric_features_Test.join(Test_ordinal)
f_Kaggle_Testing = numeric_features_Kaggle_Testing.join(Kaggle_Testing_ordinal)
        
f_Train = f_Train.fillna(f_Train.median())
f_Val = f_Val.fillna(f_Val.median())
f_Test = f_Test.fillna(f_Test.median())
f_Kaggle_Testing = f_Kaggle_Testing.fillna(f_Kaggle_Testing.median())

# print(f_Train.info())
# f_Val.info()

In [17]:

# f_Val.head(5)

In [18]:
# temp = f_Train.join(Y)

# correlation = temp.corr()
# # # print(correlation)
# correlation.to_csv('../training_featuresss.csv')

# temp.info()

In [19]:


final_Train = f_Train[['OverallQual','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF','1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','GarageYrBlt','GarageCars']].join(Train_Not_ordinal)
final_Val = f_Val[['OverallQual','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF','1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','GarageYrBlt','GarageCars']].join(Val_Not_ordinal)
final_Test = f_Test[['OverallQual','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF','1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','GarageYrBlt','GarageCars']].join(Test_Not_ordinal)
final_Kaggle_Testing = f_Kaggle_Testing[['OverallQual','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF','1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','GarageYrBlt','GarageCars']].join(Kaggle_Testing_Not_ordinal)



final_Train.interpolate(method ='linear', limit_direction ='forward', inplace=True)
final_Train.interpolate(method ='linear', limit_direction ='backward', inplace=True)

final_Val.interpolate(method ='linear', limit_direction ='forward', inplace=True)
final_Val.interpolate(method ='linear', limit_direction ='backward', inplace=True)
# print(final_Val.head(5))

final_Test.interpolate(method ='linear', limit_direction ='forward', inplace=True)
final_Test.interpolate(method ='linear', limit_direction ='backward', inplace=True)

final_Kaggle_Testing.interpolate(method ='linear', limit_direction ='forward', inplace=True)
final_Kaggle_Testing.interpolate(method ='linear', limit_direction ='backward', inplace=True)


# final_Train.to_csv('../checkNOTordinal.csv')
print(final_Train.info())
final_Val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 1130 to 1306
Columns: 135 entries, OverallQual to SaleType_Oth
dtypes: float64(126), int64(9)
memory usage: 759.9 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 468 entries, 1425 to 370
Columns: 135 entries, OverallQual to SaleType_Oth
dtypes: float64(126), int64(9)
memory usage: 513.4 KB


In [20]:
# final_Train.head(30)

In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
for i in final_Train.columns:
    final_Train[i] = scaler.fit_transform(final_Train[[i]])

for i in final_Val.columns:
    final_Val[i] = scaler.fit_transform(final_Val[[i]])

for i in final_Test.columns:
    final_Test[i] = scaler.fit_transform(final_Test[[i]])

for i in final_Kaggle_Testing.columns:
    final_Kaggle_Testing[i] = scaler.fit_transform(final_Kaggle_Testing[[i]])

In [22]:
from sklearn.metrics import mean_squared_log_error


def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [23]:
final_Val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 468 entries, 1425 to 370
Columns: 135 entries, OverallQual to SaleType_Oth
dtypes: float64(135)
memory usage: 513.4 KB


In [24]:
final_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 1130 to 1306
Columns: 135 entries, OverallQual to SaleType_Oth
dtypes: float64(135)
memory usage: 759.9 KB


In [25]:
from sklearn.ensemble import RandomForestRegressor

mod = RandomForestRegressor()
mod.fit(final_Train,y_final_train)
y_pred = mod.predict(final_Val)
compute_rmsle(y_val,y_pred)



0.17

In [27]:
# from sklearn.linear_model import Lasso
# Lasso = Lasso(alpha=2)


# Lasso.fit(final_Train, y_final_train)
# y_pred = Lasso.predict(final_Val)

# compute_rmsle(y_val,y_pred)

* in this cell, I splitted the features to numeric & categorical.  
* I filled the numeric missing data with the median of each feature.  
* I Encoded the categorical features using OrdinalEncoder.  
* I used Interpolator in order to fill the missing values in the categorical features.  
* I merged the features (continuous & categorical) together.  

In [None]:


# categorical_features = X.select_dtypes('object')
# continuous_features = X.select_dtypes('number')
# continuous_features = continuous_features.fillna(continuous_features.median())

# test_continuous_features = Kaggle_testing_data.select_dtypes('number')
# test_categorical_features = Kaggle_testing_data.select_dtypes('object')
# test_continuous_features = test_continuous_features.fillna(test_continuous_features.median())


# cat_encoded = get_Encoded(categorical_features)
# test_cat_encoded = get_Encoded(test_categorical_features)




# cat_encoded.interpolate(method ='linear', limit_direction ='forward', inplace=True)
# cat_encoded.interpolate(method ='linear', limit_direction ='backward', inplace=True)
# f = pd.concat([
#     continuous_features,
#     pd.DataFrame(
#         cat_encoded
#     )
#     ],axis=1)


# test_cat_encoded.interpolate(method ='linear', limit_direction ='forward', inplace=True)
# test_cat_encoded.interpolate(method ='linear', limit_direction ='backward', inplace=True)
# test_f = pd.concat([
#     test_continuous_features,
#     pd.DataFrame(
#         test_cat_encoded
#     )
#     ],axis=1)
# f.head(10)

### Third Step: Feature Selection

* I concatenated SalePrice to my preprocessed_X and I computed the correlation matrix to see if there is any linear correlation between features.  
* I printed the result on a file because it is huge.

In [None]:
# temp = pd.concat([
#     f,
#     pd.DataFrame(
#         Y,
#         index=Y.index,
#         columns=['SalePrice']
#     )
# ],axis=1)

# correlation = temp.corr()
# # print(correlation)
# correlation.to_csv('../training_features.csv')


* I checked the correlation matrix file, I found out that these are the features that have 0.5 or more, -0.5 or less linear correlation wth SalePrice.

In [None]:
# my_final_X = f[['OverallQual','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF',
# '1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','Fireplaces','GarageYrBlt',
# 'GarageCars','GarageArea','ExterQual','BsmtQual','HeatingQC','KitchenQual','GarageFinish']]

# test_my_final_X = test_f[['OverallQual','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF',
# '1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','Fireplaces','GarageYrBlt',
# 'GarageCars','GarageArea','ExterQual','BsmtQual','HeatingQC','KitchenQual','GarageFinish']]

# my_final_X.head(10)

In [None]:
# import matplotlib.pyplot as plt


# plt.figure(figsize=(12, 8))
# plt.scatter(my_final_X['OverallQual'], Y, c='lightgray')
# # plt.scatter(X_val['TV'], y_val, c='black')
# # plt.plot(X_plot, y_single_plot, c='lightblue', linewidth=2, label='single linear')
# # plt.plot(X_plot, y_poly_plot, c='blue', linewidth=2, label='polynomial')

# plt.xlabel("OverallQual")
# plt.ylabel("SalePrice")
# plt.legend()
# plt.show()

* I concatenated the Extracted features with SalePrice and plotted a heatmap of their correlation matrix.  
* I did that in order to remove any redundency between features.

In [None]:
# final_dataset = pd.concat([
#     my_final_X,
#     pd.DataFrame(
#         Y,
#         index=Y.index,
#         columns=['SalePrice']
#     )
# ],axis=1)

# correlation = final_dataset.corr()
# sns.set(rc = {'figure.figsize':(16,10)})
# mask = np.triu(np.ones_like(correlation))
# hmap = sns.heatmap(correlation, annot=True, mask=mask, cmap="Spectral")

- GarageArea and GarageCars feature are strongly positively correlated so I dropped GarageArea feature to avoid redundency and bias.  
- Same thing, 1stFlrSF and TotalBsmtSF features strongly positively correlated so I dropped TotalBsmtSF feature.  
- again same thing for TotRmsAbvGrd and GrLivArea, so I dropped TotRmsAbvGrd feature.

In [None]:
# final_dataset = final_dataset.drop(['GarageArea','TotalBsmtSF', 'TotRmsAbvGrd'], axis=1)
# test_my_final_X = test_my_final_X.drop(['GarageArea','TotalBsmtSF', 'TotRmsAbvGrd'], axis=1)
# # test_my_final_X.info()

* I Scaled our X features in the training and testing sets using StandardScaler.

In [None]:
# from sklearn.preprocessing import StandardScaler
# Y = final_dataset['SalePrice']
# final_dataset_X = final_dataset.drop(['SalePrice'], axis=1)

# scaler = StandardScaler()
# for i in final_dataset_X.columns:
#     final_dataset_X[i] = scaler.fit_transform(final_dataset_X[[i]])

# for i in test_my_final_X.columns:
#     test_my_final_X[i] = scaler.fit_transform(test_my_final_X[[i]])

# print(final_dataset_X.head(10))

### Fourth Step: Split Data, Train Model and Evaluate it 

* I splitted the training set to (train & validate)

In [None]:
# from sklearn.metrics import mean_squared_log_error


# def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
#     rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
#     return round(rmsle, precision)

# # X_train, X_test, y_train, y_test = train_test_split(final_dataset_X, Y, test_size = 0.1, random_state = 0)

I defined a Lasso regression model and trained it and use it to predict.

In [None]:
# from sklearn.linear_model import Lasso
# Lasso = Lasso(alpha=2)


# Lasso.fit(X_train, y_train)
# y_pred = Lasso.predict(X_test)

# compute_rmsle(y_test,y_pred)

* I used Random Forest regression model to test on it as well and compare wit hthe previous model.

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# mod = RandomForestRegressor()
# mod.fit(X_train,y_train)
# y_pred = mod.predict(X_test)
# compute_rmsle(y_test,y_pred)


* I thought of plotting SalePrice & OverallQual (because it is the most correlated feature with SalePrice), and I noticed a curve like parabola  
so I thought of fitting a polinomial regression model (because it is more flexible than linear single regression).  
but it didn't give good results!

In [None]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression

# poly = PolynomialFeatures(degree=2, include_bias=False)
# x_poly_train = poly.fit_transform(X_train)
# x_poly_test = poly.fit_transform(X_test)
# model = LinearRegression()
# model.fit(x_poly_train,y_train)
# y_poly_pred = model.predict(x_poly_test)


# x_poly_test_kaggle = poly.fit_transform(test_my_final_X)

# compute_rmsle(y_test,y_poly_pred)


### Fifth Step: Submission File Preparation

* This cell just to prepare a submission file for Kaggle.

In [None]:
# test_y_pred = model.predict(x_poly_test_kaggle)
# Output = pd.DataFrame()
# Output['Id'] = ids['Id']
# Output['SalePrice'] = pd.DataFrame(test_y_pred, columns=['SalePrice'])
# Output.to_csv('../submission_1.txt', index=False)