In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')
import seaborn as sns

from scipy.stats import randint
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [167]:
#Importing the dataset
df_train= pd.read_csv("train.csv")
df_test= pd.read_csv("test.csv")

In [168]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [169]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [170]:
df_train.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [171]:
# identify the columns of missing data 
missing_data_columns = df_train.columns[df_train.isnull().any()]
print(f'the missing columns: {missing_data_columns}')

the missing columns: Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')


In [172]:
# drop Id column from train data
df_train.drop(['Id'],axis=1, inplace = True)

In [173]:
# create X_train (every column of df_train except SalePrice)
X_train = df_train.drop(['SalePrice'], axis=1)

# create y_train (the price column of df_train)
y_train = df_train['SalePrice']

print(f'Shape of X train:{X_train.shape}')
print(f'Shape of y train:{y_train.shape}')

Shape of X train:(1460, 79)
Shape of y train:(1460,)


In [174]:
# Define the categorical train features
categorical_features = X_train.select_dtypes(include=['object']).columns
print("Categorical features:", categorical_features)

Categorical features: Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [175]:
#create categarical train tranformer Pipeline
categorical_tranformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant", fill_value="missing")),
    ('onhot', OneHotEncoder(handle_unknown="ignore"))

])

In [176]:
# Define the Numerical train features
numerical_features = X_train.select_dtypes(include=['number']).columns
print("Numerical features:", numerical_features)

Numerical features: Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


In [177]:
# create numeric train transformer
numerical_transformer= Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="median")),
    ('minmax scaler', MinMaxScaler())
])

In [178]:
# drop Id column from test data
df_test.drop(['Id'],axis=1, inplace = True)

In [179]:
# show the information of test data
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuil

In [180]:
X_test = df_test

In [181]:
# Define the categorical test features
categorical_test__features = X_test.select_dtypes(include=['object']).columns
print("Categorical y features:", categorical_test__features)

Categorical y features: Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [182]:
#create categarical test tranformer Pipeline
categorical_test_tranformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant", fill_value="missing")),
    ('onhot', OneHotEncoder(handle_unknown="ignore"))

])

In [183]:
# Define the Numerical test features
numerical_test_features = X_test.select_dtypes(include=['number']).columns

In [184]:
# create numeric test tranformer pipeline
numerical_test_transformer= Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="median")),
    ('minmax scaler', MinMaxScaler())
])

In [185]:
# put all tranformer Pipeline's in single ColumnTransformer 
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',categorical_tranformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
        ('cat_y',categorical_test_tranformer, categorical_test__features),
        ('num_y', numerical_test_transformer, numerical_test_features)
    ]
)

In [186]:
# create a dictionary on models
regression_models={
    "linear": LinearRegression(),
    "Ridge": Ridge(),
    "SVR-linear": SVR(kernel="linear"),
    "SVR-rbf": SVR(kernel="rbf"),
    "Decision tree": DecisionTreeRegressor(random_state=0),
    "Random forest":RandomForestRegressor(n_estimators=15)
}

# creat an empty results dictionary
regression_results={}

In [187]:
# loop through the items in regression_models dictionary 
for model_name, model in regression_models.items():

    # create a model Pipeline with a preprocessor step and model step
    model_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # fit the model Pipeline with train data
    print(f"fitting{model_name}....")
    model_pipeline.fit(X_train,y_train)

    # Score the model Pipeline 
    print(f"scoring {model_name}...")
    scores = cross_val_score(model_pipeline, X_train, y_train, cv=5,scoring="r2")
    regression_results[model_name] = scores.mean()


fittinglinear....
scoring linear...
fittingRidge....
scoring Ridge...
fittingSVR-linear....
scoring SVR-linear...
fittingSVR-rbf....
scoring SVR-rbf...
fittingDecision tree....
scoring Decision tree...
fittingRandom forest....
scoring Random forest...


In [188]:
regression_results

{'linear': np.float64(0.7958989714419926),
 'Ridge': np.float64(0.8254777111716848),
 'SVR-linear': np.float64(0.020082370159532247),
 'SVR-rbf': np.float64(-0.05107415259676693),
 'Decision tree': np.float64(0.7422819076596825),
 'Random forest': np.float64(0.8441089448765544)}

In [219]:
# splitting the train data by kf
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [220]:
#create a Random forest Pipoline with Preprocessor as a "preprocessor" and RandomForestRegressor() as a "model"
RandomForest_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=15))
])

In [221]:
# different RandomForestRegressor() hyperparamters
RandomForest_grid= { 'model__n_estimators': randint(50, 200),
                     'model__max_depth': [None, 10, 20, 30],
                      'model__min_samples_split': randint(2, 10),
                        'model__min_samples_leaf': randint(1, 10) }

In [222]:
#import RandomizedSearchCV 
from sklearn.model_selection import RandomizedSearchCV

# Setup an insrance of RandomizedSearchCV with RandomForestRegressor() estimator
# our RandomForest_grid as the param_distributions, a cv of kf and n_iter of 5.
rs_RandomForest = RandomizedSearchCV(estimator=RandomForest_pipeline,
                                  param_distributions= RandomForest_grid,
                                  cv=kf,
                                  n_iter=5,
                                  verbose=1
                                  ,random_state=42)

In [225]:
# fit the RandomForest pipeline with train data
rs_RandomForest.fit(X_train,y_train)

# score the Random forest regressor model
score = cross_val_score(rs_RandomForest, X_train, y_train,cv=kf, scoring="r2")

score

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits


array([0.88392311, 0.86868877, 0.67061732, 0.87205431, 0.89262827])

In [226]:
# Find the best parameters of the RandomizedSearchCV instance using the best_params_ attribute
rs_RandomForest.best_params_

{'model__max_depth': 20,
 'model__min_samples_leaf': 3,
 'model__min_samples_split': 8,
 'model__n_estimators': 124}

In [227]:
# Score the instance if RandomizedSearchCV using the test data
rs_RandomForest.best_score_

np.float64(0.8407055479629577)

In [228]:
# make a prediction on the test data usinf RandomForest Pipeline
Y_pred = rs_RandomForest.predict(X_test)

In [229]:
# type of the Y_pred
type(Y_pred)

numpy.ndarray

In [230]:
#list first 50 record of data predicted data
Y_pred[:50]

array([128896.12647017, 154418.58405676, 174817.46709561, 182874.94616741,
       205078.02627409, 184064.86746869, 170303.70735056, 176778.55773447,
       182467.682823  , 119671.09017414, 196654.88887515,  95545.45970743,
        99624.91919692, 156536.65386473, 145389.03511444, 380618.05154688,
       253059.68844567, 306945.8409526 , 270792.69968924, 452803.82618585,
       307909.2182076 , 213226.16682522, 178603.67869057, 179411.22896875,
       172305.59419535, 198272.75624689, 341520.3081635 , 241173.48935229,
       215713.85444529, 195887.31643836, 190869.03790775,  94556.95052528,
       173563.34922526, 291235.13836667, 295177.89025542, 230132.62482556,
       189217.82719015, 153318.39716399, 151938.02659249, 150629.94515282,
       172933.31577255, 166862.00453941, 285305.32315458, 231493.430539  ,
       216030.09731783, 182879.99542028, 218853.27771418, 201483.70292288,
       164518.06178745, 151919.63448818])