# Project 2: Ames House Price Prediction
---

## Problem Statement

## Import libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge, Lasso, LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

## Load Data

In [2]:
# Locate the data folder. './datasets' (the original location) is preferred; '../datasets'
# (the folder the submission cell writes to) is used as a fallback when train.csv is not
# found in the first one. If neither exists, read_csv raises FileNotFoundError as before.
DATA_DIR = next(
    (folder for folder in (Path('./datasets'), Path('../datasets'))
     if (folder / 'train.csv').is_file()),
    Path('./datasets'),
)
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
train.head()

FileNotFoundError: [Errno 2] No such file or directory: './datasets/train.csv'

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Standardise the headers of both frames: lower-case, with spaces replaced by underscores,
# so train and test columns can be addressed with the same names.
train.columns = train.columns.str.lower().str.replace(' ', '_')
test.columns = test.columns.str.lower().str.replace(' ', '_')

In [None]:
# Keep the test-set ids in their own one-column DataFrame before any columns are dropped.
test_id = test['id'].to_frame()

In [None]:
# Spot-check the renaming on the first five headers of each frame.
for frame_label, frame in (('train', train), ('test', test)):
    print(f'{frame_label} : {frame.columns[:5]}')

## Data Cleaning

In [None]:
# Row and column counts of both frames.
for frame_label, frame in (('train', train), ('test', test)):
    print(f'{frame_label} : {frame.shape}')

In [None]:
#train set data
# Column dtypes and non-null counts for the training frame (used to spot missing data).
train.info()

In [None]:
#test set data
# Column dtypes and non-null counts for the test frame (used to spot missing data).
test.info()

**Comments:** I will create new features first, before dealing with the missing data.

**Features to add:** 
1. age of the house
2. age of remodel
3. age of the garage
4. total square feet of the house

In [None]:
# Age features for train, each measured in years relative to the year of sale.
train_sale_year = train['yr_sold']
train['age_home'] = train_sale_year - train['year_built']
train['age_remodel'] = train_sale_year - train['year_remod/add']
train['age_garage'] = train_sale_year - train['garage_yr_blt']

In [None]:
# Age features for test, each measured in years relative to the year of sale.
test_sale_year = test['yr_sold']
test['age_home'] = test_sale_year - test['year_built']
test['age_remodel'] = test_sale_year - test['year_remod/add']
test['age_garage'] = test_sale_year - test['garage_yr_blt']

Let's take a look at the correlation with the saleprice

In [None]:
# Correlation of every numeric feature with the target, strongest positive first.
# Only numeric/bool columns are correlated: DataFrame.corr() raises on string columns
# in pandas >= 2.0, and older versions silently ignored them anyway.
saleprice_corr = (
    train.select_dtypes(include=['number', 'bool'])
    .corr()[['saleprice']]
    .sort_values(by='saleprice', ascending=False)
)
plt.figure(figsize=(5, 12))
sns.heatmap(saleprice_corr, annot=True, cmap="Spectral")
plt.show()

**Comment:** I will remove features that have a low correlation with saleprice (correlation between -0.2 and 0.2).

In [None]:
# Remove from train the columns judged too weakly related to saleprice (|correlation| <= 0.2).
# An earlier draft of this list (disabled) also removed half_bath, 2nd_flr_sf, id and pid
# and did not include exter_cond or pool_qc.
low_corr_cols = [
    'bsmt_full_bath', 'bsmt_unf_sf', 'bedroom_abvgr',
    'screen_porch', '3ssn_porch', 'exter_cond',
    'mo_sold', 'pool_qc', 'pool_area',
    'bsmtfin_sf_2', 'misc_val', 'yr_sold',
    'low_qual_fin_sf', 'bsmt_half_bath', 'ms_subclass',
    'overall_cond', 'kitchen_abvgr', 'enclosed_porch',
]
train = train.drop(columns=low_corr_cols)

In [None]:
# Remove from test the same columns that were removed from train for weak correlation
# with saleprice (|correlation| <= 0.2).
# An earlier draft of this list (disabled) also removed half_bath, 2nd_flr_sf, id and pid
# and did not include exter_cond or pool_qc.
low_corr_cols = [
    'bsmt_full_bath', 'bsmt_unf_sf', 'bedroom_abvgr',
    'screen_porch', '3ssn_porch', 'exter_cond',
    'mo_sold', 'pool_qc', 'pool_area',
    'bsmtfin_sf_2', 'misc_val', 'yr_sold',
    'low_qual_fin_sf', 'bsmt_half_bath', 'ms_subclass',
    'overall_cond', 'kitchen_abvgr', 'enclosed_porch',
]
test = test.drop(columns=low_corr_cols)

Further explore on the heatmap for **numerical features**

In [None]:
# #heatmap
# mask = np.zeros_like(train.corr()[abs(train.corr()) >= 0.8])
# mask[np.triu_indices_from(mask)] = True

# plt.figure(figsize= (12,8))
# sns.heatmap(train.corr()[abs(train.corr()) >= 0.8], annot= True, cmap = "Spectral", mask = mask)
# title = plt.title('Correlation Heatmap')
# plt.show()

**Comment:** Multicollinearity is found in the training data, which may have a negative impact on the prediction model. The following feature pairs are highly correlated:
1. year_built vs age_garage
2. year_built vs age_home
3. year_built vs garage_yr_blt
4. year_remod/add vs age_remodel
5. total_bsmt_sf vs 1st_flr_sf
6. gr_liv_area vs total_sf
7. gr_liv_area vs totrms_abvgrd
8. garage_yr_blt vs age_garage
9. garage_yr_blt vs age_home
10. garage_cars vs garage_area
11. age_home vs age_garage

Hence, from each pair I will remove the feature that has the lower correlation with saleprice.

In [None]:
# Drop one member of each highly correlated feature pair from train.
collinear_cols = [
    'year_built', 'year_remod/add', '1st_flr_sf', 'totrms_abvgrd',
    'garage_yr_blt', 'garage_cars', 'age_garage',
]
train = train.drop(columns=collinear_cols)

In [None]:
# Drop from test the same collinear features that were dropped from train.
collinear_cols = [
    'year_built', 'year_remod/add', '1st_flr_sf', 'totrms_abvgrd',
    'garage_yr_blt', 'garage_cars', 'age_garage',
]
test = test.drop(columns=collinear_cols)

In [None]:
# Shapes after the correlation-based column removal (train first, then test).
for frame in (train, test):
    print(frame.shape)

Now I will tackle the **missing values**.

In [None]:
#sum of null values in train data
# Per-column NaN counts, largest first; only the 25 columns with the most NaNs are shown.
train.isna().sum().sort_values(ascending=False).head(25)

Clean train data set

In [None]:
# Impute missing values in the training set.
# All fills go through one DataFrame.fillna call and the result is assigned back. The previous
# `train[col].fillna(..., inplace=True)` form operates on a temporary Series: it is deprecated
# in pandas 2.x and leaves `train` unchanged when Copy-on-Write is enabled.
# (pool_qc is not handled here because it was already dropped earlier.)

# Categorical columns where NaN means the house does not have the feature -> 'None'
absent_feature_cols = [
    'misc_feature', 'alley', 'fence', 'fireplace_qu',                            # misc / alley / fence / fireplace
    'garage_finish', 'garage_qual', 'garage_cond', 'garage_type',                # no garage
    'bsmt_exposure', 'bsmtfin_type_2', 'bsmtfin_type_1', 'bsmt_cond', 'bsmt_qual',  # no basement
    'mas_vnr_type',                                                              # no masonry veneer
]
# Numeric columns where NaN means the feature is absent -> 0
zero_fill_cols = ['mas_vnr_area', 'total_bsmt_sf', 'bsmtfin_sf_1', 'garage_area']

train_fill_values = {col: 'None' for col in absent_feature_cols}
train_fill_values.update({col: 0 for col in zero_fill_cols})
# assuming every house has a frontage, the missing values get the column mean
train_fill_values['lot_frontage'] = train['lot_frontage'].mean()

train = train.fillna(value=train_fill_values)

In [None]:
#sum of null values in test data
# Per-column NaN counts, largest first; only the 25 columns with the most NaNs are shown.
test.isna().sum().sort_values(ascending=False).head(25)

Clean test data set

In [None]:
# Impute the test set with the SAME rules used for the training set, so both frames encode
# "feature absent" identically before one-hot encoding.
# Previously NaN categoricals were replaced by the most frequent level (e.g. a house without a
# garage received the most common garage type) and numeric gaps by test-set means, which
# disagreed with the 'None' / 0 fills applied to train. The chained
# `test[col].fillna(..., inplace=True)` form was also deprecated in pandas 2.x and is a no-op
# under Copy-on-Write, so the fills are now applied through one DataFrame.fillna call.
# (pool_qc is not handled here because it was already dropped earlier.)

# Categorical columns where NaN means the house does not have the feature -> 'None'
absent_feature_cols = [
    'misc_feature', 'alley', 'fence', 'fireplace_qu',
    'garage_finish', 'garage_qual', 'garage_cond', 'garage_type',
    'bsmt_exposure', 'bsmtfin_type_2', 'bsmtfin_type_1', 'bsmt_cond', 'bsmt_qual',
    'mas_vnr_type',
]
# Numeric columns where NaN means the feature is absent -> 0
zero_fill_cols = ['mas_vnr_area', 'total_bsmt_sf', 'bsmtfin_sf_1', 'garage_area']

test_fill_values = {col: 'None' for col in absent_feature_cols}
test_fill_values.update({col: 0 for col in zero_fill_cols})
# lot_frontage: use the training mean so that no statistic is learned from the test set
test_fill_values['lot_frontage'] = train['lot_frontage'].mean()
# electrical is not a "feature absent" column in the training rules; as in the original cell,
# its gaps get the most frequent level
test_fill_values['electrical'] = test['electrical'].mode()[0]

test = test.fillna(value=test_fill_values)

Check again for any missing values

In [None]:
# Re-count NaNs per column (largest first) in both frames after imputation.
train_null_counts = train.isna().sum().sort_values(ascending=False)
test_null_counts = test.isna().sum().sort_values(ascending=False)
print('---train---', train_null_counts, '', '---test---', test_null_counts, sep='\n')

**Comment:** Now, we have 0 missing values.

I will plot scatter graphs to check whether there are any **outliers** in the non-categorical features.

In [None]:
#list of non-categorical features for train data set
# select_dtypes is the public replacement for the private DataFrame._get_numeric_data();
# bool is included because _get_numeric_data also keeps boolean columns.
train_num_cols = train.select_dtypes(include=['number', 'bool'])
train_num_cols.columns.to_list()

In [None]:
# Scatter every numeric feature against saleprice to look for outliers.
# Features are plotted in descending order of their correlation with saleprice;
# the target itself is skipped.
ranked_by_corr = train_num_cols.corr()[['saleprice']].sort_values(by='saleprice', ascending=False)
for feature in ranked_by_corr.index:
    if feature == 'saleprice':
        continue
    plt.ylabel('saleprice')
    plt.xlabel(feature)
    sns.scatterplot(data=train_num_cols, x=feature, y='saleprice')
    plt.show()

From the scatter graphs I have identified some outliers and will remove them accordingly.

In [None]:
# Outlier filter: keep rows whose gr_liv_area is at most 4000.
within_limit = train['gr_liv_area'] <= 4000
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['gr_liv_area'] > 4000].count().sum()

In [None]:
# Outlier filter: keep rows whose garage_area is at most 1400.
within_limit = train['garage_area'] <= 1400
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['garage_area'] > 1400].count().sum()

In [None]:
# Outlier filter: keep rows whose total_bsmt_sf is at most 5000.
within_limit = train['total_bsmt_sf'] <= 5000
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['total_bsmt_sf'] > 5000].count().sum()

In [None]:
# Outlier filter: keep rows whose mas_vnr_area is at most 1400.
within_limit = train['mas_vnr_area'] <= 1400
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['mas_vnr_area'] > 1400].count().sum()

In [None]:
# Outlier filter: keep rows whose bsmtfin_sf_1 is at most 3000.
within_limit = train['bsmtfin_sf_1'] <= 3000
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['bsmtfin_sf_1'] > 3000].count().sum()

In [None]:
# Outlier filter: keep rows whose open_porch_sf is at most 500.
within_limit = train['open_porch_sf'] <= 500
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['open_porch_sf'] > 500].count().sum()

In [None]:
# Outlier filter: keep rows whose wood_deck_sf is at most 1000.
within_limit = train['wood_deck_sf'] <= 1000
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['wood_deck_sf'] > 1000].count().sum()

In [None]:
# Outlier filter: keep rows whose lot_frontage is at most 300.
within_limit = train['lot_frontage'] <= 300
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['lot_frontage'] > 300].count().sum()

In [None]:
# Outlier filter: keep rows whose lot_area is at most 100000.
within_limit = train['lot_area'] <= 100000
train = train.loc[within_limit]
# Check: non-null cell count of rows still above the cut-off (0 means none remain).
train.loc[train['lot_area'] > 100000].count().sum()

In [None]:
# Consistency filter: keep rows whose age_remodel is not negative
# (a negative value means the recorded remodel year is after the sale year).
non_negative_age = train['age_remodel'] >= 0
train = train.loc[non_negative_age]
# Check: non-null cell count of rows still below zero (0 means none remain).
train.loc[train['age_remodel'] < 0].count().sum()

**Comment:** Outliers removed

Let's take a look at the distribution of the non-categorical features

In [None]:
# Histogram (with KDE) of every numeric feature, with dashed lines at the mean and median.
# The numeric view is rebuilt from the CURRENT `train`: the earlier `train_num_cols` snapshot was
# taken before the outlier rows were removed, so the bars used to include rows that the
# mean/median lines (computed from the filtered `train`) did not.
numeric_train = train.select_dtypes(include=['number', 'bool'])
ranked_cols = numeric_train.corr()[['saleprice']].sort_values('saleprice', ascending=False).index
for col in ranked_cols:
    plt.figure(figsize=(8, 5))
    sns.histplot(data=numeric_train, x=col, kde=True, color='teal', alpha=0.3)
    plt.axvline(x=numeric_train[col].mean(), color='crimson', linestyle='dashed', label="mean", alpha=1)
    plt.axvline(x=numeric_train[col].median(), color='purple', linestyle='dashed', label="median", alpha=1)
    plt.title(f'Distribution of {col}')
    plt.legend(['kde', 'mean', 'median'])
    plt.show()

**Comment:** The distribution of the target `saleprice` is positively skewed. I will apply a log transformation to bring the target closer to a normal distribution, which should help the linear regression model.

In [None]:
#to make more normal distribution for saleprice
#train = np.log(train['saleprice'])         #to log later
#plt.figure(figsize=(8,5))
#sns.histplot(train['saleprice'], color = 'teal', kde = True, alpha = 0.3)
#plt.axvline(x= train['saleprice'].mean(), color = 'crimson', linestyle = 'dashed', label = 'mean', alpha = 1)
#plt.axvline(x= train['saleprice'].median(), color = 'purple', linestyle = 'dashed', label = 'mean', alpha = 1)
#plt.show()

Next, i will examine the rest of the non-categorical features and use log transformation to fix them, if necessary.

In [None]:
#to log transform the rest of the non-categorical features which are positively skewed.
#train['mas_vnr_area'] = np.log1p(train['mas_vnr_area'])
#train['open_porch_sf'] = np.log1p(train['open_porch_sf'])
#train['wood_deck_sf'] = np.log1p(train['wood_deck_sf'])
#train['bsmt_full_bath'] = np.log1p(train['bsmt_full_bath'])
#train['half_bath'] = np.log1p(train['half_bath'])
#train['2nd_flr_sf'] = np.log1p(train['2nd_flr_sf'])
#train['age_remodel'] = np.log1p(train['age_remodel'])

**Comment:** Above are the features selected for log transformation. Because log(0) is undefined (it tends to negative infinity), I will use `log1p`, which computes log(1 + x), instead.

Now, i will examine for **categorical features**

In [None]:
# Object-dtype (categorical) part of the training frame, followed by its column names.
train_non_num_cols = train.select_dtypes(include='object')
list(train_non_num_cols.columns)

In [None]:
# Object-dtype (categorical) part of the test frame, followed by its column names.
test_non_num_cols = test.select_dtypes(include='object')
list(test_non_num_cols.columns)

In [None]:
# One boxplot per categorical feature, showing how saleprice is distributed across its levels.
for cat_feature in train_non_num_cols.columns:
    plt.figure(figsize=(7, 5))
    sns.boxplot(data=train, x=cat_feature, y='saleprice')
    plt.show()

After analyzing the boxplots, I will remove the features that have an insignificant impact on saleprice.

**Comment:** <br> 
To drop: 
1. Alley - Although the majority of the houses have no alley, many of them still fetch a higher selling price.
2. Utilities - Almost all of the houses have all public utilities, hence it has no useful information for our model.
3. Lot Config - Overlapping without any clear distinction.
4. Land Slope - Overlapping without any clear distinction.
5. Heating - Majority of the houses using 'GasA' type heating, hence will not provide any meaningful information.
6. Pool QC -  Majority of the houses has no pool, hence will not provide any meaningful information.
7. Fence - The majority of houses have no fence, yet they fetch a price similar to that of houses with a good-privacy fence.
8. Misc Feature - Majority of the houses has no miscellaneous feature.

In [None]:
# Drop from train the categorical features judged uninformative after the boxplot review.
# pool_qc is absent from the list because it was already dropped earlier.
uninformative_cat_cols = [
    'ms_zoning', 'alley', 'lot_shape', 'utilities',
    'roof_style', 'heating', 'electrical', 'lot_config',
    'land_slope', 'paved_drive', 'fence', 'misc_feature',
]
train = train.drop(columns=uninformative_cat_cols)

In [None]:
# Drop from test the same categorical features that were dropped from train.
# pool_qc is absent from the list because it was already dropped earlier.
uninformative_cat_cols = [
    'ms_zoning', 'alley', 'lot_shape', 'utilities',
    'roof_style', 'heating', 'electrical', 'lot_config',
    'land_slope', 'paved_drive', 'fence', 'misc_feature',
]
test = test.drop(columns=uninformative_cat_cols)

In [None]:
# Shapes after the categorical column removal (train first, then test).
for frame in (train, test):
    print(frame.shape)

In [None]:
#for col in list(train_non_numerical_cols):
#    sns.histplot(data = train, x = col)
 #   plt.show()

I will change the quality and condition to numeric. 

In [None]:
#changing the quality to numeric
#train['exter_qual'] = train['exter_qual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po' : 1, 'None' : 0})
#train['bsmt_qual'] = train['bsmt_qual'].map({'Ex' : 5, 'Gd' : 4, 'TA': 3, 'Fa' : 2, 'Po' : 1, 'None' : 0})
#train['bsmt_cond'] = train['bsmt_cond'].map({'Ex' : 5, 'Gd' : 4, 'TA': 3, 'Fa' : 2, 'Po' : 1, 'None' : 0})
#train['heating_qc'] = train['heating_qc'].map({'Ex' : 5, 'Gd' : 4, 'TA': 3, 'Fa' : 2, 'Po' : 1, 'None' : 0})
#train['kitchen_qual'] = train['kitchen_qual'].map({'Ex' : 5, 'Gd' : 4, 'TA': 3, 'Fa' : 2, 'Po' : 1, 'None' : 0})
#train['garage_qual'] = train['garage_qual'].map({'Ex' : 5, 'Gd' : 4, 'TA': 3, 'Fa' : 2, 'Po' : 1, 'None' : 0})
#train['garage_cond'] = train['garage_cond'].map({'Ex' : 5, 'Gd' : 4, 'TA': 3, 'Fa' : 2, 'Po' : 1, 'None' : 0})

In [None]:
#changing the quality to numeric
#rating = {'Ex': 5, 'Gd': 4, 'TA':3 , 'Fa': 2, 'Po': 1, 'None': 0}
#train['exter_qual'] = train['exter_qual'].replace(rating)
#train['bsmt_qual'] = train['bsmt_qual'].replace(rating)
#train['bsmt_cond'] = train['bsmt_cond'].replace(rating)
#train['heating_qc'] = train['heating_qc'].replace(rating)
#train['kitchen_qual'] = train['kitchen_qual'].replace(rating)
#train['garage_qual'] = train['garage_qual'].replace(rating)
#train['garage_cond'] = train['garage_cond'].replace(rating)

In [None]:
#changing the quality to numeric
#test['exter_qual'] = test['exter_qual'].replace(rating)
#test['bsmt_qual'] = test['bsmt_qual'].replace(rating)
#test['bsmt_cond'] = test['bsmt_cond'].replace(rating)
#test['heating_qc'] = test['heating_qc'].replace(rating)
#test['kitchen_qual'] = test['kitchen_qual'].replace(rating)
#test['garage_qual'] = test['garage_qual'].replace(rating)
#test['garage_cond'] = test['garage_cond'].replace(rating)

Converting categorical columns into a one-hot encoded matrix

In [None]:
# Names of the object-dtype columns in train; these are the columns to one-hot encode.
train_cat_var = [name for name, dtype in train.dtypes.items() if dtype == 'object']

In [None]:
# Display the list of categorical column names that will be encoded.
train_cat_var

In [None]:
# One-hot encode the categorical columns of train; drop_first=True omits the first level of
# each feature so the dummy columns are not perfectly collinear.
train = pd.get_dummies(data=train, columns=train_cat_var,drop_first=True)

In [None]:
# Preview the encoded training frame.
train.head()

In [None]:
# Names of the object-dtype columns in test; these are the columns to one-hot encode.
test_cat_vars = [name for name, dtype in test.dtypes.items() if dtype == 'object']

In [None]:
# One-hot encode the categorical columns of test with the same settings as train.
# Levels present in only one frame produce columns the other frame lacks; these are
# reconciled in the column-alignment cell further down.
test = pd.get_dummies(data=test, columns=test_cat_vars,drop_first=True)

In [None]:
# Preview the encoded test frame.
test.head()

In [None]:
# Shapes after one-hot encoding (train first, then test).
for frame in (train, test):
    print(frame.shape)

Create list of features including dummies

In [None]:
# features = [col for col in train._get_numeric_data().columns if col != ['saleprice']]

In [None]:
# missing_cols = set(train.columns) - set(test.columns)
# # Add a missing column in test with default value equal to 0
# for c in missing_cols:
#     test[c] = 0
# # Ensure the order of columns in test is the same as in train
# test = test[features]

In [None]:
#bookmark

In [None]:
# Inspect the column index of the encoded training frame.
train.columns

In [None]:
# Inspect the column index of the encoded test frame.
test.columns

In [None]:
# Snapshot of train's column names as a plain list (taken before the alignment step).
train_column_list = list(train.columns)

In [None]:
# Snapshot of test's column names as a plain list (taken before the alignment step).
test_column_list = list(test.columns)

In [None]:
# Align the two frames on the features they have in common.
# - train keeps 'saleprice' plus every column that also exists in test (original order kept);
# - test keeps exactly those shared features, re-ordered to match train.
# The selection is computed from the current frames in one step instead of dropping columns one
# at a time from pre-computed name lists, and it also guarantees identical column order, which
# the scaler and the models rely on.
test_columns = set(test.columns)
train_keep = [col for col in train.columns if col == 'saleprice' or col in test_columns]
shared_features = [col for col in train_keep if col != 'saleprice']

train = train[train_keep]
test = test[shared_features]

In [None]:
# Shapes after column alignment (train first, then test).
for frame in (train, test):
    print(frame.shape)

**Comment:** Columns are aligned

# Model

**Train test split**

In [None]:
# Target: natural log of saleprice. Features: every remaining column of train.
y = np.log(train['saleprice'])
X = train.loc[:, train.columns != 'saleprice']

# Confirm that X and y have the same number of rows.
for label, data in (('X: ', X), ('y: ', y)):
    print(label, data.shape)

In [None]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#instantiate standard scaler
ss = StandardScaler()
#fit the scaler on X_train only, then transform X_train
X_train_sc = ss.fit_transform(X_train)
#transform X_test with the statistics learned from X_train (no refit)
X_test_sc = ss.transform(X_test)

#standardize the Kaggle test set with the SAME fitted scaler.
#Calling fit_transform here would re-estimate mean/std from the test set, so the models
#(trained on X_train's scaling) would receive differently scaled inputs.
#Columns are selected in X_train's order so the features line up with what the scaler was fitted on.
test_sc = ss.transform(test[X_train.columns])

**Linear Regression**

In [None]:
#instantiate and fit model
lr = LinearRegression()
lr.fit(X_train_sc, y_train)

In [None]:
# R^2 of the linear model on the training split, the validation split, and averaged over
# 5-fold cross-validation of the training split (all on the log-price scale).
lr_train_score = lr.score(X_train_sc, y_train)
lr_test_score = lr.score(X_test_sc, y_test)
lr_cv_score = cross_val_score(lr, X_train_sc, y_train, cv=5).mean()
print(f'Training Score: {lr_train_score}')
print(f'Testing Score: {lr_test_score}')
print(f'Cross Val Score: {lr_cv_score}')

In [None]:
#coefficients
lr.coef_

In [None]:
#create a dictionary
dict(zip(train.columns,lr.coef_))

In [None]:
#y intercept
lr.intercept_

In [None]:
# Predict on the validation split and undo the log transform so values are sale prices again.
lr_log_prediction = lr.predict(X_test_sc)
lr_predicition = np.exp(lr_log_prediction)
# Actual validation sale prices on the same (un-logged) scale, reused by the metric cells.
y_test_exp = np.exp(y_test)

In [None]:
# Validation metrics for linear regression on the sale-price scale: MSE, RMSE and R^2.
lr_mse = mean_squared_error(y_test_exp, lr_predicition)
print(f'MSE: {lr_mse}')
print(f'RMSE: {np.sqrt(lr_mse)}')
print(f'R^2: {r2_score(y_test_exp, lr_predicition)}')

**Comment:** The model is overfit and performs poorly.

In [None]:
#baseline model
# Baseline: always predict the mean training sale price.
# y_train holds LOG prices, so it is converted back with exp before averaging; using
# np.mean(y_train) directly compared a log-scale constant with sale-price-scale targets.
baseline_price = np.exp(y_train).mean()
baseline_pred = np.full(len(y_test_exp), baseline_price)
baseline_mse = mean_squared_error(y_test_exp, baseline_pred)
print(f'MSE of baseline model: {baseline_mse}')
print(f'RMSE of baseline model: {np.sqrt(baseline_mse)}')
print(f'R^2 of baseline model: {r2_score(y_test_exp, baseline_pred)}')

**Ridge**

In [None]:
# Candidate penalties: 100 values spaced logarithmically from 10^0 to 10^5.
ridge_alphas = np.logspace(0, 5, 100)

# RidgeCV picks the alpha with the best 5-fold cross-validated score on the training split.
ridge_model = RidgeCV(alphas=ridge_alphas, cv=5).fit(X_train_sc, y_train)

print (ridge_model.alpha_)

In [None]:
# R^2 of the ridge model: training split, validation split, then the mean over 5-fold
# cross-validation of the training split (all on the log-price scale).
ridge_train_score = ridge_model.score(X_train_sc, y_train)
ridge_test_score = ridge_model.score(X_test_sc, y_test)
ridge_cv_score = cross_val_score(ridge_model, X_train_sc, y_train, cv=5).mean()
print(ridge_train_score)
print(ridge_test_score)
print(ridge_cv_score)

In [None]:
# Ridge coefficients, one per standardised feature in the column order of X_train.
ridge_model.coef_

In [None]:
# Map each feature name to its ridge coefficient.
# The names come from X_train (the frame the model was fitted on): train.columns still
# contains 'saleprice', which would shift every name after it onto the wrong coefficient.
dict(zip(X_train.columns, ridge_model.coef_))

In [None]:
#y intercept
ridge_model.intercept_

In [None]:
#generate predictions
# The model predicts log(saleprice); exp converts back to sale prices so the values are
# comparable with y_test_exp in the metrics cell (as is done for the linear and lasso models).
ridge_predicition = np.exp(ridge_model.predict(X_test_sc))

In [None]:
# Validation metrics for ridge against the un-logged sale prices: MSE, RMSE and R^2.
ridge_mse = mean_squared_error(y_test_exp, ridge_predicition)
print(f'MSE: {ridge_mse}')
print(f'RMSE: {np.sqrt(ridge_mse)}')
print(f'R^2: {r2_score(y_test_exp, ridge_predicition)}')

**Lasso**

In [None]:
# LassoCV builds its own path of 500 alphas and selects one by 5-fold cross-validation.
# (An explicit grid, np.logspace(-5, 5, 500) with max_iter=50000, was tried earlier and is disabled.)
lasso_model = LassoCV(n_alphas=500, cv=5, verbose=1).fit(X_train_sc, y_train)

print (lasso_model.alpha_)

In [None]:
# Refit a plain Lasso at the alpha chosen by LassoCV.
# NOTE(review): `lasso` is not referenced by any later cell shown here; scoring and
# predictions below use `lasso_model`.
lasso = Lasso(alpha=lasso_model.alpha_).fit(X_train_sc, y_train)

In [None]:
# R^2 of the lasso model: training split, validation split, then the mean over 5-fold
# cross-validation of the training split (all on the log-price scale).
lasso_train_score = lasso_model.score(X_train_sc, y_train)
lasso_test_score = lasso_model.score(X_test_sc, y_test)
lasso_cv_score = cross_val_score(lasso_model, X_train_sc, y_train, cv=5).mean()
print(lasso_train_score)
print(lasso_test_score)
print(lasso_cv_score)

In [None]:
# Lasso coefficients, one per standardised feature in the column order of X_train.
lasso_model.coef_

In [None]:
# Map each feature name to its lasso coefficient.
# The names come from X_train (the frame the model was fitted on): train.columns still
# contains 'saleprice', which would shift every name after it onto the wrong coefficient.
dict(zip(X_train.columns, lasso_model.coef_))

In [None]:
#y intercept
# Intercept of the lasso model (on the log-saleprice scale, since y is log-transformed).
lasso_model.intercept_

In [None]:
#generate predictions
# Validation-split predictions, mapped from the log scale back to sale prices with exp
# so they can be compared with y_test_exp.
lasso_predicition = np.exp(lasso_model.predict(X_test_sc))

In [None]:
# Validation metrics for lasso against the un-logged sale prices: MSE, RMSE and R^2.
lasso_mse = mean_squared_error(y_test_exp, lasso_predicition)
print(f'MSE: {lasso_mse}')
print(f'RMSE: {np.sqrt(lasso_mse)}')
print(f'R^2: {r2_score(y_test_exp, lasso_predicition)}')

Submission to Kaggle

In [None]:
# Predict log prices for the scaled Kaggle test set with the LassoCV model and convert them
# back to sale prices with exp (the target was log-transformed before fitting).
y_pred = np.exp(lasso_model.predict(test_sc))

In [None]:
# Submission frame: predicted sale prices with the test ids placed in front as the first column.
# The ids are matched to the predictions by row index.
submission = pd.DataFrame(y_pred, columns=['saleprice'])
submission.insert(0, 'id', test['id'])

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Summarise the submission frame (row count, dtypes, non-null counts).
# info is a method: without the parentheses the cell only displayed the bound-method repr.
submission.info()

In [None]:
# Write the submission file without the index column.
# NOTE(review): this writes to ../datasets while the data-loading cell looks in ./datasets;
# confirm which folder is intended.
submission.to_csv('../datasets/Lasso_06.csv', index=False)