In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = pd.read_csv('HPP(train).csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Data Cleaning, Feature Engineering, Outlier Detection and Removal

In [3]:
df1 = df[['LotFrontage','LotArea','LotShape','LandContour','LandSlope','LotConfig',
         'Neighborhood','BldgType','HouseStyle','YearBuilt','TotalBsmtSF','BedroomAbvGr',
        'FullBath','HalfBath','GarageArea','PoolArea','SalePrice']]
df1.head()

Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,LandSlope,LotConfig,Neighborhood,BldgType,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,FullBath,HalfBath,GarageArea,PoolArea,SalePrice
0,65.0,8450,Reg,Lvl,Gtl,Inside,CollgCr,1Fam,2Story,2003,856,3,2,1,548,0,208500
1,80.0,9600,Reg,Lvl,Gtl,FR2,Veenker,1Fam,1Story,1976,1262,3,2,0,460,0,181500
2,68.0,11250,IR1,Lvl,Gtl,Inside,CollgCr,1Fam,2Story,2001,920,3,2,1,608,0,223500
3,60.0,9550,IR1,Lvl,Gtl,Corner,Crawfor,1Fam,2Story,1915,756,3,1,0,642,0,140000
4,84.0,14260,IR1,Lvl,Gtl,FR2,NoRidge,1Fam,2Story,2000,1145,4,2,1,836,0,250000


In [4]:
df2 = df1.copy()
df2.shape

(1460, 17)

In [5]:
df2.dropna(inplace = True)
df2.isna().sum()

LotFrontage     0
LotArea         0
LotShape        0
LandContour     0
LandSlope       0
LotConfig       0
Neighborhood    0
BldgType        0
HouseStyle      0
YearBuilt       0
TotalBsmtSF     0
BedroomAbvGr    0
FullBath        0
HalfBath        0
GarageArea      0
PoolArea        0
SalePrice       0
dtype: int64

In [6]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotFrontage   1201 non-null   float64
 1   LotArea       1201 non-null   int64  
 2   LotShape      1201 non-null   object 
 3   LandContour   1201 non-null   object 
 4   LandSlope     1201 non-null   object 
 5   LotConfig     1201 non-null   object 
 6   Neighborhood  1201 non-null   object 
 7   BldgType      1201 non-null   object 
 8   HouseStyle    1201 non-null   object 
 9   YearBuilt     1201 non-null   int64  
 10  TotalBsmtSF   1201 non-null   int64  
 11  BedroomAbvGr  1201 non-null   int64  
 12  FullBath      1201 non-null   int64  
 13  HalfBath      1201 non-null   int64  
 14  GarageArea    1201 non-null   int64  
 15  PoolArea      1201 non-null   int64  
 16  SalePrice     1201 non-null   int64  
dtypes: float64(1), int64(9), object(7)
memory usage: 168.9+ KB


In [7]:
df2['Bathrooms'] = df2['FullBath'] + df2['HalfBath']

In [8]:
df2.head()

Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,LandSlope,LotConfig,Neighborhood,BldgType,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,FullBath,HalfBath,GarageArea,PoolArea,SalePrice,Bathrooms
0,65.0,8450,Reg,Lvl,Gtl,Inside,CollgCr,1Fam,2Story,2003,856,3,2,1,548,0,208500,3
1,80.0,9600,Reg,Lvl,Gtl,FR2,Veenker,1Fam,1Story,1976,1262,3,2,0,460,0,181500,2
2,68.0,11250,IR1,Lvl,Gtl,Inside,CollgCr,1Fam,2Story,2001,920,3,2,1,608,0,223500,3
3,60.0,9550,IR1,Lvl,Gtl,Corner,Crawfor,1Fam,2Story,1915,756,3,1,0,642,0,140000,1
4,84.0,14260,IR1,Lvl,Gtl,FR2,NoRidge,1Fam,2Story,2000,1145,4,2,1,836,0,250000,3


In [9]:
import openpyxl

In [10]:
df2.to_excel(r'C:\Users\ADMIN\HPP(clean).xlsx', index = False, header = True)

def regression_model(X,y):
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import Lasso
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LogisticRegression
    from sklearn import svm
    from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
    from sklearn.model_selection import train_test_split
    #X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)
    #rg = model.fit(X_train,y_train)
    #rg_score = model.score(X_test,y_test)
    #print('Model Score:',rg_score)
    algorithm = {'Linear Regression':{'model':LinearRegression(),'params':{'normalize':[True,False]}},
            'Lasso':{'model':Lasso(),'params':{'alpha': [1,2],'selection': ['random', 'cyclic']}},
            'Decision Tree':{'model':DecisionTreeRegressor(),'params':{'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']}},
            'Logistic Regression':{'model':LogisticRegression(),
                                   'params':{'solver':['newton-cg','lbfgs', 'liblinear', 'sag', 'saga'],
                                            'C': np.logspace(-4, 4, 50),'penalty':['l1', 'l2']}},
            'SVM':{'model':svm.SVC(),'params':{'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}},
            'Random Forest':{'model':RandomForestRegressor(),'params':{'bootstrap': [True, False],
                                                                       'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                                                                       'max_features': ['auto', 'sqrt'],
                                                                       'min_samples_leaf': [1, 2, 4],
                                                                       'min_samples_split': [2, 5, 10],
                                                                       'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]}},
            'GradientBoostRegressor':{'model':GradientBoostingRegressor(),'params':{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90],
                                                                                   'n_estimators':[500,1000,2000],
                                                                                    'learning_rate':[.001,0.01,.1],
                                                                                   'subsample':[.5,.75,1]}}}
    from sklearn.model_selection import ShuffleSplit
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    from sklearn.model_selection import GridSearchCV
    scores = []
    for algo, config in algorithm.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])
    

In [11]:
df3 = df2.drop(['LotShape','LandContour','LandSlope','LotConfig','BldgType','FullBath','HalfBath','PoolArea'],
              axis = 'columns')
df3.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,GarageArea,SalePrice,Bathrooms
0,65.0,8450,CollgCr,2Story,2003,856,3,548,208500,3
1,80.0,9600,Veenker,1Story,1976,1262,3,460,181500,2
2,68.0,11250,CollgCr,2Story,2001,920,3,608,223500,3
3,60.0,9550,Crawfor,2Story,1915,756,3,642,140000,1
4,84.0,14260,NoRidge,2Story,2000,1145,4,836,250000,3


In [18]:
df3['Price per sqft'] = df3['SalePrice']*100/df3['LotArea']
df3.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,GarageArea,SalePrice,Bathrooms,Price per sqft
0,65.0,8450,CollgCr,2Story,2003,856,3,548,208500,3,2467.455621
1,80.0,9600,Veenker,1Story,1976,1262,3,460,181500,2,1890.625
2,68.0,11250,CollgCr,2Story,2001,920,3,608,223500,3,1986.666667
3,60.0,9550,Crawfor,2Story,1915,756,3,642,140000,1,1465.968586
4,84.0,14260,NoRidge,2Story,2000,1145,4,836,250000,3,1753.15568


In [23]:
df3[df3['Price per sqft'] < 500]

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,GarageArea,SalePrice,Bathrooms,Price per sqft
30,50.0,8500,IDOTRR,2Story,1920,649,3,250,40000,1,470.588235
313,150.0,215245,Timber,1Story,1965,2136,3,513,375000,2,174.220075
411,100.0,34650,Gilbert,1Story,1955,1056,3,572,145000,1,418.470418
451,62.0,70761,ClearCr,1Story,1975,1533,2,576,280000,2,395.698195
495,60.0,7879,IDOTRR,1Story,1920,720,2,0,34900,1,442.949613
523,130.0,40094,Edwards,2Story,2007,3138,3,884,184750,4,460.792138
916,50.0,9000,IDOTRR,1Story,1949,480,1,308,35311,0,392.344444
1061,120.0,18000,IDOTRR,1Story,1935,894,2,1248,81000,1,450.0
1298,313.0,63887,Edwards,2Story,2008,6110,3,1418,160000,3,250.442187
1453,90.0,17217,Mitchel,1Story,2006,1140,3,0,84500,1,490.793983


In [24]:
df4 = df3[~((df3.BedroomAbvGr==3) & (df3.LotArea > 13000))]
df4.shape

(1118, 11)

In [26]:
df5 = df4[~(df4['Price per sqft'] < 500)]

In [27]:
df5.shape

(1113, 11)

In [39]:
df6 = df5[~(df5.LotFrontage < 30)]

In [138]:
df7 = df6[~(df6['Price per sqft'] > 7000)]
df7.shape

(1067, 11)

In [273]:
df8 = df7[~(df7.BedroomAbvGr < df7.Bathrooms)]
df8.shape

(1029, 11)

In [274]:
df9 = df8.drop(['GarageArea'], axis = 'columns')
df9.shape

(1029, 10)

In [275]:
location_stats = df9.Neighborhood.value_counts()

In [276]:
len(location_stats[location_stats<=30])

10

In [277]:
location_stats_less_than_20 = location_stats[location_stats<=20]
location_stats_less_than_20

StoneBr    14
Blmngtn    10
ClearCr     9
MeadowV     5
Veenker     4
NPkVill     2
Name: Neighborhood, dtype: int64

In [278]:
df10 = df9[~(df9.TotalBsmtSF < 300)]
df11 = df10.copy()
df11.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,SalePrice,Bathrooms,Price per sqft
0,65.0,8450,CollgCr,2Story,2003,856,3,208500,3,2467.455621
1,80.0,9600,Veenker,1Story,1976,1262,3,181500,2,1890.625
2,68.0,11250,CollgCr,2Story,2001,920,3,223500,3,1986.666667
3,60.0,9550,Crawfor,2Story,1915,756,3,140000,1,1465.968586
4,84.0,14260,NoRidge,2Story,2000,1145,4,250000,3,1753.15568


In [279]:
df12 = df11.drop(['Price per sqft'], axis = 'columns')
df12.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,SalePrice,Bathrooms
0,65.0,8450,CollgCr,2Story,2003,856,3,208500,3
1,80.0,9600,Veenker,1Story,1976,1262,3,181500,2
2,68.0,11250,CollgCr,2Story,2001,920,3,223500,3
3,60.0,9550,Crawfor,2Story,1915,756,3,140000,1
4,84.0,14260,NoRidge,2Story,2000,1145,4,250000,3


In [280]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [282]:
df12.nunique()

LotFrontage      99
LotArea         720
Neighborhood     23
HouseStyle        8
YearBuilt       110
TotalBsmtSF     562
BedroomAbvGr      8
SalePrice       512
Bathrooms         5
dtype: int64

In [303]:
year_stats = df12.YearBuilt.value_counts()
year_stats[year_stats<=2]

1885    2
1892    2
1986    2
1919    2
1989    2
1908    2
1890    2
1981    2
1983    1
1882    1
1904    1
1875    1
1898    1
1913    1
1905    1
1906    1
1911    1
1942    1
1917    1
1985    1
1872    1
Name: YearBuilt, dtype: int64

In [307]:
df12 = df12[~((df12.YearBuilt == 1872) | (df12.YearBuilt == 1892))]

# One Hot Encoding

In [308]:
X = df12.drop('SalePrice',axis = 'columns')
y = df12['SalePrice']

In [309]:
columns = ['Neighborhood','YearBuilt','HouseStyle']
X_ohe = pd.get_dummies(X, columns = columns, drop_first = True)
X_num = X.drop(columns, axis = 'columns')
sts = StandardScaler()
X_num = pd.DataFrame(sts.fit_transform(X_num), index = X.index)
X_new = pd.concat([X_ohe, X_num], axis = 1)
X_new.shape

(988, 145)

In [328]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn import svm
#num_feat = ['LotFrontage', 'LotArea','TotalBsmtSF','BedroomAbvGr','Bathrooms']
#num_trans = StandardScaler()

#cat_feat = ['Neighborhood','HouseStyle','YearBuilt']
#cat_trans = OneHotEncoder(handle_unknown='ignore')
#prepro = ColumnTransformer(transformers=[('num', num_trans, num_feat),('cat', cat_trans, cat_feat)])
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2,
                                                    random_state=0)
#Linear Regression
lr_rg = LinearRegression()
lr_rg.fit(X_train, y_train)
lr_score = lr_rg.score(X_test, y_test)

#Lasso
lasso = Lasso()
lasso.fit(X_train, y_train)
las_score = lasso.score(X_test, y_test)

#Decision Tree
dt_rg = DecisionTreeRegressor()
dt_rg.fit(X_train, y_train)
dt_score = dt_rg.score(X_test, y_test)

#Random Forest
rf_rg = RandomForestRegressor(n_estimators=500, random_state=0)
rf_rg.fit(X_train, y_train)
rf_score = rf_rg.score(X_test, y_test)

#Support Vector Machine
svm_rg = svm.SVR()
svm_rg.fit(X_train,y_train)
svm_score = svm_rg.score(X_test,y_test)

#Logistic Regression
log_rg = LogisticRegression()
log_rg.fit(X_train, y_train)
log_score = log_rg.score(X_train, y_train)

  positive)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [329]:
Model = ['Linear Regression', 'Lasso','Decision Tree','Random Forest','SVR', 'Logistic Regression']
Score = [lr_score, las_score, dt_score, rf_score, svm_score, log_score]
final_score = pd.DataFrame({'Model': Model,'Score': Score})
final_score

Unnamed: 0,Model,Score
0,Linear Regression,0.76234
1,Lasso,0.763627
2,Decision Tree,0.62549
3,Random Forest,0.799475
4,SVR,-0.069806
5,Logistic Regression,0.017722


In [200]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
def best_model(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [208]:
df_test = pd.read_csv('HPP(test).csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [212]:
dft1 = df_test[['LotFrontage','LotArea','Neighborhood','HouseStyle','YearBuilt','TotalBsmtSF','BedroomAbvGr',
        'FullBath','HalfBath']]
dft1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotFrontage   1232 non-null   float64
 1   LotArea       1459 non-null   int64  
 2   Neighborhood  1459 non-null   object 
 3   HouseStyle    1459 non-null   object 
 4   YearBuilt     1459 non-null   int64  
 5   TotalBsmtSF   1458 non-null   float64
 6   BedroomAbvGr  1459 non-null   int64  
 7   FullBath      1459 non-null   int64  
 8   HalfBath      1459 non-null   int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 102.7+ KB


In [213]:
dft1.isna().sum()

LotFrontage     227
LotArea           0
Neighborhood      0
HouseStyle        0
YearBuilt         0
TotalBsmtSF       1
BedroomAbvGr      0
FullBath          0
HalfBath          0
dtype: int64

In [247]:
dft1['LotFrontage']=dft1['LotFrontage'].fillna(dft1['LotFrontage'].mean())
dft1['TotalBsmtSF']=dft1['TotalBsmtSF'].fillna(dft1['TotalBsmtSF'].mean())
dft1['Bathrooms'] = dft1['FullBath'] + dft1['HalfBath']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [249]:
dft2 = dft1.drop(['FullBath','HalfBath'], axis = 'columns')

In [339]:
dft2.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,YearBuilt,TotalBsmtSF,BedroomAbvGr,Bathrooms
0,80.0,11622,NAmes,1Story,1961,882.0,2,1
1,81.0,14267,NAmes,1Story,1958,1329.0,3,2
2,74.0,13830,Gilbert,2Story,1997,928.0,3,3
3,78.0,9978,Gilbert,2Story,1998,926.0,3,3
4,43.0,5005,StoneBr,1Story,1992,1280.0,2,2


In [338]:
columns = ['Neighborhood','YearBuilt','HouseStyle']
test_ohe = pd.get_dummies(dft2, columns = columns, drop_first = True)
test_num = dft2.drop(columns, axis = 'columns')
sts = StandardScaler()
test_num = pd.DataFrame(sts.fit_transform(test_num), index = dft2.index)
test_new = pd.concat([test_ohe, test_num], axis = 1)
test_new.head()

Unnamed: 0,LotFrontage,LotArea,TotalBsmtSF,BedroomAbvGr,Bathrooms,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,...,HouseStyle_1Story,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,0,1,2,3,4
0,80.0,11622,882.0,2,1,0,0,0,0,0,...,1,0,0,0,0,0.555587,0.363929,-0.368484,-1.029543,-1.166305
1,81.0,14267,1329.0,3,2,0,0,0,0,0,...,1,0,0,0,0,0.604239,0.897861,0.639542,0.175997,0.063203
2,74.0,13830,928.0,3,3,0,0,0,0,0,...,0,0,1,0,0,0.263676,0.809646,-0.26475,0.175997,1.292711
3,78.0,9978,926.0,3,3,0,0,0,0,0,...,0,0,1,0,0,0.458284,0.032064,-0.26926,0.175997,1.292711
4,43.0,5005,1280.0,2,2,0,0,0,0,0,...,1,0,0,0,0,-1.244533,-0.971808,0.529042,-1.029543,0.063203


In [330]:
y_pred = rf_rg.predict(test_new)

In [337]:
y_test = pd.read_csv('sample_submission.csv')
y_test.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [340]:
final_data = {'Id': y_test['Id'] , 'SalePrice': y_pred}
sub = pd.DataFrame(data=final_data)
sub.to_csv('subfile1.csv',index =False)