### Problem Statement

 *It is your job to predict the sales price for each house.* 
 *For each Id in the test set, you must predict the value of the SalePrice variable* 


### Importing Libraries

In [4]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder,OrdinalEncoder,OneHotEncoder,MinMaxScaler
from sklearn.compose import  ColumnTransformer
from sklearn.impute import  SimpleImputer


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error


from sklearn.feature_selection import SequentialFeatureSelector,RFE

from sklearn.decomposition import PCA

### DATA Gathering

In [5]:
house=pd.read_csv('data/training_set.csv')

In [6]:
house.shape

(1460, 81)

In [7]:
for i in house.columns:
    print( i,'____',house[i].nunique())


Id ____ 1460
MSSubClass ____ 15
MSZoning ____ 5
LotFrontage ____ 110
LotArea ____ 1073
Street ____ 2
Alley ____ 2
LotShape ____ 4
LandContour ____ 4
Utilities ____ 2
LotConfig ____ 5
LandSlope ____ 3
Neighborhood ____ 25
Condition1 ____ 9
Condition2 ____ 8
BldgType ____ 5
HouseStyle ____ 8
OverallQual ____ 10
OverallCond ____ 9
YearBuilt ____ 112
YearRemodAdd ____ 61
RoofStyle ____ 6
RoofMatl ____ 8
Exterior1st ____ 15
Exterior2nd ____ 16
MasVnrType ____ 3
MasVnrArea ____ 327
ExterQual ____ 4
ExterCond ____ 5
Foundation ____ 6
BsmtQual ____ 4
BsmtCond ____ 4
BsmtExposure ____ 4
BsmtFinType1 ____ 6
BsmtFinSF1 ____ 637
BsmtFinType2 ____ 6
BsmtFinSF2 ____ 144
BsmtUnfSF ____ 780
TotalBsmtSF ____ 721
Heating ____ 6
HeatingQC ____ 5
CentralAir ____ 2
Electrical ____ 5
1stFlrSF ____ 753
2ndFlrSF ____ 417
LowQualFinSF ____ 24
GrLivArea ____ 861
BsmtFullBath ____ 4
BsmtHalfBath ____ 3
FullBath ____ 4
HalfBath ____ 3
BedroomAbvGr ____ 8
KitchenAbvGr ____ 4
KitchenQual ____ 4
TotRmsAbvGrd ____ 12

In [8]:
house

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [9]:
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [10]:
df=house.drop(['SalePrice','Alley','PoolQC','Fence','MiscFeature'],axis=1)     
target=house['SalePrice']
df


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,61,0,0,0,0,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,42,0,0,0,0,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,...,40,0,0,0,0,0,8,2007,WD,Normal
1456,1457,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,2,2010,WD,Normal
1457,1458,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,...,60,0,0,0,0,2500,5,2010,WD,Normal
1458,1459,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,...,0,112,0,0,0,0,4,2010,WD,Normal


In [11]:
cat=df.select_dtypes(include='object').columns
con=df.select_dtypes(exclude='object').columns

num_pipe=Pipeline(steps=[('Impute',SimpleImputer(strategy='median')),('scalar',StandardScaler())])
cat_pipe=Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),('Encode',OrdinalEncoder())])

pre=ColumnTransformer([('num_pipe',num_pipe,con),('cat_pipe',cat_pipe,cat)])

x1=pd.DataFrame(pre.fit_transform(df),columns=pre.get_feature_names_out())

x_train,x_test,y_train,y_test=train_test_split(x1,target,test_size=0.2,random_state=34)


In [12]:
target.shape

(1460,)

In [13]:
x_train.shape



(1168, 76)

In [14]:
lr=LinearRegression()
lr.fit(x_train, y_train)



In [15]:
def Regression_evaluate(predicted,actual):
    # Calculate the mean squared error
    mse=round(mean_squared_error(predicted,actual),3)
    rmse=round(np.sqrt(mse),3)
    mae=round(mean_absolute_error(predicted,actual),3)
    r2=round(r2_score(predicted,actual),3)

    print('_'*70)

    print('Mean_squared_error:',mse)
    print('Root_Mean_squared_error:',rmse)
    print('Mean_absolute_error:',mae)
    print("R2_score:",r2)
    print('_'*70)

  



In [16]:
print('for BASE MODEL')

tr_pred=lr.predict(x_train)
Regression_evaluate(tr_pred,y_train)

ts_pred=lr.predict(x_test)

Regression_evaluate(ts_pred,y_test)



for BASE MODEL
______________________________________________________________________
Mean_squared_error: 600790002.577
Root_Mean_squared_error: 24511.018
Mean_absolute_error: 16910.276
R2_score: 0.887
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 3493788898.392
Root_Mean_squared_error: 59108.281
Mean_absolute_error: 23738.227
R2_score: 0.617
______________________________________________________________________


using Sfs

In [17]:
sfs=SequentialFeatureSelector(lr,n_features_to_select=40,direction='backward')



In [18]:
x2=pd.DataFrame(sfs.fit_transform(x1,target),columns=sfs.get_feature_names_out())
x2


Unnamed: 0,num_pipe__Id,num_pipe__MSSubClass,num_pipe__LotArea,num_pipe__OverallQual,num_pipe__OverallCond,num_pipe__YearBuilt,num_pipe__MasVnrArea,num_pipe__BsmtFinSF1,num_pipe__BsmtFinSF2,num_pipe__BsmtUnfSF,...,cat_pipe__Exterior1st,cat_pipe__ExterQual,cat_pipe__BsmtQual,cat_pipe__BsmtCond,cat_pipe__BsmtExposure,cat_pipe__HeatingQC,cat_pipe__KitchenQual,cat_pipe__Functional,cat_pipe__FireplaceQu,cat_pipe__GarageCond
0,-1.730865,0.073375,-0.207142,0.651479,-0.517200,1.050994,0.514104,0.575425,-0.288653,-0.944591,...,12.0,2.0,2.0,3.0,3.0,0.0,2.0,6.0,2.0,4.0
1,-1.728492,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.570750,1.171992,-0.288653,-0.641228,...,8.0,3.0,2.0,3.0,1.0,0.0,3.0,6.0,4.0,4.0
2,-1.726120,0.073375,0.073480,0.651479,-0.517200,0.984752,0.325915,0.092907,-0.288653,-0.301643,...,12.0,2.0,2.0,3.0,2.0,0.0,2.0,6.0,4.0,4.0
3,-1.723747,0.309859,-0.096897,0.651479,-0.517200,-1.863632,-0.570750,-0.499274,-0.288653,-0.061670,...,13.0,3.0,3.0,1.0,3.0,2.0,2.0,6.0,2.0,4.0
4,-1.721374,0.073375,0.375148,1.374795,-0.517200,0.951632,1.366489,0.463568,-0.288653,-0.174865,...,12.0,2.0,2.0,3.0,0.0,0.0,2.0,6.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1.721374,0.073375,-0.260560,-0.071836,-0.517200,0.918511,-0.570750,-0.973018,-0.288653,0.873321,...,12.0,3.0,2.0,3.0,3.0,0.0,3.0,6.0,4.0,4.0
1456,1.723747,-0.872563,0.266407,-0.071836,0.381743,0.222975,0.087911,0.759659,0.722112,0.049262,...,9.0,3.0,2.0,3.0,3.0,4.0,3.0,2.0,4.0,4.0
1457,1.726120,0.309859,-0.147810,0.651479,3.078570,-1.002492,-0.570750,-0.369871,-0.288653,0.701265,...,5.0,0.0,3.0,1.0,3.0,0.0,2.0,6.0,2.0,4.0
1458,1.728492,-0.872563,-0.080160,-0.795151,0.381743,-0.704406,-0.570750,-0.865548,6.092188,-1.284176,...,8.0,3.0,3.0,3.0,2.0,2.0,2.0,6.0,2.0,4.0


In [19]:
x2.shape

(1460, 40)

In [20]:
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1=train_test_split(x2,target,test_size=0.2,random_state=23)

In [21]:
def Model_fitting(algorithm,x_training,y_training,x_testing,y_testing):
    algorithm.fit(x_training,y_training)

    tr=algorithm.predict(x_training)
    Regression_evaluate(tr,y_training)
    ts=algorithm.predict(x_testing)
    Regression_evaluate(ts,y_testing)



In [22]:
print("")
Model_fitting(lr,x_train1,y_train1,x_test1,y_test1)


______________________________________________________________________
Mean_squared_error: 1102669797.866
Root_Mean_squared_error: 33206.472
Mean_absolute_error: 19479.123
R2_score: 0.8
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 581025016.049
Root_Mean_squared_error: 24104.461
Mean_absolute_error: 18199.77
R2_score: 0.876
______________________________________________________________________


using rfe

In [24]:
rfe=RFE(lr,n_features_to_select=40)

In [25]:
x3=pd.DataFrame(rfe.fit_transform(x1,target),columns=rfe.get_feature_names_out())

In [26]:
x_train_2,x_test_2,y_train_2,y_test_2=train_test_split(x3,target,test_size=0.2,random_state=23)

In [27]:
lr.fit(x_train_2,y_train_2)

In [40]:


print('FOR RFE')
Model_fitting(lr,x_train_2,y_train_2,x_test_2,y_test_2)


FOR RFE
______________________________________________________________________
Mean_squared_error: 1081792328.026
Root_Mean_squared_error: 32890.612
Mean_absolute_error: 19472.454
R2_score: 0.805
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 579838507.446
Root_Mean_squared_error: 24079.836
Mean_absolute_error: 17936.921
R2_score: 0.874
______________________________________________________________________


### PCA

In [29]:
from sklearn.decomposition import PCA

In [30]:
from sklearn.decomposition import PCA
pc=PCA(n_components='mle')
x4=pd.DataFrame(pc.fit_transform(x1),columns=pc.get_feature_names_out())
x4
 

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca64,pca65,pca66,pca67,pca68,pca69,pca70,pca71,pca72,pca73
0,-7.316807,3.206463,-2.968458,0.785920,2.348880,0.921104,0.691384,0.344259,-0.690565,-1.031856,...,0.212552,0.036414,0.079295,-0.038954,-0.134470,-0.029596,0.032604,0.010170,-0.005681,0.000361
1,11.944782,-1.431653,0.205872,-3.172036,0.539613,-0.328487,-1.025816,1.290964,-0.883241,-1.192756,...,0.064780,-0.124759,-0.056012,0.082431,-0.026490,0.013921,-0.066994,-0.151612,-0.018174,-0.007944
2,-7.277346,3.312415,-3.774619,0.513198,2.163005,0.100763,-0.400984,0.146203,-0.794940,-0.935579,...,-0.390656,0.132447,0.236870,0.087930,0.001890,-0.034415,-0.151091,0.027170,-0.000878,0.007353
3,-6.879541,4.344567,1.177893,-0.005712,2.937234,-4.189119,-1.293630,0.946779,3.951299,0.242144,...,-0.342673,-0.897114,-0.386447,-0.414879,0.182284,0.331441,-0.039079,0.214838,-0.022659,-0.010326
4,2.868818,4.837854,-4.517943,0.817442,1.664825,-1.940920,-0.685454,0.281324,0.076739,0.092049,...,-0.017188,-0.061642,0.282148,0.052072,-0.192011,-0.036892,-0.080745,-0.030129,0.013168,0.003606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-4.611236,3.565041,-1.393624,3.241936,0.478281,1.370140,-0.278564,-0.619320,-2.086364,-0.829159,...,0.221506,-0.001241,-0.054705,0.045002,0.012450,0.038723,-0.019461,-0.065319,-0.015453,0.006099
1456,2.028068,-0.760008,-0.859852,-3.672875,-0.208559,-2.234960,2.849303,-0.615660,-1.799650,1.178306,...,-0.196319,0.166861,-0.503994,0.158939,0.875939,0.121803,-0.230878,-0.097673,-0.023773,0.000301
1457,-5.052453,-7.070693,-3.822161,2.499113,-0.272963,-0.599713,0.976929,1.794157,-2.085287,-2.563242,...,0.045792,-0.390656,0.313910,-0.181919,-0.115814,0.003181,-0.029656,0.078992,0.003236,0.000101
1458,-0.173925,-3.478236,2.128160,-3.034987,0.090929,0.130426,0.457973,-0.137118,-1.789039,-1.247199,...,0.255866,0.090769,-0.011111,-0.257659,0.092108,-0.153645,-0.101906,0.119801,-0.009798,-0.017147


In [31]:
x_train_3,x_test_3,y_train_3,y_test_3=train_test_split(x4,target,test_size=0.2,random_state=23)
Model_fitting(lr,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1048854750.935
Root_Mean_squared_error: 32386.027
Mean_absolute_error: 19457.182
R2_score: 0.812
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 633091120.666
Root_Mean_squared_error: 25161.302
Mean_absolute_error: 18767.414
R2_score: 0.864
______________________________________________________________________


### Regularization

In [42]:
from sklearn.linear_model import Lasso,Ridge

In [43]:
la=Lasso()
la.fit(x_train_3,y_train_3)

In [45]:
Model_fitting(la,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1048855265.955
Root_Mean_squared_error: 32386.035
Mean_absolute_error: 19456.052
R2_score: 0.812
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 632963690.904
Root_Mean_squared_error: 25158.77
Mean_absolute_error: 18765.593
R2_score: 0.864
______________________________________________________________________


#### Cross Validation(Grid,Randomized)

In [46]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import numpy as np
grid={
    'alpha':np.arange(0.01,3,0.01)
}
grid

{'alpha': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
        0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
        0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
        0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
        0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
        0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
        0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
        0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
        0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99,
        1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1 ,
        1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2 , 1.21,
        1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3 , 1.31, 1.32,
        1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4 , 1.41, 1.42, 1.43,
        1.44, 1.45, 1.46, 1.4

In [47]:
import warnings
warnings.filterwarnings('ignore')

gs=GridSearchCV(la,param_grid=grid,cv=3)
gs.fit(x_train_3,y_train_3)

In [50]:
gs.best_params_


{'alpha': np.float64(2.9899999999999998)}

In [52]:

la1=gs.best_estimator_
la1

In [53]:
Model_fitting(la1,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1048859354.919
Root_Mean_squared_error: 32386.098
Mean_absolute_error: 19453.864
R2_score: 0.812
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 632687700.396
Root_Mean_squared_error: 25153.284
Mean_absolute_error: 18761.745
R2_score: 0.864
______________________________________________________________________


RandomizedSearchCV

In [54]:
rs=RandomizedSearchCV(la,param_distributions=grid,cv=2)
rs.fit(x_train_3,y_train_3)

In [55]:
la2=rs.best_estimator_
la2

In [56]:
Model_fitting(la2,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1048859324.175
Root_Mean_squared_error: 32386.098
Mean_absolute_error: 19453.874
R2_score: 0.812
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 632689074.035
Root_Mean_squared_error: 25153.311
Mean_absolute_error: 18761.764
R2_score: 0.864
______________________________________________________________________


### Ridge

In [58]:
ra=Ridge()
ra.fit(x_train_3,y_train_3)

In [59]:
Model_fitting(ra,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1048978639.923
Root_Mean_squared_error: 32387.94
Mean_absolute_error: 19456.913
R2_score: 0.811
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 632073443.235
Root_Mean_squared_error: 25141.071
Mean_absolute_error: 18758.5
R2_score: 0.864
______________________________________________________________________


GridSearchCV

In [61]:
gs=GridSearchCV(ra,param_grid=grid,cv=2)
gs.fit(x_train_3,y_train_3)

In [62]:
ra1=gs.best_estimator_
ra1

In [63]:
Model_fitting(ra1,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1049453201.837
Root_Mean_squared_error: 32395.265
Mean_absolute_error: 19461.117
R2_score: 0.811
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 630660194.274
Root_Mean_squared_error: 25112.949
Mean_absolute_error: 18739.264
R2_score: 0.864
______________________________________________________________________


RandomizedSearchCV

In [64]:
rs=RandomizedSearchCV(ra,param_distributions=grid,cv=2)
rs.fit(x_train_3,y_train_3)

In [66]:
ra2=rs.best_estimator_

In [67]:
Model_fitting(ra2,x_train_3,y_train_3,x_test_3,y_test_3)

______________________________________________________________________
Mean_squared_error: 1049445886.969
Root_Mean_squared_error: 32395.152
Mean_absolute_error: 19461.101
R2_score: 0.811
______________________________________________________________________
______________________________________________________________________
Mean_squared_error: 630678955.68
Root_Mean_squared_error: 25113.322
Mean_absolute_error: 18739.56
R2_score: 0.864
______________________________________________________________________


so now we can see that there is not muchh effect on accuarcy after using regulariztion technique so we can continue with LR model with PCA only 

### Prediction on testing unseen dataset

In [33]:
df1=pd.read_csv('data/testing_set.csv')
df1

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [34]:

def Prediction(data,pipe,fs,model):
   
    
    
    # Preprocessing by using pre built pipeline
    df=pd.DataFrame(pipe.transform(data),columns=pipe.get_feature_names_out())

    # Feature selection
    df2=pd.DataFrame(fs.transform(df),columns=fs.get_feature_names_out())

    # Prediction
    pred=model.predict(df2)

    return pred

In [35]:
pred=Prediction(df1,pre,pc,lr)
pred

array([100862.22551152, 152575.49399326, 160251.50295524, ...,
       132494.29056816, 115813.58373932, 244210.83222143], shape=(1459,))

In [36]:
result=df1[['Id']]
result

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [37]:
result['Prediction']=pred


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Prediction']=pred


In [38]:
result

Unnamed: 0,Id,Prediction
0,1461,100862.225512
1,1462,152575.493993
2,1463,160251.502955
3,1464,182896.183015
4,1465,191662.974988
...,...,...
1454,2915,68545.104110
1455,2916,61876.741139
1456,2917,132494.290568
1457,2918,115813.583739


In [110]:
result.to_csv('data/House_Predictions.csv',index=False)