In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Google Drive share link for the training CSV. The file id is the
# second-to-last path segment of the link.
url = "https://drive.google.com/file/d/1mnHCEXUJz5pxyHi41f0OMEFlKvt1sDGn/view?usp=sharing"

# Turn the share link into a direct-download URL that pandas can read.
path = 'https://drive.google.com/uc?export=download&id=' + url.rsplit('/', 2)[-2]

training_data = pd.read_csv(path)

In [None]:
# Drop the row identifier so it is not used as a feature.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once "Id" is gone.
training_data = training_data.drop(columns=["Id"], errors="ignore")

In [None]:

# Target vector and feature matrix.
y = training_data['SalePrice']
X = training_data.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Partition the training features by dtype; only the column labels of these
# two frames are used below, to route columns to the matching sub-pipeline.
X_num = X_train.select_dtypes(include="number").copy()
X_cat = X_train.select_dtypes(exclude="number").copy()

# Numeric columns: fill missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical columns: replace missing values with the literal "N_A", then
# one-hot encode. handle_unknown="ignore" lets transform accept categories
# that were not present when the encoder was fitted.
categoric_pipe = make_pipeline(
    SimpleImputer(fill_value="N_A", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [None]:
from sklearn.linear_model import LinearRegression

# End-to-end model: the preprocessing ColumnTransformer followed by a
# linear regression.
full_pipeline = make_pipeline(preprocessor,
                              LinearRegression())

# Only one hyperparameter is searched: how numeric NaNs are imputed.
# The key is built from make_pipeline's auto-generated step names
# (lower-cased class names) plus the "num_pipe" transformer name.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"]
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

# Predict once on the held-out split and reuse the result for every metric.
y_pred = search.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but
# MAPE (divides by y_true) and R2 (variance of y_true) are not, so the
# argument order matters for those two.
mse = mean_squared_error(y_test, y_pred)

print(
        f"""
        MSE: {mse}
        RMSE: {mse**0.5}
        MAE: {mean_absolute_error(y_test, y_pred)}
        MAPE: {mean_absolute_percentage_error(y_test, y_pred)}
        R2 Score: {r2_score(y_test, y_pred)}
        """
    )

Fitting 5 folds for each of 2 candidates, totalling 10 fits

        MSE: 667488112.1581452
        RMSE: 25835.79130118033
        MAE: 16793.382699581376
        MAPE: 0.09595493449984398
        R2 Score: 0.8743812381734002
        


In [None]:
# Empty frame with the two columns of the submission file; rows are added
# by later cells.
submission = pd.DataFrame(data=None, columns=["Id", "SalePrice"])

In [None]:
# Google Drive share link for the test CSV. The file id is the
# second-to-last path segment of the link.
url = "https://drive.google.com/file/d/1fOdT7-Nev6wWiOfynV3dy7RDR4rOSE69/view?usp=sharing"

# Turn the share link into a direct-download URL that pandas can read.
path = 'https://drive.google.com/uc?export=download&id=' + url.rsplit('/', 2)[-2]

test_data = pd.read_csv(path)

In [None]:
# Preview the freshly loaded test set (the Id column is still present here).
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [None]:
# Carry the test-set identifiers over to the submission frame. This must run
# before the Id column is dropped from test_data.
submission["Id"] = test_data["Id"]

In [None]:
# Check the submission frame: Id is filled in, SalePrice has no values yet.
submission

Unnamed: 0,Id,SalePrice
0,1461,
1,1462,
2,1463,
3,1464,
4,1465,
...,...,...
1454,2915,
1455,2916,
1456,2917,
1457,2918,


In [None]:
# Drop the identifier so test_data has the same feature columns the pipeline
# was fitted on. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it no longer raises
# KeyError once "Id" is gone.
test_data = test_data.drop(columns=["Id"], errors="ignore")

In [None]:
# Re-inspect the test set after the Id column has been dropped.
test_data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [None]:
# Predict sale prices for the test set with the fitted grid search
# (GridSearchCV refits the best candidate by default and predicts with it).
pred = search.predict(test_data)

In [None]:
# Attach the predictions. `pred` is assigned positionally, so it must be in
# the same row order as the test_data rows that supplied Id.
submission['SalePrice'] = pred

In [None]:
# Final look at the submission frame with predictions attached.
submission

Unnamed: 0,Id,SalePrice
0,1461,105656.982390
1,1462,145866.363030
2,1463,183034.879686
3,1464,194712.313893
4,1465,202062.580467
...,...,...
1454,2915,87139.793937
1455,2916,76064.352027
1456,2917,174343.800504
1457,2918,120852.583041


In [None]:
# Disabled alternative: save the submission to disk without triggering a Colab download.
# submission.to_csv("submission.csv", index=False)

In [None]:
# Write the CSV first so the file exists even when no browser download is possible.
SUBMISSION_FILE = 'submission_init.csv'
submission.to_csv(SUBMISSION_FILE, index=False)

try:
    # google.colab is only importable inside a Colab runtime.
    from google.colab import files
except ImportError:
    # Outside Colab, leave the file on disk instead of failing the cell.
    print(f"Not running in Colab; submission saved to {SUBMISSION_FILE}")
else:
    files.download(SUBMISSION_FILE)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>