# Importing **Libraries** **bold text**

In [None]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_regression
from sklearn import set_config

set_config(transform_output='pandas')

In [None]:
#reading data
# reading csv file
url   =  'https://drive.google.com/file/d/1dvoTDXAKmlgXVZwl1YLdsEf9mRpWdV6j/view?usp=drive_link'# itr_7_regression
path  = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
house =  pd.read_csv(path)

In [None]:
house.info()

In [None]:
#creation of X and y
house.drop('Id',axis=1,inplace =True)
X = house
y = house.pop('SalePrice')

In [None]:
#splitting the data
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123)

In [None]:
X.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# distinguish between categorical and numerical features
X_cat_columns = X.select_dtypes(exclude="number").copy()
X_num_columns = X.select_dtypes(include="number").copy()

In [None]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
KBest = SelectKBest(score_func=f_regression)

In [None]:
ordered_categories_column_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                   'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']
# 'OverallQual','OverallCond'

ExterQual_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
ExterCond_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtQual_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtCond_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtExposure_cat = ['NA', 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1_cat = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
BsmtFinType2_cat = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
KitchenQual_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
FireplaceQu_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
GarageFinish_cat = ['NA', 'Unf', 'RFn', 'Fin']
GarageQual_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
GarageCond_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
PoolQC_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
Fence_cat = ['Na', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

ordinal_cats_list = [ExterQual_cat, ExterCond_cat, BsmtQual_cat, BsmtCond_cat, BsmtExposure_cat, BsmtFinType1_cat,
                     BsmtFinType2_cat, HeatingQC_cat, KitchenQual_cat, FireplaceQu_cat, GarageFinish_cat, GarageQual_cat, GarageCond_cat, PoolQC_cat, Fence_cat]

In [None]:
X_cat_columns = X.select_dtypes(exclude='number').copy()

# variables for the three pipes
X_num_columns = X.select_dtypes(include='number').copy()
X_cat_ordered_columns = X_cat_columns[ordered_categories_column_names]
X_cat_unordered_columns = X_cat_columns.drop(
    ordered_categories_column_names, axis=1)


# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(
    SimpleImputer(),
)
numeric_pipe

# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
unordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)
unordered_categoric_pipe
# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
ordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OrdinalEncoder(categories=ordinal_cats_list,
                   handle_unknown='use_encoded_value', unknown_value=-1)
)
ordered_categoric_pipe

# Create a ColumnTransformer to apply the transformers to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, X_num_columns.columns),
        ('unordered', unordered_categoric_pipe, X_cat_unordered_columns.columns),
        ('ordered', ordered_categoric_pipe, X_cat_ordered_columns.columns)
    ])


# Model and their *Scores* **bold text**

In [None]:
scores = pd.DataFrame(
    {"model": ['dtr'], "R^2 Score Train": [""], "R^2 Score Test": [""]})

scores

Unnamed: 0,model,R^2 Score Train,R^2 Score Test
0,dtr,,


1.Modelling Linear Regression
# **New Section**
```
# This is formatted as code
```



In [None]:
#always first creat pipeline
full_pipeline = make_pipeline(preprocessor,LinearRegression())

In [None]:
#fit the pipeline to train data

full_pipeline.fit(X_train,y_train)

In [None]:
# Make predictions
y_train_pred = full_pipeline.predict(X_train)
y_test_pred = full_pipeline.predict(X_test)


In [None]:
# Evaluate the model using RMSE
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

In [None]:
print(f"Improved Model Train RMSE: {train_rmse:.2f}")
print(f"Improved Model Test RMSE: {test_rmse:.2f}")

Improved Model Train RMSE: 22000.88
Improved Model Test RMSE: 27183.17


In [None]:
print(f"Improved Model Train RMSE: {train_rmse:.2f}")
print(f"Improved Model Test RMSE: {test_rmse:.2f}")

Improved Model Train RMSE: 22000.88
Improved Model Test RMSE: 27183.17


In [None]:
# Evaluate the model using score
train_r2_m1 = r2_score(y_train, y_train_pred)
test_r2_m1 = r2_score(y_test, y_test_pred)

print(f"Improved Model Train R^2  Score: {train_r2_m1:.2f}")
print(f"Improved Model Test R^2 Score: {test_r2_m1:.2f}")

Improved Model Train R^2  Score: 0.92
Improved Model Test R^2 Score: 0.88


In [None]:
#checking the score for Linear regression
scores = pd.concat([scores,pd.DataFrame(
    {'model':['1'],'R^2 Score Train':[train_r2_m1],'R^2 Score Test':[test_r2_m1]})],axis=0)
scores


Unnamed: 0,model,R^2 Score Train,R^2 Score Test,R^2 score Train,R^2 score Test
0,dtr,,,,
0,1,,,0.923634,0.880425
0,1,0.923634,0.880425,,


# **2.Model with DecissionTreeRegressor**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

In [None]:
#creating pipeline
full_pipeline_dtR = make_pipeline(preprocessor,DecisionTreeRegressor())

In [None]:
# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_dtR = {
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median'],
    'decisiontreeregressor__max_depth': range(3, 10),
    'decisiontreeregressor__min_samples_leaf': range(3, 10, 2),
    'decisiontreeregressor__min_samples_split': range(3, 39, 3),

}

In [None]:
search_dtR= RandomizedSearchCV(full_pipeline_dtR,param_grid_dtR,
                               n_iter=2, cv=7, scoring='r2', random_state=123, verbose=0)

In [None]:
#fit the data
search_dtR.fit(X_train,y_train)

In [None]:
y_train_pred_dtR = search_dtR.predict(X_train)
y_test_pred_dtR = search_dtR.predict(X_test)

In [None]:
train_rmse = mean_squared_error(y_train,y_train_pred_dtR,squared=False)
test_rmse = mean_squared_error (y_test,y_test_pred_dtR,squared =False)

In [None]:
print (f'decisionTreeRegressor Train RMSE: {train_rmse:.2f}')
print (f'decisionTreeRegressor Test RMSE: {test_rmse:.2f}')


decisionTreeRegressor Train RMSE: 30118.25
decisionTreeRegressor Test RMSE: 43289.78


In [None]:
print(f'The best parameters are {search_dtR.best_params_}')

The best parameters are {'decisiontreeregressor__min_samples_split': 21, 'decisiontreeregressor__min_samples_leaf': 7, 'decisiontreeregressor__max_depth': 6, 'columntransformer__num__simpleimputer__strategy': 'median'}


In [None]:
# Evaluate the model using R^2 score
train_r2_m2 = r2_score(y_train, y_train_pred_dtR)
test_r2_m2 = r2_score(y_test, y_test_pred_dtR)

print(f"Train R^2 Score: {train_r2_m2:.2f}")
print(f"Model Test R^2 Score: {test_r2_m2:.2f}")

Train R^2 Score: 0.86
Model Test R^2 Score: 0.70


In [None]:
scores = pd.concat([scores, pd.DataFrame(
    {"model": ['2'], "R^2 Score Train":[train_r2_m2], "R^2 Score Test": [test_r2_m2]})], axis=0)
scores

In [None]:
scores = pd.concat([scores,pd.DataFrame(
    {'model':['2'],'R^2 Score Train':[train_r2_m2],'R^2 Score Test':[test_r2_m2]})],axis=0)
scores


Unnamed: 0,model,R^2 Score Train,R^2 Score Test,R^2 score Train,R^2 score Test
0,dtr,,,,
0,1,,,0.923634,0.880425
0,1,0.923634,0.880425,,
0,2,0.856886,0.696744,,


3.Decisiontree with RFE
# **New Section**
```
# This is formatted as code
```



In [None]:
from sklearn.feature_selection import RFE

# initialize the pipe
full_pipeline_dtR = make_pipeline(
    preprocessor, RFE(estimator=DecisionTreeRegressor()))

# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_dtR_RFE = {
    # 'rfe__feature':column_names,
    # 'rfe__importance':feature_importances,
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median'],
    'rfe__estimator__min_samples_leaf': range(2),
    'rfe__estimator__max_leaf_nodes': range(1, 40),
    'rfe__estimator__max_features': range(1, 79)
}

# define cross validation
#search_dt = GridSearchCV(full_pipeline_dt,param_grid_dt,cv=5,verbose=0)
search_dtR = RandomizedSearchCV(full_pipeline_dtR, param_grid_dtR_RFE,
                                n_iter=10, cv=7, scoring='r2', random_state=123, verbose=0)

# fit
search_dtR.fit(X_train, y_train)
y_train_pred_dtR = search_dtR.predict(X_train)
y_test_pred_dtR = search_dtR.predict(X_test)

train_rmse = mean_squared_error(y_train, y_train_pred_dtR, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred_dtR, squared=False)

print(f"decisionTreeRegressor Train RMSE: {train_rmse:.2f}")
print(f"decisionTreeRegressor Test RMSE: {test_rmse:.2f}")

print(f"The best parameters are {search_dtR.best_params_}")

# Evaluate the model using R^2 score
train_r2_m3 = r2_score(y_train, y_train_pred_dtR)
test_r2_m3 = r2_score(y_test, y_test_pred_dtR)

print(f"Train R^2 Score: {train_r2_m3:.2f}")
print(f"Model Test R^2 Score: {test_r2_m3:.2f}")


42 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
42 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_selection/_rfe.py", line 251, in fit
    return self._fit(X, y, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_selection/_rfe.py", line 299, in _fit
    estimator.fit(X[:, feat

decisionTreeRegressor Train RMSE: 37232.15
decisionTreeRegressor Test RMSE: 44356.19
The best parameters are {'rfe__estimator__min_samples_leaf': 1, 'rfe__estimator__max_leaf_nodes': 18, 'rfe__estimator__max_features': 48, 'columntransformer__num__simpleimputer__strategy': 'median'}
Train R^2 Score: 0.78
Model Test R^2 Score: 0.68




In [None]:
scores = pd.concat([scores, pd.DataFrame(
    {"model": ['3'], "R^2 Score Train":[train_r2_m3], "R^2 Score Test": [test_r2_m3]})], axis=0)
scores


Unnamed: 0,model,R^2 Score Train,R^2 Score Test,R^2 score Train,R^2 score Test
0,dtr,,,,
0,1,,,0.923634,0.880425
0,1,0.923634,0.880425,,
0,2,0.856886,0.696744,,
0,8,0.786968,0.663899,,
0,8,0.781295,0.681619,,
0,3,0.781295,0.681619,,


4.Modelling with randomforest with variance threshold
# **New Section**



In [None]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.01)

# initialize the pipe
full_pipeline_ran = make_pipeline(
    preprocessor, StandardScaler(), KBest, selector, RandomForestRegressor(random_state=123))

# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_ran = {
    "columntransformer__num__simpleimputer__strategy": ["mean", "median"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "randomforestregressor__n_estimators": [100, 200, 300],
    "randomforestregressor__max_depth": [None, 5, 11],
    "randomforestregressor__min_samples_split": [2, 5, 10],
    'selectkbest__k': range(20, 65)
}

# define cross validation
#search_dt = GridSearchCV(full_pipeline_dt,param_grid_dt,cv=5,verbose=0)
search_ran = RandomizedSearchCV(full_pipeline_ran, param_grid_ran,
                                n_iter=2, cv=7, scoring='r2', random_state=123, verbose=0)

# fit
search_ran.fit(X_train, y_train)
y_train_pred_ran = search_ran.predict(X_train)
y_test_pred_ran = search_ran.predict(X_test)

train_rmse = mean_squared_error(y_train, y_train_pred_ran, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred_ran, squared=False)

print(f"decisionTreeRegressor Train RMSE: {train_rmse:.2f}")
print(f"decisionTreeRegressor Test RMSE: {test_rmse:.2f}")

print(f"The best parameters are {search_ran.best_params_}")

# Evaluate the model using R^2 score
train_r2_m4 = r2_score(y_train, y_train_pred_ran)
test_r2_m4 = r2_score(y_test, y_test_pred_ran)

print(f"Train R^2 Score: {train_r2_m4:.2f}")
print(f"Model Test R^2 Score: {test_r2_m4:.2f}")

decisionTreeRegressor Train RMSE: 12087.20
decisionTreeRegressor Test RMSE: 26608.85
The best parameters are {'standardscaler__with_std': True, 'standardscaler__with_mean': False, 'selectkbest__k': 60, 'randomforestregressor__n_estimators': 200, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__max_depth': 11, 'columntransformer__num__simpleimputer__strategy': 'mean'}
Train R^2 Score: 0.98
Model Test R^2 Score: 0.89


In [None]:
scores = pd.concat([scores, pd.DataFrame(
    {"model": ['4'], "R^2 Score Train":[train_r2_m4], "R^2 Score Test": [test_r2_m4]})], axis=0)
scores


Unnamed: 0,model,R^2 Score Train,R^2 Score Test,R^2 score Train,R^2 score Test
0,dtr,,,,
0,1,,,0.923634,0.880425
0,1,0.923634,0.880425,,
0,2,0.856886,0.696744,,
0,8,0.786968,0.663899,,
0,8,0.781295,0.681619,,
0,3,0.781295,0.681619,,
0,4,0.97695,0.885425,,


# ** 5 DecisionTreeRegressor with Kbest**

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
KBest = SelectKBest(score_func=f_regression)

In [None]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

In [None]:
#intialize the pipeline
#from sklearn import preprocessor
full_pipeline_dtR = make_pipeline(
    preprocessor, KBest, DecisionTreeRegressor())

In [None]:
param_grid_dtR = {
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median'],
    'decisiontreeregressor__max_depth': range(3, 10),
    'decisiontreeregressor__min_samples_leaf': range(3, 10, 2),
    'decisiontreeregressor__min_samples_split': range(3, 39, 3),
    'selectkbest__k': range(20, 65)

}


In [None]:
# define cross validation
search_dtR = RandomizedSearchCV(full_pipeline_dtR, param_grid_dtR,
                                n_iter=2, cv=7, scoring='r2', random_state=123, verbose=0)

In [None]:
# fit
search_dtR.fit(X_train, y_train)
y_train_pred_dtR = search_dtR.predict(X_train)
y_test_pred_dtR = search_dtR.predict(X_test)


In [None]:
train_rmse = mean_squared_error(y_train, y_train_pred_dtR, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred_dtR, squared=False)

In [None]:
print(f"decisionTreeRegressor Train RMSE: {train_rmse:.2f}")
print(f"decisionTreeRegressor Test RMSE: {test_rmse:.2f}")

print(f"The best parameters are {search_dtR.best_params_}")

In [None]:
# Evaluate the model using R^2 score
train_r2_m5 = r2_score(y_train, y_train_pred_dtR)
test_r2_m5 = r2_score(y_test, y_test_pred_dtR)

print(f"Train R^2 Score: {train_r2_m5:.2f}")
print(f"Model Test R^2 Score: {test_r2_m5:.2f}")


In [None]:
scores = pd.concat([scores, pd.DataFrame(
    {"model": ['5'], "R^2 Score Train":[train_r2_m5], "R^2 Score Test": [test_r2_m5]})], axis=0)
scores

# **6.KNeighborsRegressor**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
neigh = KNeighborsRegressor(n_neighbors=1)


# initialize the pipe
full_pipeline_neigh = make_pipeline(
    preprocessor, neigh)
full_pipeline_neigh

# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_neigh = {
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median'],
}

# define cross validation
#search_dt = GridSearchCV(full_pipeline_dt,param_grid_dt,cv=5,verbose=0)
search_neigh = RandomizedSearchCV(full_pipeline_neigh, param_grid_neigh,
                                  n_iter=2, cv=7, scoring='neg_mean_absolute_percentage_error', random_state=123, verbose=0)

# fit
search_neigh.fit(X_train, y_train)
y_train_pred_neigh = search_dtR.predict(X_train)
y_test_pred_neigh = search_dtR.predict(X_test)

train_rmse = mean_squared_error(y_train, y_train_pred_neigh, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred_neigh, squared=False)

print(f"KNeighborsRegressor Train RMSE: {train_rmse:.2f}")
print(f"KNeighborsRegressor Test RMSE: {test_rmse:.2f}")

print(f"The best parameters are {search_neigh.best_params_}")

# Evaluate the model using R^2 score
train_r2_m6 = r2_score(y_train, y_train_pred_neigh)
test_r2_m6 = r2_score(y_test, y_test_pred_neigh)

print(f"Train R^2 Score: {train_r2_m6:.2f}")
print(f"Model Test R^2 Score: {test_r2_m6:.2f}")



In [None]:
scores = pd.concat([scores, pd.DataFrame(
    {"model": ['4'], "R^2 Score Train":[train_r2_m4], "R^2 Score Test": [test_r2_m4]})], axis=0)
scores

# Adding Training *data* **bold text**

In [None]:
# Kaggle training data
url = "https://drive.google.com/file/d/1iVBv5R6U53mofNpI9EkpFUQfwhYBk9MZ/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
train = pd.read_csv(path)

In [None]:
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
X = train.drop(columns=['Id'])
y=X.pop('SalePrice')

In [None]:
full_pipeline.fit(X_train, y_train)

In [None]:
train_pred= full_pipeline.predict(train)

# Test for kaggle** **bold text**

In [None]:
# import Kaggle test data
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
competition_test = pd.read_csv(path)


In [None]:
id_column = competition_test.pop("Id")

In [None]:
X = competition_test.drop(columns=['Id'])

In [None]:
id_column = competition_test.pop("Id")

In [None]:
test_pred= full_pipeline.predict(competition_test)

In [None]:
competition_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


# **Submission file**

In [None]:
submission_file= pd.DataFrame({'Id':id_column,'SalePrice':test_pred})

In [None]:
submission_file

Unnamed: 0,Id,SalePrice
0,1461,149900.0
1,1462,145000.0
2,1463,190000.0
3,1464,178000.0
4,1465,245500.0
...,...,...
1454,2915,75000.0
1455,2916,84900.0
1456,2917,160000.0
1457,2918,100000.0


In [None]:
from google.colab import files
submission_file.to_csv('filename.csv',index=False)
files.download('filename.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>