In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:

from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the training and test tables from the Kaggle input directory.
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
)

# Preview the first ten training rows.
train.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
# Keep only the training rows whose SalePrice is present.
train = train[train["SalePrice"].notna()]

# y holds the target: the actual sale prices.
y = train["SalePrice"]

# X holds the predictors: every column except the target and the Id column.
X = train.drop(columns=["SalePrice", "Id"])

In [4]:
# The test table has no sale price to remove, so only the Id column is dropped.
X_test = test.drop(columns=["Id"])

In [5]:
# Split the feature columns by dtype family rather than by exact dtype name.
# A column listed in neither group is dropped by the ColumnTransformer (its
# `remainder` defaults to "drop"), so matching only "object"/"int64"/"float64"
# would silently lose columns stored with any other dtype (e.g. int32, bool,
# category, or pandas' dedicated string dtype).
catcols = X.select_dtypes(exclude="number").columns.tolist()  # Text (non-numeric) columns
numcols = X.select_dtypes(include="number").columns.tolist()  # Number columns

In [6]:
# Numeric columns: fill each missing value with the mean of its column.
numerical_transformer = SimpleImputer(
    strategy="mean",
)

In [7]:
# Text columns go through two stages:
#   1. "imputer" - fill missing entries with the column's most frequent value
#   2. "onehot"  - turn each category into indicator columns; categories not
#                  seen during fitting are ignored instead of raising an error
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [8]:
# Route each group of columns to its own transformer:
# number columns to the numerical imputer, text columns to the categorical pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numcols),
        ("cat", categorical_transformer, catcols),
    ]
)

In [9]:
# Plain linear-regression estimator; it is created here but is not the one
# wired into the pipeline below.
model = LinearRegression()

# SalePrice is a continuous target, so the boosted model must be a regressor.
# A classifier would treat every distinct price as its own class and could
# only ever predict prices that appear verbatim in the training data.
# The variable keeps the name `clf` so existing references to it still work.
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Create a pipeline that first preprocesses the data, then trains the model
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", clf)])

In [10]:
# Fit the preprocessing + model pipeline on the training data, then use the
# fitted pipeline to predict a sale price for every row of the test features.
ypred = pipeline.fit(X, y).predict(X_test)

In [11]:
# Pair each test Id with its predicted price, then write the table to a CSV
# file without the index column.
out = test[["Id"]].assign(SalePrice=ypred)
out.to_csv("house_price_predictions.csv", index=False)