In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
# Read the training CSV into a DataFrame; the cell's last expression displays
# its (rows, columns) shape.
trainFilePath = '/kaggle/input/home-data-for-ml-course/train.csv'
train_data = pd.read_csv(filepath_or_buffer=trainFilePath)

train_data.shape

(1460, 81)

In [3]:
from sklearn.model_selection import train_test_split
# Target to predict.
y = train_data.SalePrice

# Columns removed from the feature set because of possible data-leak problems
# (the list also contains the target column itself and the Id column).
excluded_columns = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice', 'Id']
X = train_data.drop(columns=excluded_columns)

# 80/20 train/validation split. The seed is fixed so that the split -- and
# therefore the validation score computed from it -- is identical on every
# "Restart & Run All"; without it each run reports a different number.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Numeric features: columns whose dtype is int64 or float64.
num_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Categorical features: object-dtype columns with fewer than 15 distinct values.
cat_cols = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype == 'object' and X_train_full[name].nunique() < 15
]

# Restrict both splits to the selected columns (numeric first, then categorical).
total_cols = [*num_cols, *cat_cols]
X_train = X_train_full.loc[:, total_cols].copy()
X_valid = X_valid_full.loc[:, total_cols].copy()

# Numeric columns: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' is passed to the encoder.
categorical_transformer = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessing = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [5]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# XGBoost regressor chained after the preprocessing step.
model = XGBRegressor(learning_rate=0.05, n_estimators=200)
final_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model),
])

# Fit on the training split, predict the validation split, and report the
# mean absolute error between the validation targets and the predictions.
prediction = final_pipeline.fit(X_train, y_train).predict(X_valid)
score = mean_absolute_error(y_true=y_valid, y_pred=prediction)
print(score)

17425.83850599315


In [6]:
# Refit the same model configuration on every labelled row (X, y) so the
# pipeline used for the test-set predictions sees all available training data.
from sklearn.base import clone

X_final = X[total_cols].copy()
final_model = XGBRegressor(n_estimators=200, learning_rate=0.05)

# Pipeline fits its steps in place rather than copying them. Passing the
# `preprocessing` object directly would therefore refit the very transformer
# that is already embedded in the fitted `final_pipeline`, leaving that
# pipeline with a preprocessor that no longer matches its trained model.
# clone() returns an unfitted copy with the same parameters, keeping the two
# pipelines independent.
final_pipeline2 = Pipeline([
    ('preprocessing', clone(preprocessing)),
    ('final_model', final_model),
])

final_pipeline2.fit(X_final, y)

In [7]:
# Load the test CSV and keep the same feature columns that were used for training.
test_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')
test_X = test_data.loc[:, total_cols]

# Predict with the pipeline that was fitted on the full training data.
test_preds = final_pipeline2.predict(test_X)

# Write each Id with its predicted SalePrice to submission.csv, without the index.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)