In [None]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning category for the rest of the session so cell output stays compact.
warnings.simplefilter("ignore")

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_files:
    print(file_path)

In [None]:
# Read the training and test tables; both are parsed with pandas' default CSV settings.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Display (rows, columns) of the training table right after loading.
train.shape

In [None]:
# Keep only the training rows that have a value in the target column.
Target = 'SalePrice'
train = train.dropna(subset = [Target])

In [None]:
# Combine the training features and the test set so every preprocessing step
# is applied consistently to both.
# The target is removed by name rather than by position, so the stacked frame
# stays correct even if the target is not the last column of `train`.
data = pd.concat([train.drop(columns = [Target]), test], axis = 0)

print('train df has {} rows and {} features'.format(train.shape[0], train.shape[1]))
print('test df has {} rows and {} features'.format(test.shape[0], test.shape[1]))
print('Combined df has {} rows and {} features'.format(data.shape[0], data.shape[1]))

In [None]:
# Remove the 'Id' column from the combined frame; it is not used as a model feature.
data = data.drop(columns = 'Id')

In [None]:
# Looking for Missing Values
def missingValuesInfo(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, ordered by null count
        (largest first), with two columns: ``Total`` (number of nulls) and
        ``Percent`` (nulls as a percentage of ``len(df)``, rounded to 2 places).
    """
    null_counts = df.isnull().sum().sort_values(ascending = False)
    null_share = (null_counts / len(df) * 100).round(2)
    summary = pd.concat([null_counts, null_share], axis = 1, keys = ['Total', 'Percent'])
    has_nulls = summary['Total'] > 0
    return summary[has_nulls]

# Display the missing-value summary for the training table.
missingValuesInfo(df = train)

In [None]:
def dropColumnsWithHighMissingValues(df, test, data, threshold = 80):
    """Drop columns whose share of missing values in ``df`` exceeds ``threshold``.

    The columns to remove are chosen from ``df`` alone. The same columns are
    then removed from ``test`` and ``data`` as well, so that all three frames
    keep a consistent set of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used to measure missingness. It is not modified; a reduced copy
        is returned.
    test : pandas.DataFrame
        Companion frame, modified IN PLACE.
    data : pandas.DataFrame
        Companion frame, modified IN PLACE.
    threshold : float, default 80
        Percentage of missing values (0-100). Columns strictly above it are
        dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the sparsely populated columns.
    """
    missing_percent = (df.isnull().sum() / len(df)) * 100
    columns_to_drop = missing_percent[missing_percent > threshold].index

    # Only `df` is returned, so rebinding local names for the companion frames
    # would be discarded when the function exits. They are mutated in place so
    # the caller's objects really lose the columns. errors='ignore' tolerates
    # a companion frame that does not carry one of the selected columns.
    for companion in (data, test):
        companion.drop(columns = columns_to_drop, inplace = True, errors = 'ignore')

    return df.drop(columns = columns_to_drop)

# Remove sparsely populated columns from the training table, then re-check what is still missing.
train = dropColumnsWithHighMissingValues(df = train, test = test, data = data)
missingValuesInfo(df = train)

In [None]:
# Missing Value Handling
def HandleMissingValues(df):
    """Fill the missing values of ``df`` in place.

    * ``object``-dtype columns: nulls become the literal category ``'UNKNOWN'``.
    * numeric columns (any integer or float dtype, not only ``int64`` and
      ``float64``): nulls become that column's median, computed from ``df``
      itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    fill_values = {}

    # Text columns get an explicit placeholder instead of a guessed category.
    for cname in df.columns:
        if df[cname].dtype == "object":
            fill_values[cname] = 'UNKNOWN'

    # select_dtypes('number') covers every numeric width (int32, float32, ...),
    # so narrower columns are no longer silently left with NaNs.
    for cname in df.select_dtypes(include = 'number').columns:
        fill_values[cname] = df[cname].median()

    df.fillna(value = fill_values, inplace = True)
    
    
# Impute the combined frame in place, then preview its first five rows.
HandleMissingValues(df = data)
data.head(n = 5)

In [None]:
#Categorical Feature Encoding
def getObjectColumnsList(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are returned as a list, in the frame's column order.
    """
    object_columns = []
    for cname in df.columns:
        if df[cname].dtype == "object":
            object_columns.append(cname)
    return object_columns

def PerformOneHotEncoding(df, columnsToEncode):
    """Return a copy of ``df`` with ``columnsToEncode`` expanded into indicator columns.

    Encoding is delegated to ``pandas.get_dummies``; columns not listed in
    ``columnsToEncode`` are passed through unchanged.
    """
    encoded = pd.get_dummies(df, columns = columnsToEncode)
    return encoded

# One-hot encode every object-dtype column of the combined frame.
cat_cols = getObjectColumnsList(df = data)
data = PerformOneHotEncoding(df = data, columnsToEncode = cat_cols)

In [None]:
# Display (rows, columns) of the combined frame after one-hot encoding.
data.shape

In [None]:
# Split the combined frame back into its training and test portions.
# The boundary is taken from len(train) instead of a hard-coded row count, so
# the split stays correct if rows were removed from train before combining.
n_train = len(train)
train_data = data.iloc[:n_train, :]
test_data = data.iloc[n_train:, :]
assert len(test_data) == len(test), "train/test split does not match the test set size"

In [None]:
# Model inputs: the encoded training rows as features and the 'SalePrice' column as target.
X, y = train_data, train['SalePrice']

In [None]:
import xgboost as xgb
from sklearn.pipeline import Pipeline

# Gradient-boosted regressor wrapped in a single-step pipeline.
xgb_params = {'n_estimators': 400, 'max_depth': 2, 'learning_rate': 0.2}
model_xgb = xgb.XGBRegressor(**xgb_params)
pipeline = Pipeline(steps = [('xgboost', model_xgb)])

# Fit on the training rows, then predict for the held-out test rows.
xgb_preds = pipeline.fit(X, y).predict(test_data)

In [None]:
# Alias the XGBoost output under the name used when building the submission.
predictions = xgb_preds

In [None]:
# Assemble the submission table: each test 'Id' paired with its predicted 'SalePrice'.
submission = dict(Id = test['Id'].values, SalePrice = predictions)
solution = pd.DataFrame(data = submission)
solution.head()

In [None]:
# Write the submission table to 'submission.csv'; index=False leaves the row index out of the file.
solution.to_csv('submission.csv', index = False)