In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv') 
test  = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
train.shape

(1460, 81)

In [4]:
# Dropping rows where the target is missing
Target = 'SalePrice'
train.dropna(axis = 0, subset = [Target], inplace = True)

In [5]:
# Combine Test and Training sets to maintain consistancy.
data = pd.concat([train.iloc[:, :-1],test], axis = 0)

print('train df has {} rows and {} features'.format(train.shape[0], train.shape[1]))
print('test df has {} rows and {} features'.format(test.shape[0], test.shape[1]))
print('Combined df has {} rows and {} features'.format(data.shape[0], data.shape[1]))

train df has 1460 rows and 81 features
test df has 1459 rows and 80 features
Combined df has 2919 rows and 80 features


In [6]:
# Dropping unwanted columns
data = data.drop(columns = ['Id'], axis = 1)

In [7]:
# Looking for Missing Values
def missingValuesInfo(df):
    total = df.isnull().sum().sort_values(ascending = False)
    percent = round(df.isnull().sum().sort_values(ascending = False) / len(df) * 100, 2)
    temp = pd.concat([total, percent], axis = 1,keys = ['Total', 'Percent'])
    return temp.loc[(temp['Total'] > 0)]

missingValuesInfo(train)

Unnamed: 0,Total,Percent
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
MasVnrType,872,59.73
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageYrBlt,81,5.55
GarageCond,81,5.55
GarageType,81,5.55


In [8]:
def dropColumnsWithHighMissingValues(df, test, data, threshold = 80):
    missing_percent = (df.isnull().sum() / len(df)) * 100
    columns_to_drop = missing_percent[missing_percent > threshold].index
    
    # Drop the selected columns
    df = df.drop(columns = columns_to_drop)
    data = data.drop(columns = columns_to_drop)
    test = test.drop(columns = columns_to_drop)
    return df

train = dropColumnsWithHighMissingValues(train, test, data)
missingValuesInfo(train)

Unnamed: 0,Total,Percent
MasVnrType,872,59.73
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageCond,81,5.55
GarageYrBlt,81,5.55
GarageFinish,81,5.55
GarageQual,81,5.55
GarageType,81,5.55
BsmtFinType2,38,2.6
BsmtExposure,38,2.6


In [9]:
# Missing Value Handling
def HandleMissingValues(df):
    num_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
    cat_cols = [cname for cname in df.columns if df[cname].dtype == "object"]
    values = {}

    for a in cat_cols:
        values[a] = 'UNKNOWN'

    for a in num_cols:
        values[a] = df[a].median()
        
    df.fillna(value = values, inplace = True)
    
    
HandleMissingValues(data)
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,UNKNOWN,Reg,Lvl,AllPub,Inside,...,0,0,UNKNOWN,UNKNOWN,UNKNOWN,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,UNKNOWN,Reg,Lvl,AllPub,FR2,...,0,0,UNKNOWN,UNKNOWN,UNKNOWN,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,UNKNOWN,IR1,Lvl,AllPub,Inside,...,0,0,UNKNOWN,UNKNOWN,UNKNOWN,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,UNKNOWN,IR1,Lvl,AllPub,Corner,...,0,0,UNKNOWN,UNKNOWN,UNKNOWN,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,UNKNOWN,IR1,Lvl,AllPub,FR2,...,0,0,UNKNOWN,UNKNOWN,UNKNOWN,0,12,2008,WD,Normal


In [10]:
#Categorical Feature Encoding
def getObjectColumnsList(df):
    return [cname for cname in df.columns if df[cname].dtype == "object"]

def PerformOneHotEncoding(df,columnsToEncode):
    return pd.get_dummies(df,columns = columnsToEncode)

cat_cols = getObjectColumnsList(data)
data = PerformOneHotEncoding(data,cat_cols)

In [11]:
data.shape

(2919, 310)

In [12]:
#spliting the data into train and test datasets
train_data = data.iloc[:1460, :]
test_data = data.iloc[1460:, :]

In [13]:
# Get X,y for modelling
X = train_data
y = train.loc[:, 'SalePrice']

In [14]:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas = (0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))
ridge_cv.fit(X, y)
ridge_cv_preds = ridge_cv.predict(test_data)

In [15]:
import xgboost as xgb

model_xgb = xgb.XGBRegressor(n_estimators = 340, max_depth = 2, learning_rate = 0.2)
model_xgb.fit(X, y)
xgb_preds = model_xgb.predict(test_data)

In [16]:
predictions = ( ridge_cv_preds + xgb_preds ) / 2

In [17]:
#make the submission data frame
submission = {
    'Id': test.Id.values,
    'SalePrice': predictions
}
solution = pd.DataFrame(submission)
solution.head()

Unnamed: 0,Id,SalePrice
0,1461,111441.638859
1,1462,155726.410806
2,1463,183348.687168
3,1464,190753.241019
4,1465,194928.811074


In [18]:
#make the submission file
solution.to_csv('submission.csv', index = False)