In [1]:
import numpy as np
import pandas as pd
import math

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# Import Data

In [2]:
# Import training and holdout dataframes
train_df = pd.read_csv('kc_house_data_train.csv')
hold_df = pd.read_csv('kc_house_data_test_features.csv')

# Create Features

In [3]:
# Print column names
print(train_df.keys())

Index(['Unnamed: 0', 'id', 'date', 'price', 'bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition',
       'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


In [4]:
# Function to create house ages
def set_ages(df):
    df['age'] = 2015 - df['yr_built']
    return df

# Create house ages for dataframes
# train_df = set_age(train_df)
# hold_df = set_age(hold_df)

In [5]:
# Function to transform year renovated
def trans_ren(df):
    df['yr_renovated'] = df['yr_renovated'] - 1933
    return df

# Transorm year renovated for dataframes
# train_df = trans_ren(train_df)
# hold_df = trans_ren(hold_df)

In [6]:
# Function for creating grade dummy variables
def set_grades(df):
    dummies = pd.get_dummies(df['grade'], prefix='grade', drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    for grade in dummies.keys():
        df[grade] = df[grade] * df['sqft_living']
    return df

# Create grade dummy variables for dataframes
# train_df = set_grades(train_df)
# hold_df = set_grades(hold_df)


In [7]:
# Function for creating condition dummy variables
def set_conditions(df):
    dummies = pd.get_dummies(df['condition'], prefix='condition', drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    for condition in dummies.keys():
        df[condition] = df[condition] * df['sqft_living']
    return df

In [8]:
# Function for creating year dummy variables
def set_years(df):
    df['year'] = df['date'].apply(lambda x: int(x[0:4]))
    dummies = pd.get_dummies(df['year'], drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    for year in dummies.keys():
        df[year] = df[year] * df['sqft_living']
    df.drop('year', axis=1, inplace=True)
    return df

# Create year dummy variables for dataframes
# train_df = set_years(train_df)
# hold_df = set_years(hold_df)

In [9]:
# Function for creating month dummy variables
def set_months(df):
#     months = {1:'jan', 2:'feb', 3:'mar', 4:'apr', 5:'may', 6:'jun',
#               7:'jul', 8:'aug', 9:'sep', 10:'oct', 11:'nov', 12:'dec'}
    df['month'] = df['date'].apply(lambda x: int(x[4:6]))
    dummies = pd.get_dummies(df['month'], drop_first=True)
#     for key in dummies.keys():
#         if key in months.keys():
#             dummies.rename({key : months[key]}, axis=1, inplace=True)
    df = pd.concat([df, dummies], axis=1)
    for month in dummies.keys():
        df[month] = df[month] * df['sqft_living']
    df.drop('month', axis=1, inplace=True)
    return df

# Create month dummy variables for dataframes
# train_df = set_months(train_df)
# hold_df = set_months(hold_df)

In [10]:
# Function for creating zipcode dummy variables
def set_zipcodes(df):
    dummies = pd.get_dummies(df['zipcode'], drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    for zipcode in dummies.keys():
        df[zipcode] = df[zipcode] * df['sqft_living']
    return df

# Create zipcode dummy variables for dataframes
# train_df = set_zipcodes(train_df)
# hold_df = set_zipcodes(hold_df)

In [11]:
# Function to transform waterfront variable
def trans_waterfront(df):
    df['waterfront'] = df['waterfront'] * df['sqft_living']
    return df

# Transform waterfront variable for dataframes
# train_df = trans_waterfront(train_df)
# hold_df = trans_waterfront(hold_df)

# Apply Features to Dataframes

In [12]:
# Function to create all features for dataframes
def create_featuers(df):
    df = set_ages(df)
    df = trans_ren(df)
    df = set_years(df)
    df = set_months(df)
    df = set_zipcodes(df)
    df = trans_waterfront(df)
    df = set_grades(df)
    df = set_conditions(df)
    return df

# Create all features for dataframes
train_df = create_featuers(train_df)
hold_df = create_featuers(hold_df)

# Fix Outliers

In [13]:
# Function to transform extreme values
def fix_outliers(df, cols):
    for col in cols:
        if df[col].eq(0).any():
            filt = df[col] == 0
            df[f'{col}_nan'] = np.where(filt, np.nan, df[col])
            std = df[f'{col}_nan'].std()
            mean = df[f'{col}_nan'].mean()
            del df[f'{col}_nan']
        else:
            std = df[col].std()
            mean = df[col].mean()
        value = mean+(8*std)
        df[col] = df[col].apply(lambda x: value if (x>value) else x)
    return df

# Transform extreme values for specified columns dataframes
outlier_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']

train_df = fix_outliers(train_df, outlier_cols)
hold_df = fix_outliers(hold_df, outlier_cols)

# EDA

In [14]:
# # Function to create feature pplots
# def pplot_features(df, features):
#     pp_rows = [features[i:i+4] for i in range(0, len(features), 4)]
#     for row in pp_rows:
#         pp = sns.pairplot(data=df, y_vars=['price'], x_vars=row, kind='reg', height=3)
#     return None

# # Create feature pplots for training df
# pplot_features(train_df, features)

# Split Data Into Train, Test Data

In [15]:
def tt_split(df):
    ignore = ['Unnamed: 0', 'id', 'price', 'date', 'yr_built', 'zipcode',
          'lat', 'long', 'sqft_living', 'sqft_above','grade', 'condition']
    Y = df['price']
    for col in ignore:
        df = df.drop(columns=col, axis=1)
    x_train, x_test, y_train, y_test = train_test_split(df, Y, random_state=22,test_size=0.2)
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = tt_split(train_df)


In [16]:
x_hold = hold_df.drop(columns=['Unnamed: 0', 'id', 'date', 'yr_built', 'zipcode',
          'lat', 'long', 'sqft_living', 'sqft_above','grade', 'condition'])

# Run Model

In [17]:
model = sm.OLS(y_train, sm.add_constant(x_train[[key for key in x_train]])).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,1064.0
Date:,"Mon, 04 May 2020",Prob (F-statistic):,0.0
Time:,11:57:44,Log-Likelihood:,-181730.0
No. Observations:,13832,AIC:,363700.0
Df Residuals:,13724,BIC:,364500.0
Df Model:,107,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.091e+05,1.04e+04,20.151,0.000,1.89e+05,2.29e+05
bedrooms,-9880.4666,1557.358,-6.344,0.000,-1.29e+04,-6827.832
bathrooms,2.077e+04,2542.288,8.171,0.000,1.58e+04,2.58e+04
sqft_lot,0.5416,0.055,9.909,0.000,0.434,0.649
floors,-3.401e+04,2994.874,-11.357,0.000,-3.99e+04,-2.81e+04
waterfront,218.1411,3.617,60.310,0.000,211.051,225.231
view,4.314e+04,1676.618,25.730,0.000,3.99e+04,4.64e+04
sqft_basement,-75.9596,3.605,-21.071,0.000,-83.026,-68.893
yr_renovated,23.3993,2.870,8.154,0.000,17.774,29.024

0,1,2,3
Omnibus:,5660.646,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,262906.4
Skew:,1.228,Prob(JB):,0.0
Kurtosis:,24.216,Cond. No.,438000.0


# Feature Selection

In [18]:
lasso = Lasso(alpha=0.01, normalize=False)
lasso.fit(x_train, y_train)

y_train_pred = lasso.predict(x_train)
y_pred = lasso.predict(x_test)

  positive)


In [19]:
train_rmse = metrics.mean_absolute_error(y_train, y_train_pred)
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Training Error: '+ str(train_rmse) )
print('Testing Error: '+ str(test_rmse) )

Training Error: 75673.14766397967
Testing Error: 129709.4618667734


In [20]:
lasso_coef = pd.DataFrame(data=lasso.coef_).T
lasso_coef.columns = x_train.columns
lasso_coef = lasso_coef.T.sort_values(by=0).T

lasso_coef

Unnamed: 0,floors,bedrooms,grade_3,grade_4,grade_5,grade_6,sqft_basement,grade_7,grade_8,grade_9,...,waterfront,98105,98109,98119,98112,98102,98004,98039,bathrooms,view
0,-34104.552036,-10025.174444,-268.95006,-154.262145,-112.715802,-89.58916,-76.088031,-68.023611,-48.479312,-26.610308,...,218.032761,234.174704,265.780238,274.706294,280.069725,281.292279,300.953774,378.869292,20661.204939,43149.076256


# Final Model Without Test 

In [21]:
y_all = train_df['price']
ignore = ['Unnamed: 0', 'id', 'price', 'date', 'yr_built', 'zipcode',
          'lat', 'long', 'sqft_living', 'sqft_above','grade', 'condition']
x_all = train_df
x_all = x_all.drop(columns=ignore, axis=1)

In [22]:
model_all = sm.OLS(y_all, sm.add_constant(x_all[[key for key in x_all]])).fit()
model_all.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.89
Model:,OLS,Adj. R-squared:,0.889
Method:,Least Squares,F-statistic:,1294.0
Date:,"Mon, 04 May 2020",Prob (F-statistic):,0.0
Time:,11:58:18,Log-Likelihood:,-227320.0
No. Observations:,17290,AIC:,454800.0
Df Residuals:,17182,BIC:,455700.0
Df Model:,107,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.151e+05,9305.171,23.116,0.000,1.97e+05,2.33e+05
bedrooms,-8823.6541,1411.189,-6.253,0.000,-1.16e+04,-6057.580
bathrooms,2.171e+04,2285.022,9.502,0.000,1.72e+04,2.62e+04
sqft_lot,0.5550,0.050,11.058,0.000,0.457,0.653
floors,-3.108e+04,2694.308,-11.534,0.000,-3.64e+04,-2.58e+04
waterfront,221.6211,3.337,66.415,0.000,215.080,228.162
view,4.306e+04,1505.846,28.594,0.000,4.01e+04,4.6e+04
sqft_basement,-70.3771,3.234,-21.762,0.000,-76.716,-64.038
yr_renovated,25.8073,2.564,10.064,0.000,20.781,30.833

0,1,2,3
Omnibus:,7326.552,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,332440.706
Skew:,1.308,Prob(JB):,0.0
Kurtosis:,24.322,Cond. No.,432000.0


In [23]:
lasso_all = Lasso(alpha=0.01, normalize=False)
lasso_all.fit(x_all, y_all)


  positive)


Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
y_hold = lasso_all.predict(x_hold)
y_hold

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 107 is different from 106)

In [None]:
#x_all.keys

# Save Results

# Notes (TESTING)

In [None]:
# # Recreate original dataframe without any added columns
# TESTING = pd.read_csv('kc_house_data_train.csv')

# # Create features list for original dataframe
# TESTING_FEATURES = set_features(TESTING)

# # Run model on original dataframe
# set_model(TESTING, TESTING_FEATURES)

# Unused Functions

In [None]:
# # Function to create target variable
# def set_target(df):
#     target = df['price']
#     return target

# # Create target variable for dataframes
# train_target = set_target(train_df)
# test_target = set_target(test_df)

In [None]:
# # Function for creating grade dummy variables
# def set_grades(df):
#     dummies = pd.get_dummies(df['grade'], prefix='grade', drop_first=True)
#     df = pd.concat([df, dummies], axis=1)
#     return df

# # Create grade dummy variables for dataframes
# train_df = set_grades(train_df)
# holdout_df = set_grades(holdout_df)

In [None]:
# # Function to transform bedrooms outliers
# def bedrooms_trans(df):
#     filt = df['bedrooms'] < 10
#     df['bedrooms'] = np.where(filt, df['bedrooms'], 10)
#     return df

# # Transform bedrooms outliers in dataframes 
# train_df = bedrooms_trans(train_df)
# hold_df = bedrooms_trans(hold_df)

In [None]:
# # Function to transform bathrooms outliers
# def bathrooms_trans(df):
#     low_filt = df['bathrooms'] > .5
#     df['bathrooms'] = np.where(low_filt, df['bathrooms'], .5)
#     high_filt = df['bathrooms'] < 5
#     df['bathrooms'] = np.where(high_filt, df['bathrooms'], 5)
#     return df

# # Transform bedrooms outliers in dataframes 
# train_df = bathrooms_trans(train_df)
# hold_df = bathrooms_trans(hold_df)

In [None]:
# # Function to create regression modeln (smf method)
# def set_model(df, features):
#     formula = 'price~' + '+'.join([f'{ft}' for ft in features])
#     model = smf.ols(formula=formula, data=df).fit()
#     return model.summary()

# # Run model on train dataframe
# set_model(train_df, train_features)

In [None]:
# # Function to create age range dummy variables
# def set_ages(df):
#     year = df['yr_built']
#     # df['age_new'] = np.where(age==2015, 1, 0)
#     df['age_10'] = np.where((year>2004) & (year<2015), 1, 0)
#     df['age_20'] = np.where((year>1994) & (year<2005), 1, 0)
#     df['age_30'] = np.where((year>1984) & (year<1995), 1, 0)
#     df['age_40'] = np.where((year>1974) & (year<1985), 1, 0)
#     df['age_50'] = np.where((year>1964) & (year<1975), 1, 0)
#     df['age_60'] = np.where((year>1954) & (year<1965), 1, 0)
#     df['age_70'] = np.where((year>1944) & (year<1955), 1, 0)
#     df['age_80'] = np.where((year>1934) & (year<1945), 1, 0)
#     df['age_90'] = np.where((year>1924) & (year<1935), 1, 0)
#     df['age_100'] = np.where((year>1914) & (year<1925), 1, 0)
#     df['age_old'] = np.where((year<1915), 1, 0)
#     return df

# # Create age range dummy variables for dataframes
# # train_df = set_ages(train_df)
# # hold_df = set_ages(hold_df)

In [None]:
# # Function to create renovation age range dummy variables
# def set_ren_ages(df):
#     year = df['yr_built']
#     ren_yr = df['yr_renovated']
#     df['not_ren'] = np.where((year!=2015) & (ren_yr==0), 1, 0)
#     df['ren_5'] = np.where((ren_yr>2010), 1, 0)
#     df['ren_10'] = np.where((ren_yr>2005) & (ren_yr<2011), 1, 0)
#     df['ren_15'] = np.where((ren_yr>2000) & (ren_yr<2006), 1, 0)
#     df['ren_20'] = np.where((ren_yr>1995) & (ren_yr<2001), 1, 0)
#     df['ren_25'] = np.where((ren_yr>1990) & (ren_yr<1996), 1, 0)
#     df['ren_30'] = np.where((ren_yr>1985) & (ren_yr<1991), 1, 0)
#     # df['ren_old'] = np.where((ren_yr>0) & (ren_yr<1986), 1, 0)
#     return df

# # Create renovation age range dummy variables for dataframes
# # train_df = set_ren_ages(train_df)
# # hold_df = set_ren_ages(hold_df)

In [None]:
# # Transform grade variable
# df_test = train_df
# df_test['grade_log'] = df_test['grade'].apply(lambda x: math.log(x))

# df_test = train_df
# df_test['grade_exp'] = df_test['grade'].apply(lambda x: math.exp(x))

# df_test = train_df
# df_test['price_log'] = df_test['price'].apply(lambda x: math.log(x))


# test1 = df_test.groupby('grade_exp')['price'].mean()
# fig, ax = plt.subplots()
# ax.scatter(test1.index, test1.values)

In [None]:
# # Function to create features list
# def set_features(df):
#     ignore = ['Unnamed: 0', 'id', 'price', 'date', 'yr_built', 'zipcode',
#               'lat', 'long', 'sqft_living', 'sqft_above']
#     features = list(df.keys())
#     for feature in ignore:
#         if feature in features:
#             features.remove(feature)
#     return features

# # Create features list for dataframes
# features = set_features(train_df)

In [None]:
# # Function to create regression model
# def set_model(df, features):
#     model = sm.OLS(y_train, sm.add_constant(
#         df[[key for key in features]])).fit()
#     return model.summary()
# # Run model on training dataframe
# set_model(train_df, features)

In [None]:
# # Split into x dataframes and y series for train and test data
# Y = train_df['price']
# X = train_df.drop(['price'], axis=1)
# x_train, x_test, y_train, y_test = train_test_split(
#     X, Y, random_state=22,test_size=0.2)