# Step 1

**Load libraries**

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 
import warnings
warnings.filterwarnings("ignore")
import pandas
from pathlib import Path
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Step 2

**Read in data**

In [None]:
# Read the training CSV with 'id' as the index, then discard the
# 'Unnamed: 0' column that comes along with the file.
df_train = (
    pd.read_csv('kc_house_data_train.csv', index_col='id')
      .drop(columns='Unnamed: 0')
)
pd.set_option("display.max_columns", None)
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train.describe()

In [None]:
df_train.isna().any()

In [None]:
df_train.nunique(axis=0)

# Step 3


**EDA**

In [None]:
# distribution of values for all variables
sns.set_theme(style="darkgrid")
df_train.hist(bins=50, figsize=(20,15))
plt.show()

# Observations

bedrooms: the min is 0. All houses generally should have bedrooms. This could be a case of missing values.

bedrooms: the max is 33. The square footage of this house will have to be checked to confirm that this is plausible. It could also be a typo.

bathrooms: the min is 0. All houses have at least 1 bathroom. This could be a case of missing value or non traditional properties, like barns.

floors: The number of floors seems to be a float with .5. In real-estate definition a 1.5 storey house means that the house has a floor that only spans half the size of the main floor. The data is valid here.

waterfront: the max is 1. The value is 0 at 25%, 50% and 75%. This makes sense since this variable is a dummy. Houses that have a waterfront view have a value of 1 and for no waterfront view the value is 0.

view: the max is 4, but the value is 0 at 25%, 50% and 75%. This tells me that most houses have a view rating of 0.

sqft_basement: the values at the 25th and 50th percentiles are 0. This could mean that only some houses have basements and others don't.

yr_renovated: The value is 0 at 25%, 50% and 75%. This could mean that a value of 0 marks a house that was never renovated after it was built.

In [None]:
#sns.set_theme(style="darkgrid")
#df_train['price'].hist(bins=50, figsize=(11.7,8.27))
#plt.show()

In [None]:
# Pairwise Pearson correlations between the numeric columns only. The raw
# 'date' strings cannot be correlated, and recent pandas releases raise
# instead of silently skipping non-numeric columns.
corr = df_train.select_dtypes(include=['number', 'bool']).corr()

sns.set(rc={'figure.figsize':(12,9)})
sns.heatmap(corr, xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=False, cmap='Blues', center=0)
plt.title('Degrees of Correlation in dataset');

In [None]:
#corr = df_train.corr().abs()

#sns.set(rc={'figure.figsize':(12,9)})
#sns.heatmap(corr, xticklabels=corr.columns, \
#            yticklabels=corr.columns, \
#            annot=False, cmap='Blues', center= 0)
#plt.title('Degrees of Correlation in dataset');

## Date

In [None]:
# Inspect how the values in 'date' are stored (type() of the column would
# always report pandas' Series class, whatever the column holds)
df_train['date'].dtype

In [None]:
# Keep only the first 8 characters of each date string (dropping the
# trailing time part) and convert the result to datetime in one step
df_train['date'] = pd.to_datetime(df_train['date'].str[:8])


In [None]:
# Absolute correlation of every numeric column with price, weakest first.
# Non-numeric columns (e.g. the datetime 'date') are excluded explicitly so
# the result does not depend on how the installed pandas treats them.
df_train.select_dtypes(include=['number', 'bool']).corr()['price'].abs().sort_values()

## Year renovated

In [None]:
#Change values of years renovated
renovated = np.where(df_train['yr_renovated'] == 0, df_train['yr_built'], df_train['yr_renovated'])

In [None]:
# Write the filled-in renovation year back to the frame. The assignment used
# to run the other way round, which threw away the np.where result computed
# in the previous cell and left the zeros in place.
# NOTE(review): assumes the intent was to replace the 0 placeholders - confirm.
df_train['yr_renovated'] = renovated

In [None]:
df_train.info()

In [None]:
df_train.nunique(axis=0)

In [None]:
#df_train['second_sale'] = [1 if x == True else 0 for x in df_train.duplicated(subset='id', keep='last')]


# Bedrooms

In [None]:
# fix the value for the house with 33 bedrooms
# .loc writes into df_train itself; the chained form df['bedrooms'][id] = ...
# may assign to a temporary copy (SettingWithCopyWarning) and is a no-op
# under pandas copy-on-write.
df_train.loc[2402100895, 'bedrooms'] = 3

In [None]:
# Correlation between price and bedrooms, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['bedrooms'])

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["bedrooms"])

In [None]:
# Bar plot of price for each bedrooms value, with plain (non-scientific)
# tick labels on the price axis
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='bedrooms', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()


In [None]:
# maybe drop >10 bedrooms
# Distribution of sale price within each bedroom count; the title now says
# what is plotted (it used to read as if the plot counted bedrooms).
sns.set(style="darkgrid")
sns.boxplot(x=df_train["bedrooms"], y=df_train["price"])
plt.title("Sale price by number of bedrooms");

In [None]:
# H0: mean house price is the same for every bedroom count
# Ha: at least one bedroom count has a different mean house price
# One-way ANOVA (F test). Each sample passed to f_oneway must be the prices
# of one group; passing the bedrooms column and the price column as the two
# samples only compared bedroom counts with dollar amounts.

import scipy.stats as st

price_by_bedrooms = [prices.values for _, prices in df_train.groupby('bedrooms')['price']]
st.f_oneway(*price_by_bedrooms)
# reject the null hypothesis if the reported pvalue is less than .05

In [None]:
#run linear regression model. low R-squared score
ols(formula='price~bedrooms', data = df_train).fit().summary()

In [None]:
#Run linear regression model on dummied bedrooms. R-squared is higher but still low overall.
ols(formula='price~C(bedrooms)', data = df_train).fit().summary()

In [None]:
# One-hot encode the bedroom count (first level dropped as the baseline).
# The dummy frame has exactly the same rows as df_train, so it is appended
# column-wise; merging on the 'id' index instead multiplies the rows of any
# id that occurs more than once. The prefix keeps every column label a
# string, and dtype=int keeps the columns numeric on every pandas version.
bedroom_dummies = pd.get_dummies(df_train['bedrooms'], prefix='bedrooms',
                                 drop_first=True, dtype=int)
df_train = pd.concat([df_train, bedroom_dummies], axis=1)
df_train

In [None]:
# Preview the one-hot encoding of the bedroom count. (The previous line
# referenced the undefined names 'dfpd' and 'df' and raised NameError.)
pd.get_dummies(df_train['bedrooms'], prefix='bedrooms', drop_first=True).head()

In [None]:
df_train.shape

# Bathrooms

In [None]:
# Correlation between price and bathrooms, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['bathrooms'])

In [None]:
# Bar plot of price for each bathrooms value
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='bathrooms', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Box plot of the price distribution within each bathrooms value
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.boxplot(data=df_train, x='bathrooms', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["bathrooms"])

In [None]:
ols(formula='price~bathrooms', data = df_train).fit().summary()

In [None]:
ols(formula='price~C(bathrooms)', data = df_train).fit().summary()

In [None]:
df_train['bathrooms'].mean()

In [None]:
df_train['bathrooms'].value_counts()

In [None]:
#df_train['1_bathroom'] = np.select([df_train['bathrooms'] <= 1, df_train['bathrooms']>1], [1, 0])
#df_train['2_bathroom'] = np.select([df_train['bathrooms'] <= 2, df_train['bathrooms']>1], [1, 0])
#df_train['3_bathroom'] = np.select([df_train['bathrooms'] <= 3, df_train['bathrooms']>2], [1, 0])
#df_train['4_bathroom'] = np.select([df_train['bathrooms'] <= 4, df_train['bathrooms']>3], [1, 0])
#df_train['5_bathroom'] = np.select([df_train['bathrooms'] <= 5, df_train['bathrooms']>4], [1, 0])
#df_train['6_bathroom'] = np.select([df_train['bathrooms'] <= 6, df_train['bathrooms']>5], [1, 0])
#df_train['7_bathroom'] = np.select([df_train['bathrooms'] <= 7, df_train['bathrooms']>6], [1, 0])
#df_train['8_bathroom'] = np.select([df_train['bathrooms'] <= 8, df_train['bathrooms']>7], [1, 0])

In [None]:
# One indicator column per whole-bathroom bucket: 'N_bathroom' is 1 when
# N-1 < bathrooms <= N and 0 otherwise. The first bucket used to test
# "<= 1" and "> 1" at the same time, so '1_bathroom' was always 0; its lower
# bound is now 0 like the pattern of the other buckets implies.
# Homes with 0 bathrooms fall in no bucket.
for upper in range(1, 9):
    in_bucket = (df_train['bathrooms'] > upper - 1) & (df_train['bathrooms'] <= upper)
    df_train[f'{upper}_bathroom'] = np.where(in_bucket, 1, 0)

In [None]:
df_train.head()

# Square footage

In [None]:
# Correlation between price and sqft_living, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['sqft_living'])

In [None]:
# Line plot of price against sqft_living
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.lineplot(data=df_train, x='sqft_living', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Scatter plot of price against sqft_living
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='sqft_living', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# distribution of the living-area square footage
# Theme and figure size are set before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["sqft_living"])

In [None]:
sns.set_theme(style="darkgrid")
df_train['sqft_living'].hist(bins=50, figsize=(11.7,8.27))
plt.show()

In [None]:
ols(formula='price~sqft_living', data = df_train).fit().summary()

# Lot size

In [None]:
# Correlation between price and sqft_lot, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['sqft_lot'])

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["sqft_lot"])

# Lot size of 15 closest neighbors

In [None]:
# Correlation between price and sqft_lot15, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['sqft_lot15'])

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["sqft_lot15"])

In [None]:
df_train.sort_values('sqft_lot15', ascending = True)

In [None]:
len(df_train['zipcode'].unique())

In [None]:
print(df_train['lat'].min())
print(df_train['lat'].max())

In [None]:
print(df_train['long'].min())
print(df_train['long'].max())

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["zipcode"])

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["lat"])

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["long"])

# Floors

In [None]:
# Correlation between price and floors, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['floors'])

In [None]:
df_train['floors'].value_counts()

In [None]:
# Bar plot of price for each floors value
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='floors', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Box plot of the price distribution within each grade value
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.boxplot(data=df_train, x='grade', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Set the theme and figure size before drawing so they apply to this plot
# (setting the size afterwards only affected later figures)
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})
ax = sns.boxplot(x=df_train["floors"])

In [None]:
# Scatter plot of price against floors
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='floors', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
#column_1 = df["a"]
#column_2 = df["c"]
#correlation = column_1. corr(column_2) calculate correlation between `column_1` and `column_2`
#print(correlation)

In [None]:
df_train['condition'].value_counts()

# Zipcode

In [None]:
# Call info() to get the column / dtype / non-null summary; without the
# parentheses the cell only displayed the bound method itself
df_train.info()

In [None]:
#need to figure out how to use the dummies here
df_train['price'].corr(df_train['zipcode'])

In [None]:
# Bar plot of price for each zipcode, with the zipcode labels rotated so
# they stay readable
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='zipcode', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Scatter plot of price against zipcode
sns.scatterplot(data=df_train, x='zipcode', y='price')
plt.show()

In [None]:
df_top5zip_price = df_train.groupby("zipcode")["price"].mean().sort_values(ascending = False)[:5]
df_mean_price = df_train.price.mean()
df_top5zip_price

In [None]:
df_mean_price = df_train.price.mean()
df_mean_price

In [None]:
mean_price_by_zip = df_train.groupby("zipcode")["price"].mean().sort_values(ascending = False)[:70]

In [None]:
# Per-zipcode summary: mean price (70 highest) plus the number of sales.
# The frame is rebuilt from df_train every time, so the cell can be re-run;
# before, a second run called .to_frame() on what was already a DataFrame.
zip_prices = df_train.groupby("zipcode")["price"]
count_by_zip = zip_prices.count().sort_values(ascending = False)[:70]
mean_price_by_zip = zip_prices.mean().sort_values(ascending = False)[:70].to_frame()
# assignment aligns on the zipcode index
mean_price_by_zip['count'] = count_by_zip
mean_price_by_zip

In [None]:
mean_price_by_zip.sort_values('count')
#pandas.set_option('display.max_rows', None)

In [None]:
mean_price_by_zip.sort_values('price')

In [None]:
top5_zip_by_mean_price = df_train.groupby("zipcode")['price'].mean().sort_values(ascending = False)[:5]
total_mean_price = df_train.price.mean()
top5_zip_by_mean_price

In [None]:
# Five zip codes with the highest average bedroom count, plus the overall
# average bedroom count (previously computed from the price column by mistake)
top5_zips_by_avg_number_of_bedrooms = df_train.groupby("zipcode")['bedrooms'].mean().sort_values(ascending = False)[:5]
total_mean_bedrooms = df_train.bedrooms.mean()

In [None]:
# Bar chart of the five zip codes with the highest mean sale price. The bars
# are read from top5_zip_by_mean_price (computed in an earlier cell) rather
# than hand-copied numbers, so the chart follows the data. The overall mean
# price is drawn as a red reference line.
sns.set_style('whitegrid')
x = [str(zipcode) for zipcode in top5_zip_by_mean_price.index]
y = list(top5_zip_by_mean_price.values)
fig, ax = plt.subplots(figsize = (15, 5))
sns.barplot(x=x, y=y, ax=ax)
ax.ticklabel_format(style='plain', axis='y')

ax.set(xlabel='Zip Code', ylabel='Avg. Price')
plt.title("Average Home Price  ") # You can comment this line out if you don't need title
# span the line across all bars, whatever their number
plt.hlines(total_mean_price, -.5, len(x) - .5, colors="red", label="Average Price")
ax.legend()  # without this the "Average Price" label is never shown

plt.show()


In [None]:
# add dummy columns for zipcodes (first zipcode dropped as the baseline)
# The dummy frame has exactly the same rows as df_train, so it is appended
# column-wise; merging on the 'id' index instead multiplies the rows of any
# id that occurs more than once. The prefix keeps every column label a
# string, and dtype=int keeps the columns numeric on every pandas version.
zipcode_dummies = pd.get_dummies(df_train['zipcode'], prefix='zip',
                                 drop_first=True, dtype=int)
df_train = pd.concat([df_train, zipcode_dummies], axis=1)
df_train

In [None]:
from statsmodels.formula.api import ols
ols(formula='price~sqft_living+waterfront', data= df_train).fit().summary()

In [None]:
from statsmodels.formula.api import ols
ols(formula='price~zipcode', data= df_train).fit().summary()

In [None]:
# drop zipcodes with p value higher than .05
from statsmodels.formula.api import ols
ols(formula='price~C(zipcode)', data= df_train).fit().summary()

# Basement

In [None]:
# add a binary flag: 0 when sqft_basement is 0, otherwise 1
df_train['has_basement'] = np.where(df_train['sqft_basement'] == 0, 0, 1)
df_train

In [None]:
df_train['price'].corr(df_train['has_basement'])

In [None]:
sns.set(style="darkgrid")
sns.boxplot(x=df_train["has_basement"],y=df_train["price"])
plt.title("Median prices of homes with basements and without")

In [None]:
# Correlation between price and has_basement, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['has_basement'])

In [None]:
# 2 sample t test
# H0: There is no difference in price between homes with a basement and those without.
# Ha: There is a difference in price between homes with a basement and those without.

# Prices of the two groups, split on the has_basement flag
no_basement = df_train[df_train['has_basement']==0]['price']
basement = df_train[df_train['has_basement']==1]['price']

# `st` is scipy.stats, imported in an earlier cell
st.ttest_ind(no_basement, basement)

#  Price per square foot

In [None]:
# add new feature--price per square foot
# NOTE(review): this column is computed from price itself, so it carries the
# target; it is fine for exploration but should not be fed to a model that
# predicts price.
df_train['price_sqft'] = df_train['price']/df_train['sqft_living']
df_train

In [None]:
df_train['price'].corr(df_train['price_sqft'])

In [None]:
# Scatter plot of price against price per square foot
sns.scatterplot(data=df_train, x='price_sqft', y='price')
plt.show()

In [None]:
# distribution of price per squarefoot
sns.set_theme(style="darkgrid")
df_train['price_sqft'].hist(bins=50, figsize=(11.7,8.27))
plt.show()

# Bedroom to bathroom ratio

In [None]:
# add new feature--bedroom bathroom ratio
# Dividing by 0 bathrooms gives inf (or NaN for 0/0), and scikit-learn
# estimators refuse inputs containing either, so those rows get a ratio of 0.
# NOTE(review): 0 is a placeholder for "ratio undefined" - confirm this is
# the value wanted for homes without bathrooms.
bed_bath = df_train['bedrooms'] / df_train['bathrooms']
df_train['bed_bath_ratio'] = bed_bath.replace([np.inf, -np.inf], np.nan).fillna(0).round(2)
df_train

In [None]:
# Correlation between price and bed_bath_ratio, computed from the two columns
# directly instead of building the full correlation matrix for one value
df_train['price'].corr(df_train['bed_bath_ratio'])

In [None]:
# Scatter plot of price against the bedroom-to-bathroom ratio
sns.scatterplot(data=df_train, x='bed_bath_ratio', y='price')
plt.show()

In [None]:
ols(formula='price~C(bed_bath_ratio)', data= df_train).fit().summary()

# Yard space

**There does not appear to be a correlation between price and yard space**

In [None]:
# yard_space = lot area minus (living area divided by number of floors),
# rounded to 2 decimals
area_per_floor = df_train['sqft_living'] / df_train['floors']
df_train['yard_space'] = (df_train['sqft_lot'] - area_per_floor).round(2)
df_train

In [None]:
# very low to no correlation
df_train['price'].corr(df_train['yard_space'])

In [None]:
# Scatter plot of price against yard_space
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='yard_space', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
ols(formula='price~yard_space', data = df_train).fit().summary()

# Grade

In [None]:
df_train['grade'].value_counts()

In [None]:
df_train['price'].corr(df_train['grade'])

In [None]:
# Bar plot of price for each grade value
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='grade', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Box plot of the price distribution within each grade value
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.boxplot(data=df_train, x='grade', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Scatter plot of price against grade
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='grade', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
#df_train['low_grade'] = np.where(df_train['grade'] < 5, 1, 0)
#df_train['mid_grade'] = np.where(df_train['grade'].isin(range(5,11)), 1, 0)
#df_train['high_grade'] = np.where(df_train['grade'] > 10, 1, 0)


In [None]:
#df_train['price'].corr(df_train['high_grade'])

In [None]:
df_train['price'].corr(df_train['grade'])

In [None]:
ols(formula='price~grade', data = df_train).fit().summary()

In [None]:
# Bucket the grade into five indicator columns:
#   < 6 low, 6 below average, 7 average, 8-9 above average, > 9 high
df_train['low_grade'] = np.where(df_train['grade'] < 6, 1, 0)
df_train['below_average_grade'] = np.where(df_train['grade'] == 6, 1, 0)
df_train['average_grade'] = np.where(df_train['grade'] ==7, 1, 0)
# range(8, 10) covers grades 8 and 9; range(8, 9) matched only 8, which left
# grade 9 homes outside every bucket
df_train['above_average_grade'] = np.where(df_train['grade'].isin(range(8,10)), 1, 0)
df_train['high_Grade'] = np.where(df_train['grade'] > 9, 1, 0)

In [None]:
df_train.head()

# View

In [None]:
df_train['view'].value_counts()

In [None]:
# Bar plot of price for each view rating
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='view', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()


In [None]:
# Box plot of the price distribution within each view rating
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.boxplot(data=df_train, x='view', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Scatter plot of price against view rating
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='view', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
df_train['view'].value_counts()

In [None]:
df_train['price'].corr(df_train['view'])

In [None]:
ols(formula='price~view', data = df_train).fit().summary()

In [None]:
ols(formula='price~C(view)', data = df_train).fit().summary()

In [None]:
# Anova (f test): do mean prices differ across the view ratings?
# Each sample passed to f_oneway must be the prices of one view rating;
# passing the view column and the price column as the two samples only
# compared ratings with dollar amounts.

import scipy.stats as st

price_by_view = [prices.values for _, prices in df_train.groupby('view')['price']]
st.f_oneway(*price_by_view)

# Waterfront

In [None]:
# a value of one indicates that a home is on the waterfront
df_train['waterfront'].value_counts()

In [None]:
# Bar plot of price for waterfront (1) and non-waterfront (0) homes
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='waterfront', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Box plot of the price distribution for waterfront and non-waterfront homes
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.boxplot(data=df_train, x='waterfront', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
# Scatter plot of price against the waterfront flag
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='waterfront', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
df_train['price'].corr(df_train['waterfront'])

In [None]:
# 2 sample t test
# Compares the prices of homes flagged waterfront == 0 against those
# flagged waterfront == 1.
not_waterfront = df_train[df_train['waterfront']==0]['price']
waterfront = df_train[df_train['waterfront']==1]['price']

# `st` is scipy.stats, imported in an earlier cell
st.ttest_ind(not_waterfront, waterfront)

In [None]:
ols(formula='price~waterfront', data = df_train).fit().summary()

# Condition

In [None]:
df_train['price'].corr(df_train['condition'])

In [None]:
df_train['sqft_living'].corr(df_train['condition'])

In [None]:
# One scatter panel of price against sqft_living per condition value,
# coloured by condition and wrapped after 5 columns
grid = sns.FacetGrid(df_train, col = "condition", hue = "condition", col_wrap=5)
grid.map(sns.scatterplot, "sqft_living", "price")

grid.add_legend()

plt.show()


# Year built

In [None]:
df_train['price'].corr(df_train['yr_built'])

In [None]:
# Bar plot of price for each build year, with the year labels rotated so
# they stay readable
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.barplot(data=df_train, x='yr_built', y='price', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
plt.show()


# Bedrooms X Bathrooms

In [None]:
df_train['price'].corr(df_train['bedrooms'])

In [None]:
df_train['price'].corr(df_train['bathrooms'])

In [None]:
df_train['bedrooms'].corr(df_train['bathrooms'])

In [None]:
bed_x_bath = df_train['bedrooms']*df_train['bathrooms']

In [None]:
df_train['price'].corr(bed_x_bath)

In [None]:
# Scatter plot of price against the bedrooms * bathrooms product
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(x=bed_x_bath, y=df_train['price'], ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

# Maps

In [None]:
# Scatter of latitude against longitude, i.e. a rough map of the homes
sns.set_style('darkgrid')
fig, ax = plt.subplots()
sns.scatterplot(data=df_train, x='long', y='lat', ax=ax)
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
#import folium

#kc_map = folium.Map(location = [47.5480, -121.9836], tiles = 'OpenStreetMap', zoom_start=9)
#kc_coord = list(zip(df_train['lat'], df_train['long']))

#for coord in kc_coord:
#    folium.Marker(location = coord).add_to(kc_map)

#kc_map

# Part 2 - Modeling

In [None]:
df_train.head()

In [None]:
# Model features: everything except the target ('price'), the sale date,
# the raw columns that were re-encoded as dummy / bucket columns, and
# 'price_sqft'. price_sqft is price / sqft_living, so leaving it in would
# leak the target into the features and inflate every score below.
df_train2 = df_train.drop(['date', 'price', 'price_sqft', 'bedrooms', 'bathrooms', 'zipcode', 'grade'], axis=1)

In [None]:
df_train2.head()

In [None]:
df_train2.shape

In [None]:
#features = ['bedrooms', 'bathrooms', 'sqft_living', 'grade', 'zipcode']

In [None]:
df_features = df_train2

In [None]:
target = df_train['price']

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: a linear regression fitted on all rows of df_features and
# scored on those same rows
lm = LinearRegression().fit(df_features, target)

# intercept, coefficients, then the R^2 on the data used for fitting
print(lm.intercept_)
print(lm.coef_)
print("R^2: ", lm.score(df_features, target))

In [None]:
#call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(df_features, target, random_state=1, test_size=0.2)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Score every feature against the target with f_regression and keep the
# best 100, capped at the number of columns so the selector never asks
# for more features than exist
selector = SelectKBest(f_regression, k=min(100, X_train.shape[1]))

selector.fit(X_train, y_train)

# Use SKlearn to create new features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
poly_data = poly.fit_transform(df_features)

In [None]:
poly_data

In [None]:
len(df_features.columns)

In [None]:
# Names of the generated polynomial columns. get_feature_names was removed
# in scikit-learn 1.2 in favour of get_feature_names_out, so use whichever
# method this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(df_features.columns)
else:
    poly_columns = poly.get_feature_names(df_features.columns)

In [None]:
len(poly_columns)

In [None]:
# Wrap the polynomial matrix in a frame that keeps the house ids of
# df_features as its index, so its rows stay aligned with `target` by label
# as well as by position
df_poly = pd.DataFrame(poly_data, columns=poly_columns, index=df_features.index)

In [None]:
df_poly.head()

In [None]:
df_features.shape

In [None]:
df_poly.shape

# Fit and assess new model

In [None]:
#instantiate a linear regression object
lm_2 = LinearRegression()

#fit the linear regression to the polynomial features (all rows of df_poly)

lm_2 = lm_2.fit(df_poly, target)

#access output
#print(lm_2.intercept_)
#print(lm_2.coef_)
# R^2 is measured on the same rows the model was fitted on
print("R^2: ", lm_2.score(df_poly, target))

# Create Train and Test Split

In [None]:
#import train_test_split from sklearn package
from sklearn.model_selection import train_test_split

In [None]:
#call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(df_features, target, random_state=1, test_size=0.2)

In [None]:
#check the shape of the results
print("Training set - Features: ", X_train.shape, "Target: ", y_train.shape)
# second line reports the held-out split (it was mislabelled "Training set")
print("Test set - Features: ", X_test.shape, "Target: ",y_test.shape)

In [None]:
# Refit the linear regression, this time on the training split only
from sklearn import linear_model

lm = linear_model.LinearRegression().fit(X_train, y_train)

# fitted intercept, then the coefficient array
for fitted_param in (lm.intercept_, lm.coef_):
    print(fitted_param)

# How well did my model perform

In [None]:
print ("R^2 Score:", lm.score(X_train, y_train))

In [None]:
#predict on the training data
y_train_pred = lm.predict(X_train)

In [None]:
y_train_pred

In [None]:
# Error metrics of the predictions on the training split
from sklearn import metrics

train_mae = metrics.mean_absolute_error(y_train, y_train_pred)
train_mse = metrics.mean_squared_error(y_train, y_train_pred)
# RMSE is the square root of the MSE computed on the line above
train_rmse = np.sqrt(train_mse)

print('Mean Absolute Error:', train_mae )
print('Mean Squared Error:',  train_mse)
print('Root Mean Squared Error:' , train_rmse)

In [None]:
price_std = target.std()

print('Z-Score of Mean Absolute Error:', train_mae/price_std )
print('Z-Score of Root Mean Squared Error:' , train_rmse/price_std)

# Predicting the Test Set

In [None]:
#predict on the test set of data
y_pred = lm.predict(X_test)

In [None]:
y_pred[:10]

In [None]:
print ("Score:", lm.score(X_test, y_test))

In [None]:
# Error metrics of the predictions on the test split; each metric is
# computed once and the stored value is printed
test_mae = metrics.mean_absolute_error(y_test, y_pred)
test_mse = metrics.mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

print('Mean Absolute Error:' + str(test_mae))
print('Mean Squared Error:' + str(test_mse))
print('Root Mean Squared Error:' + str(test_rmse))

In [None]:
print('Mean Absolute Error  Z:', test_mae/price_std )
print('Root Mean Squared Error Z:' , test_rmse/price_std)

# Comparing our Model's performance on training data versus test data

In [None]:
print('Training: ', int(train_rmse), "vs. Testing: ", int(test_rmse))

# Feature Selection