In [None]:
%load_ext autoreload

from utils import set_project_dir
set_project_dir('project_4')

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import linear_model, model_selection, metrics
import missingno as miss
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read both data sets from their relative paths
train, test = [pd.read_csv(path) for path in ('data/in/train.csv', 'data/in/test.csv')]

# boolean flag per row, switched on further down while inspecting single features
train['outlier'] = False

# names of the feature columns collected for the hand-crafted regression
x_columns = list()

# feature engineering
## general information
### numerical features

In [None]:
# summary statistics of the training data (pandas' default describe() output)
train.describe()

Pearson correlation between (numerical) features and target:

In [None]:
# Restrict to numeric/boolean columns before correlating: DataFrame.corr() no longer
# drops string columns silently in pandas >= 2.0 and raises instead. The selection
# works on old and new pandas alike.
numeric = train.select_dtypes(include=['number', 'bool'])
numeric.corr()['SalePrice'].sort_values()

### categorical features

In [None]:
# describe() restricted to the string (object dtype) columns
train.select_dtypes('object').describe()

- one-way ANOVA to test whether the means of the different groups differ significantly
- or how groups of the IV affect the DV
- have to check for all individual groups as well, see here: http://hamelg.blogspot.de/2015/11/python-for-data-analysis-part-16_23.html

In [None]:
# pick one categorical column by position (17 is a hand-picked index)
column = train.select_dtypes('object').columns[17]
# use the SalePrice column directly: `y` is only defined in a later cell
sns.boxplot(x=train[column], y=train['SalePrice'])
# print(train[column].value_counts())
# One-way ANOVA needs the SalePrice values of every category. Iterating the groupby
# yields (category, prices) pairs, whereas `.groups` only holds the row labels.
price_groups = [prices.values for _, prices in train.groupby(column)['SalePrice']]
sp.stats.f_oneway(*price_groups)

## target variable: SalePrice
- the target variable has a skewed distribution (skewness of normal distribution is 0), so it is better to transform it so it more resembles a normal dist.
- also the kurtosis (measure of fat tail distributions, or likelihood of encountering extreme values) is higher than normal distribution (definition here is excess kurtosis = kurtosis - 3 = 0 for normal distribution)

In [None]:
# start with the untransformed target
y_column = 'SalePrice'
y = train[y_column]

sns.distplot(y)
# skewness and (excess) kurtosis of the target, printed side by side
shape_stats = (y.skew(), y.kurtosis())
print(*shape_stats)

take the logarithm to more resemble a normal distribution:

In [None]:
y_column = 'logSalePrice'
# Transform the raw column rather than `y`: once this cell has run, `y` refers to the
# log prices, so re-running it would otherwise take the logarithm a second time.
train[y_column] = np.log(train['SalePrice'])
# y only points to the pandas series, so it serves as shorthand for the column of `train`
y = train[y_column]
sns.distplot(y)
print(y.skew(), y.kurtosis())

check if outcome is approximately normal. distribution shows fat tail deviations on lower end:

https://www.itl.nist.gov/div898/handbook/eda/section3/probplot.htm

In [None]:
# probability plot of the target against a normal distribution, drawn onto `ax`
fig, ax = plt.subplots()
sp.stats.probplot(y, dist='norm', plot=ax)

## feature: OverallQual
- ranked categorical feature, use spearman correlation to check for monotonicity instead of linearity (pearson)
- fit linear: order=1
- use x_jitter to add noise to the categorical data, more pleasant for viewing

In [None]:
x = train['OverallQual']
# linear fit (order=1) annotated with the spearman statistic; x_jitter spreads the
# points of the discrete rating horizontally
sns.jointplot(x=x, y=y, kind='reg', order=1, x_jitter=0.4, stat_func=sp.stats.spearmanr)
x_columns += ['OverallQual']

## feature: GrLivArea
- there are some outliers, which have to be thrown out
- also the relationship is not really linear, might rather follow $y \sim log(x)$

In [None]:
x = train['GrLivArea']
# logx=True asks for a regression of the form y ~ log(x)
sns.jointplot(x=x, y=y, logx=True, kind='reg')

mark outliers and transform to logarithmic scale. output looks kind of better

In [None]:
# Read the raw column explicitly instead of relying on `x`: this cell rebinds `x` to the
# log feature, so a re-run would otherwise take log(log(x)) and miss the outliers.
living_area = train['GrLivArea']
# large living area but low (log) price
train.loc[(y < 12.5) & (living_area > 4000), 'outlier'] = True
train['logGrLivArea'] = np.log(living_area)
x = train['logGrLivArea']
sns.jointplot(x=x, y=y, kind='reg', order=1)
# register the feature only once, even if the cell is executed again
if 'logGrLivArea' not in x_columns:
    x_columns.append('logGrLivArea')

## feature: GarageCars
- if one feature is categorical, it might make sense to plot the mean/median instead of all values. then we can better see if a linear relationship holds
- apparently, the data follows a linear form up until 3. after that, it doesn't hold anymore. we can encode that as a new feature

In [None]:
x = train['GarageCars']
# x_estimator collapses each discrete x value to its median target
sns.jointplot(x=x, y=y, kind='reg', order=1, x_estimator=np.median)
# indicator feature for the 4-car garages
train['4GarageCars'] = (x == 4)
x_columns.extend(['GarageCars', '4GarageCars'])

we can also use a boxplot to show the progression of the median of the feature vs target. here, we can also see whether outliers are present. however, there is no regression line to extract the linear relationship

In [None]:
# distribution of the target per value of the current feature `x`
sns.boxplot(x=x, y=y)

## feature: GarageArea
- garage area = 0 indicates no garage, might be worth a new feature. But already follows a linear relationship, so it won't add much information.
- also, there are some outliers

In [None]:
x = train['GarageArea']
sns.jointplot(x=x, y=y, kind='reg')
# second view: medians of the target for x binned at 0, 100, ..., 900
area_bins = np.arange(0, 1000, 100)
sns.jointplot(x=x, y=y, kind='reg', x_bins=area_bins, x_estimator=np.median)
x_columns += [x.name]

mark outliers and create new feature:

In [None]:
# large garage area combined with a low (log) price
garage_outlier = (y < 12.5) & (x > 1200)
train.loc[garage_outlier, 'outlier'] = True
# indicator: a garage area of zero
train['noGarage'] = x == 0
sns.boxplot(x=train['noGarage'], y=y)
x_columns += ['noGarage']

## feature: TotalBsmtSF
same story, filter out outlier

In [None]:
x = train['TotalBsmtSF']
sns.jointplot(x=x, y=y, kind='reg')
# flag the rows with a basement area above 4000
basement_outlier = x > 4000
train.loc[basement_outlier, 'outlier'] = True

## feature: Neighborhood

In [None]:
x = train['Neighborhood']
sns.boxplot(x=x, y=y)
# one indicator column per neighborhood (no level dropped)
temp = pd.get_dummies(x, drop_first=False)
# ATTENTION: the merge returns a new DataFrame, so the earlier references x and y
# still point to columns of the old one
train = pd.merge(train, temp, how='left', left_index=True, right_index=True, copy=False)
x_columns.extend(temp.columns)

## feature: MSZoning

In [None]:
x = train['MSZoning']
sns.boxplot(x=x, y=y)
# one indicator column per zoning class (no level dropped)
temp = pd.get_dummies(x, drop_first=False)
# index-aligned merge; `train` is rebound to the merged frame
train = pd.merge(train, temp, how='left', left_index=True, right_index=True, copy=False)
x_columns.extend(temp.columns)

# handling NaN
https://medium.com/ibm-data-science-experience/missing-data-conundrum-exploration-and-imputation-techniques-9f40abe0fd87

- we definitely see some patterns for missing data. so we might just filter them out?
- also, some features aren't filled at all, so we either throw them away or encode them as new features

In [None]:
# columns with at least one missing value
has_nan = train.isnull().any()
nan_columns = train.columns[has_nan]
miss.matrix(train[nan_columns])

here, we can see, whether one feature missing correlates with another feature missing:

In [None]:
# missingno heatmap over the whole training frame
miss.heatmap(train)

# predict
- $R^2$ gives the fraction of the variance explained by the prediction
- apparently, it makes sense to leave out GrLivArea, since we already have the logarithmic feature

In [None]:
print(x_columns)
X = train[x_columns]
# X = train[['OverallQual', 'logGrLivArea', 'GarageCars', '4GarageCars', 'GarageArea', 'noGarage']]
# hold out a third of the rows; the fixed seed keeps the split reproducible
# (alternative without the flagged rows: X.loc[~train['outlier']], y.loc[~train['outlier']])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42)
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('R^2 =', clf.score(X_test, y_test))
# mean_squared_error returns the MSE; take the square root so that the value
# matches its 'RMS' label
print('RMS =', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

- try out Ridge regularization, which reduces the regression coefficients of less important features (doesn't make much of a difference here):

In [None]:
# same split as before, now with L2-regularized coefficients
clf = linear_model.Ridge(alpha=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('R^2 =', clf.score(X_test, y_test))
# mean_squared_error returns the MSE; take the square root so that the value
# matches its 'RMS' label
print('RMS =', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

plot the predicted target vs real target. we can see, that there are some values which are far off from being predicted accurately.
also the predictions should follow on average a straight line through the origin, $y_{pred}(y) = y$, which is not the case here.

In [None]:
# predictions against the true target, with a fitted regression line
plot = sns.regplot(x=y_test, y=y_pred, fit_reg=True)
# reference line y_pred = y
plt.plot([10, 15], [10, 15])

cannot get regression line parameters directly, so we have to dig through the matplotlib metadata. then we get the current linear relationship between $y_{pred}$ and $y$:

In [None]:
# the regression line is assumed to be the first line object on the axes
regression = plot.get_lines()[0]
reg_x, reg_y = regression.get_xdata(), regression.get_ydata()
# slope from the two end points of the line, intercept from its first point
rise = reg_y[-1] - reg_y[0]
run = reg_x[-1] - reg_x[0]
b = rise / run
a = reg_y[0] - b * reg_x[0]
print('y = {a} + {b} x'.format(a=a, b=b))

## super stupid brute force model

In [None]:
# Brute force: use every raw column as a predictor. Only columns that also exist in
# `test` are taken, which keeps the target ('SalePrice' / 'logSalePrice'), the
# 'outlier' flag and the features engineered above out of the design matrix.
# 'logSalePrice' as a predictor of itself would leak the target into the model.
excluded = ['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice', 'Id']
X = train[[f for f in train.columns if f in test.columns and f not in excluded]]
# one-hot encode the string columns; the other columns pass through unchanged
X = pd.get_dummies(X)
y = train['logSalePrice']
# impute the remaining gaps with the column mean (assignment instead of inplace,
# because X is derived from `train`)
X = X.fillna(X.mean())

In [None]:
# fixed seed so that the hold-out scores are reproducible between runs
# (alternative without the flagged rows: X.loc[~train['outlier']], y.loc[~train['outlier']])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42)
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('R^2 =', clf.score(X_test, y_test))
# mean_squared_error returns the MSE; take the square root so that the value
# matches its 'RMS' label
print('RMS =', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# predictions of the brute force model against the true target
plot = sns.regplot(x=y_test, y=y_pred, fit_reg=True)
# reference line y_pred = y
plt.plot([10, 15], [10, 15])

In [None]:
# build the design matrix of the competition test set the same way as X
X_test = test[[f for f in test.columns
           if f not in ['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice', 'Id']]]
X_test = pd.get_dummies(X_test)
# The model needs exactly the columns (and column order) it is fitted on. Columns of X
# that are absent here (e.g. categories that never occur in the test data) are added
# as zeros; columns unknown to X are dropped.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
# impute the remaining gaps with the column mean (assignment instead of inplace on a slice)
X_test = X_test.fillna(X_test.mean())

In [None]:
# refit on the complete training matrix; fit() returns the estimator itself
clf = linear_model.LinearRegression().fit(X, y)
# predictions are on the scale of y (the log sale price)
y_pred = clf.predict(X_test)
y_pred