In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

In [2]:
data = pd.read_csv('../train.csv')
testData = pd.read_csv('../test.csv')


## Here are our corr matrix for feature selection

In [None]:
#visual correlation matrix
f = plt.figure(figsize=(25,25))
plt.matshow(data.corr(), fignum=f.number)
cb = plt.colorbar()

#correlation matrix to csv
c = data.corr().abs()
s = c.unstack()
so = s.sort_values(kind="quicksort")
so.to_csv('./corr.csv')

#corr matrix of fin chosen set
c = fin_set.corr().abs()
s = c.unstack()
so = s.sort_values(kind="quicksort")
so.to_csv('./finalSetcorr.csv')

In [None]:
#percentage of missing per column
print((data.count() / len(data)) *100)

#total missing columns
print(data.isna().sum())

#where there is no full bath above ground, 
a = np.where(df['FullBath'].values == 0)
print(data.loc[a])

In [None]:
#Jason function for scatterplots
sns.set()
cols = ['WoodDeckSF',
'OpenPorchSF',
'EnclosedPorch',
'3SsnPorch',
'ScreenPorch',
'PoolArea']
sns.pairplot(df[cols], size = 2.5)
plt.show();
plt.savefig('pairplot.png',bbox_inches='tight')

### Creates visualization for distributions

In [None]:
#show distribution of set
sets = ['LotShape',
'Neighborhood',
'HouseStyle',
'Heating',
'CentralAir',
'KitchenQual',
'Functional',
'GarageType',
'GarageQual',
'Fence',
'SaleType',
'SaleCondition']

#Jaye function to count unique values
for c in data.columns[:]:
    if c in sets:
        print('\nCOLUMN: "{}"\n=============================='.format(c))
        print(data[c].value_counts(dropna=False).to_string())
        # only include above columns categorical
        continue

## Features we have chosen to use after feature selection

In [42]:
#variables we chose
chosen = ['LotFrontage','LotArea','LotShape','HouseStyle',
'OverallQual','OverallCond','YearBuilt','YearRemodAdd', 'CentralAir',
'MasVnrArea','1stFlrSF','2ndFlrSF','GrLivArea','Heating','Neighborhood',
'BsmtFullBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
'KitchenQual','Fireplaces','GarageType','GarageCars',
'WoodDeckSF','OpenPorchSF','PoolArea', 'Fence','SaleCondition']

#create dataset of chosen
fin_set = data[chosen].copy()
fin_test = testData[chosen].copy()

In [35]:
fin_set.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,Fireplaces,GarageCars,WoodDeckSF,OpenPorchSF,PoolArea
count,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,1162.626712,346.992466,1515.463699,0.425342,1.565068,0.382877,2.866438,1.046575,0.613014,1.767123,94.244521,46.660274,2.758904
std,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,386.587738,436.528436,525.480383,0.518911,0.550916,0.502885,0.815778,0.220338,0.644666,0.747315,125.338794,66.256028,40.177307
min,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,334.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,882.0,0.0,1129.5,0.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0
50%,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,1087.0,0.0,1464.0,0.0,2.0,0.0,3.0,1.0,1.0,2.0,0.0,25.0,0.0
75%,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,1391.25,728.0,1776.75,1.0,2.0,1.0,3.0,1.0,1.0,2.0,168.0,68.0,0.0
max,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,4692.0,2065.0,5642.0,3.0,3.0,2.0,8.0,3.0,3.0,4.0,857.0,547.0,738.0


### Clean data further, log transformation and feature engineering

In [43]:
fin_set['PoolArea'] = fin_set['PoolArea'].apply(lambda x: 1 if x>0 else 0)
fin_set['Fireplaces'] = fin_set['Fireplaces'].apply(lambda x: 'y' if x>0 else 'n')
#fin_set['LotArea','MasVnrArea','2ndFlrSF','OpenPorchSF']

## Num_Cleaner

Takes in a dataframe and returns only int and floats with na's imputed with mean value

In [4]:
#find numeric columns in given df,replace missing rows with column mean, return dataframe
def num_cleaner (df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cl_df = pd.DataFrame()

    df2 = df.select_dtypes(include=numerics).copy() #only numeric columns
    nullCols = df2.isnull() #df of T/F, T if null (bool mask)
    means = df2.mean() #series of mean per column

    for c in df2:
        for r in range(0,len(df2)-1):
            if df2[c].iloc[r] and nullCols[c].iloc[r]:
                df2[c].iloc[r] = means[c]

    cl_df = df2.copy() #this will return only numeric features, need to add back type ob features after cleaning/dummify them
    return cl_df

## Dummify dataframe
Find object types and dummify the columns

In [5]:
#dummify object columns and return dataframe
def mass_dummify (df):
    df2 = df.select_dtypes(include=object).copy()
    dum_df = pd.get_dummies(df2) #currently 61 columns, no neighborhoods
    return dum_df

## Lasso Regression Visualization

Given many lambdas (alphas), graph an elastic net function with amount given. 

Only 10 colors available on graph

In [None]:
#lasso reg vizualization for many lamdas
def lassoReg (alphas, df, target):

    lasso  = Lasso()
    #alphas = np.linspace(0.1,20.1,20)
    lasso.set_params(normalize=False)
    coefs_lasso  = []

    for alpha in alphas:
        lasso.set_params(alpha=alpha)
        lasso.fit(df, target)  
        coefs_lasso.append(lasso.coef_)

    coefs_lasso = pd.DataFrame(coefs_lasso, index = alphas, columns = df.columns)  
    #coefs_lasso.head()
    
    plt.rcParams['figure.figsize'] = (20,20)
    for name in coefs_lasso.columns:
        plt.plot(coefs_lasso.index, coefs_lasso[name], label=name)
        
        
    plt.xlabel(r'hyperparameter $\lambda$')
    plt.ylabel(r'slope values')
    plt.legend(loc=1)

## Elastic Net Visualization
Given many lambdas (alphas), graph an elastic net function with amount given. 

rho = 1 lasso regression, rho = 0 ridge regression.

Only 10 colors available on graph

In [None]:
#elastisc net vizualization graph for many lambdas
def elasticNet (alphas, df, target, rho):

    elasticnet = ElasticNet()
    elasticnet.set_params(normalize=False)
    coefs_model  = []
    #run through many alphas(lambdas)
    for alpha in alphas:
        elasticnet.set_params(alpha=alpha, l1_ratio= rho)
        elasticnet.fit(df, target)  
        coefs_model.append(elasticnet.coef_)

    coefs_model = pd.DataFrame(coefs_model, index = alphas, columns = df.columns)  
    
    #plot all the figures for different lambda
    plt.rcParams['figure.figsize'] = (20,20)
    for name in coefs_model.columns:
        plt.plot(coefs_model.index, coefs_model[name], label=name)
            
    plt.xlabel(r'hyperparameter $\lambda$')
    plt.ylabel(r'slope values')
    plt.legend(loc=1)

In [None]:
#lambda
alphas = np.linspace(0.1,1e4,20)
#target
prices = pd.Series(data.SalePrice)
#only num & no NA
cleaned = num_cleaner(fin_set).copy()

#elasticNet visualization for multiple lambda
elasticNet(alphas,cleaned.iloc[:,:9],prices,1) #1 for lasso, 0 for ridge, mixture is elastic net

## Outputs an elastic net model that has been fitted to dataframe and target
rho = 1 is lasso regressoion, rho=0 is ridge regression

In [None]:
#returns elasticNet based on one lambda
def elasticNetModel (alpha, df, target, rho):
    
    elasticnet = ElasticNet()
    elasticnet.set_params(normalize=False,l1ratio = rho)
    model = elasticnet.fit(df,target)
    return model

### Clean code using functions

In [44]:
num_clean = num_cleaner(fin_set).copy()
ob_clean = mass_dummify(fin_set).copy()
#combine num & ob dataset for Train
cl_df = pd.concat([num_clean,ob_clean],axis=1)

df1 = num_cleaner(fin_test)
df2 = mass_dummify(fin_test)
#combine Test sets
cl_test = pd.concat([df1,df2],axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
X = cl_df.copy()
y = data.SalePrice
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.3)

## Multi-linear regression model

In [45]:
#multi-linear regression
X_add_const = sm.add_constant(X_train)
ols = sm.OLS(Y_train, X_add_const)
model = ols.fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.876
Model:                            OLS   Adj. R-squared:                  0.865
Method:                 Least Squares   F-statistic:                     85.20
Date:                Sun, 01 Mar 2020   Prob (F-statistic):               0.00
Time:                        17:29:07   Log-Likelihood:                -11944.
No. Observations:                1022   AIC:                         2.405e+04
Df Residuals:                     943   BIC:                         2.444e+04
Df Model:                          78                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                  -1.27e+

  return ptp(axis=axis, out=out, **kwargs)


## Grid search CV for finding best lambda

In [None]:
#trying to get best lambda
target = pd.Series(data.SalePrice)
params = {'alpha':np.linspace(1e-6,1000,100)}
lasso_regressor = GridSearchCV(Lasso, params, scoring = 'neg_mean_squared_error',cv=5 )
lasso_regressor.fit(cl_df,target)

## Create elastic net model and print values

In [46]:
netModel = ElasticNet(alpha = 1e-5,normalize=True,l1_ratio = .5)
netModel.fit(X_train,Y_train)
netPredict = netModel.predict(cl_df)
#mean_squared_error(Y_test,netModel.predict(X_test))
print(r'The train R^2 is %.4f' %(netModel.score(X_train, Y_train)))
print(r'The test R^2 is %.4f' %(netModel.score(X_test, Y_test)))
print(r'The MSE is %.2f' %mean_squared_error(Y_test,netModel.predict(X_test)))

The train R^2 is 0.8757
The test R^2 is 0.7834
The MSE is 1180092423.34


  positive)


In [33]:
predict = np.delete(netPredict, (-1), axis=0)
predict.size

1459

In [30]:
# save predictions to csv with two columns:  Id, SalePrice
predictionDF = pd.DataFrame()
predictionDF['Id'] = range(1461,2920)

# predictions of SalePrice are floating point numbers
# here we round the floats to the nearest integer SalePrice dollar
predictions = predict.round(0).astype('int64')
predictionDF['SalePrice'] = predictions
predictionDF.to_csv('./net_predictions.csv')

Unnamed: 0,Id,SalePrice
0,1461,196140
1,1462,216705
2,1463,201849
3,1464,303252
4,1465,161735
...,...,...
1454,2915,180817
1455,2916,237694
1456,2917,252187
1457,2918,130283


## Create lasso regression model and print values

In [49]:
lasso = ElasticNet(alpha=1e-5,normalize=True,l1_ratio=1)
lasso.fit(X_train,Y_train)
mean_squared_error(Y_test,lasso.predict(X_test))
print(r'The train R^2 is %.4f' %(lasso.score(X_train, Y_train)))
print(r'The test R^2 is %.4f' %(lasso.score(X_test, Y_test)))
print(r'The MSE is %.2f' %mean_squared_error(Y_test,lasso.predict(X_test)))

The train R^2 is 0.8757
The test R^2 is 0.7814
The MSE is 1190787359.62


  positive)


## Create ridge regression model and print values

In [48]:
ridge = ElasticNet(alpha=1e-5,normalize=False,l1_ratio=0)
ridge.fit(X_train,Y_train)
mean_squared_error(Y_test,ridge.predict(X_test))
print(r'The train R^2 is %.4f' %(ridge.score(X_train, Y_train)))
print(r'The test R^2 is %.4f' %(ridge.score(X_test, Y_test)))
print(r'The MSE is %.2f' %mean_squared_error(Y_test,ridge.predict(X_test)))

The train R^2 is 0.8757
The test R^2 is 0.7815
The MSE is 1190215512.88


  positive)
