# Kaggle House Prices

## Imports

In [8]:
# Data analysis
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

# Visualization
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# Machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

# Processing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

#Other
from collections import OrderedDict
import math
from ipywidgets import widgets

## Read the Data
Read the data using pandas read_csv

In [9]:
testData  = pd.read_csv("data/test.csv")
trainData = pd.read_csv("data/train.csv")

## General workflow goals
**Classifying** - We may want to classify or categorize our samples. We may also want to understand the implications or correlation of different classes with our solution goal.

**Correlating** - One can approach the problem based on available features within the training dataset. Which features within the dataset contribute significantly to our solution goal? Statistically speaking is there a correlation among a feature and solution goal? As the feature values change does the solution state change as well, and visa-versa? This can be tested both for numerical and categorical features in the given dataset. We may also want to determine correlation among features other than survival for subsequent goals and workflow stages. Correlating certain features may help in creating, completing, or correcting features.

**Converting** - For modeling stage, one needs to prepare the data. Depending on the choice of model algorithm one may require all features to be converted to numerical equivalent values. So for instance converting text categorical values to numeric values.

**Completing** - Data preparation may also require us to estimate any missing values within a feature. Model algorithms may work best when there are no missing values.

**Correcting** - We may also analyze the given training dataset for errors or possibly innacurate values within features and try to corrent these values or exclude the samples containing the errors. One way to do this is to detect any outliers among our samples or features. We may also completely discard a feature if it is not contribting to the analysis or may significantly skew the results.

**Creating** - Can we create new features based on an existing feature or a set of features, such that the new feature follows the correlation, conversion, completeness goals.

**Charting** - How to select the right visualization plots and charts depending on nature of the data and the solution goals.

## Initial investigation of data

In [None]:
print(trainData.columns.values)

In [None]:
print(trainData.head())

In [None]:
trainData.info()  

In [None]:
trainData.describe()

In [None]:
trainData.describe(include=['O']) #Only include object

In [None]:
trainData.describe(include=[np.number]) #Only include numbers

### Conclusion
- PoolQC has only 7 entries
- MiscFeature only has 54 entries
- Alley has only 91 entires 
- Fence only has 281 entries 
- FireplaceQu only has 770 entries

-> Guess this is because N/A has been selected

- Mean sell price is 180'921 $
- Most houses sold by middle of 2008
- Mean LotArea is 10516.828082 sqFeet


**Classifying** - See which features mostly correlates to sale price. Check early and compare with end result.

**Correlating** - One can approach the problem based on available features within the training dataset. Which features within the dataset contribute significantly to our solution goal? Statistically speaking is there a correlation among a feature and solution goal? As the feature values change does the solution state change as well, and visa-versa? This can be tested both for numerical and categorical features in the given dataset. We may also want to determine correlation among features other than survival for subsequent goals and workflow stages. Correlating certain features may help in creating, completing, or correcting features.

**Converting** - For modeling stage, one needs to prepare the data. Depending on the choice of model algorithm one may require all features to be converted to numerical equivalent values. So for instance converting text categorical values to numeric values.

**Completing** - Data preparation may also require us to estimate any missing values within a feature. Model algorithms may work best when there are no missing values.

**Correcting** - We may also analyze the given training dataset for errors or possibly innacurate values within features and try to corrent these values or exclude the samples containing the errors. One way to do this is to detect any outliers among our samples or features. We may also completely discard a feature if it is not contribting to the analysis or may significantly skew the results.

**Creating** - Can we create new features based on an existing feature or a set of features, such that the new feature follows the correlation, conversion, completeness goals.

**Charting** - How to select the right visualization plots and charts depending on nature of the data and the solution goals.

## Exploritary analysis

In [10]:
from scipy.stats import pearsonr

features = trainData.iloc[:,1:-1].columns.tolist()
target = trainData.iloc[:,-1].name

correlations = {}
for f in features:
    data_temp = trainData[[f,target]]
    if data_temp[f].dtype == 'object':
        data_temp[f] = pd.factorize(data_temp[f])[0] + 1
    x1 = data_temp[f].values
    x2 = data_temp[target].values
    key = f + ' vs ' + target
    correlations[key] = pearsonr(x1,x2)[0]

data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Value
OverallQual vs SalePrice,0.790982
GrLivArea vs SalePrice,0.708624
GarageCars vs SalePrice,0.640409
GarageArea vs SalePrice,0.623431
TotalBsmtSF vs SalePrice,0.613581
1stFlrSF vs SalePrice,0.605852
FullBath vs SalePrice,0.560664
TotRmsAbvGrd vs SalePrice,0.533723
YearBuilt vs SalePrice,0.522897
YearRemodAdd vs SalePrice,0.507101


In [None]:
y = trainData.loc[:,['OverallQual ','GrLivArea',target]].sort_values(target, ascending=True).values
x = np.arange(y.shape[0])

%matplotlib inline
plt.subplot(3,1,1)
plt.plot(x,y[:,0])
plt.title('Sqft and Grade vs Price')
plt.ylabel('Sqft')

plt.subplot(3,1,2)
plt.plot(x,y[:,1])
plt.ylabel('Grade')

plt.subplot(3,1,3)
plt.plot(x,y[:,2],'r')
plt.ylabel("Price")

plt.show()

## Pivot features

In [None]:
#Function to check the pivot of a perticular feature compared to sale price
def checkPivot(data, feuature):
    return data[[feuature, 'SalePrice']].groupby([feuature], as_index=False).mean().sort_values(by='SalePrice', ascending=False)

In [None]:
filterFeatures = False

#Check all categorical features to see which mostly correlates to a difference in sale price
def var(feature):
    data = checkPivot(trainData, feature)
    maxPrize = data['SalePrice'].max()
    minPrize = data['SalePrice'].min()
    return maxPrize/minPrize

varThreshold = 2

catData = trainData.select_dtypes(include='object')
featureList = [x for x in catData.columns]
varDict = {}

for f in featureList:
    variance = var(f)
    if not filterFeatures:
        varThreshold = 0
    if variance > varThreshold:
        varDict[f] = variance

importantFeatures = list(varDict)
feat = OrderedDict(sorted(varDict.items(), key=lambda x: x[1], reverse=True))
print(feat)
print(importantFeatures)

### Listing the pivot of the top 5 most corrolated features
1. ExterQual
2. Exterior1st
3. Neighborhood
4. Condition2
5. BsmtCond

In [None]:
# Check type of HouseStyle
checkPivot(trainData, 'ExterQual')

In [None]:
#Check MSSubClass
checkPivot(trainData, 'Exterior1st')

In [None]:
#Check Neighborhood
checkPivot(trainData, 'Neighborhood')

In [None]:
# Check pivit of having central ac vs not
checkPivot(trainData, 'Condition2')

In [None]:
# Check pivit of having a good pool
checkPivot(trainData, 'BsmtCond')

### Analyzing continous features

In [None]:
ax = sns.regplot(x="LotArea", y="SalePrice", data=trainData)

## Creating model


### Cleaning the data

In [24]:
# Remove NaN
meanYr = math.ceil(trainData["GarageYrBlt"].mean())
print("Mean: " , meanYr)

for i,isNull in enumerate(trainData.isnull().any()):
    if isNull:
        columnName = trainData.columns[i]
        mean = math.ceil(trainData.iloc[:,i].mean())
        print(mean)
        #trainData[].fillna( mean,inplace=True )

Mean:  1979
71


TypeError: unsupported operand type(s) for +: 'int' and 'str'

#### Data processing

In [None]:
#Pre processing of data
def processData(dataset, featureList, trainSize=0.8, isTraining=True, usePCA=False):
    
    # Replace null values
    trainData["GarageYrBlt"].fillna(meanYr,inplace=True)
    
    # Split up data set
    X_train, X_test, Y_train, Y_test = train_test_split(dataset[featureList], dataset["SalePrice"], test_size=0.2, random_state=42)
    
    # Convert to numerical values
    for f in featureList:
        X_train[f] = X_train[f].astype('category')
        X_test[f]  = X_test[f].astype('category')
    cat_columns = X_train.select_dtypes(['category']).columns
    X_train[cat_columns] = X_train[cat_columns].apply(lambda x: x.cat.codes)
    X_test[cat_columns]  = X_test[cat_columns].apply(lambda x: x.cat.codes)

    # Scale the data
    X_train = StandardScaler().fit_transform(X_train)
    X_test  = StandardScaler().fit_transform(X_test)
    
    # Return Principel components of data if desired
    if usePCA:
        pca = PCA(0.95)
        pca.fit(X_train)

        # Apply PCA
        X_train = pca.transform(X_train)
        X_test = pca.transform(X_test)
        
        print("Components used for PCA: ", pca.n_components_)
    
    
    return X_train, X_test, Y_train, Y_test

# # Use Principle Component Analysis
# 
# 
# cpts = pd.DataFrame(pca.transform(X_train))

# pcaList = pca.explained_variance_ratio_

# fig, ax = plt.subplots()
# ax.set_xlabel('Pricipal components')
# ax.set_ylabel('Percentage of variance')
# ax.set_title('PCA', fontsize=20)

# validPCAs = len(pcaList)
# validPCAs = 20
# bar_width = 0.35

# index = np.arange(validPCAs)

# ax.set_xticks(index + bar_width)
# ax.set_xticklabels(index)

# for i in range(validPCAs):
#     ax.bar(i, pcaList[i], bar_width)

# fig.tight_layout()
# plt.show()

# pca = PCA(n_components=2)
# principalComp = pca.fit_transform(X_train)
# PCdf = pd.DataFrame(data=principalComp, 


#### Test models

In [None]:
def randomForestModel(X_train, Y_train):
        
    regr = RandomForestRegressor(oob_score=True, random_state=random_state, n_estimators=n_estimators)
    regr.fit(X_train, Y_train)
    
    return regr


In [None]:
def logisticRegressionModel(X_train, y_train):
    
    logisticRegr = LogisticRegression(solver = 'lbfgs')
    logisticRegr.fit(X_train, y_train)
    
    return logisticRegr

In [None]:
#featureList = ["ExterCond", "Functional", "BsmtCond", "GarageType", "Condition2", "GarageQual"]

X_train, X_test, y_train, y_test = processData(trainData, importantFeatures, isTraining = True)
regr = randomForestModel(X_train, y_train)

trainScore = round(regr.score(X_train, y_train) * 100, 2)
testScore = round(regr.score(X_test, y_test) * 100, 2)

print("Train score: ", trainScore, " Test Score: ", testScore)

predicted_train = regr.predict(X_train)
predicted_test = regr.predict(X_test)

test_score = r2_score(y_test, predicted_test)
spearman = spearmanr(y_test, predicted_test)
pearson = pearsonr(y_test, predicted_test)

print(f'Out-of-bag R-2 score estimate: {regr.oob_score_:>5.3}')
print(f'Test data R-2 score: {test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')

In [None]:
X_train, X_test, y_train, y_test = processData(trainData, importantFeatures, isTraining = True, usePCA=True)
regr = logisticRegressionModel(X_train, y_train)

trainScore = round(regr.score(X_train, y_train) * 100, 2)
testScore = round(regr.score(X_test, y_test) * 100, 2)

print("Train score: ", trainScore, " Test Score: ", testScore)

predicted_train = regr.predict(X_train)
predicted_test = regr.predict(X_test)

test_score = r2_score(y_test, predicted_test)
spearman = spearmanr(y_test, predicted_test)
pearson = pearsonr(y_test, predicted_test)

print(f'Test data R-2 score: {test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')

## Measuring performance
precision, recall, F1 Score, ROC Curve, etc would be better than simple score