In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from scipy import stats
from scipy.stats import mode
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

In [2]:
# Shared label encoder, reused (and re-fitted) by the feature-building cells below.
enc = LabelEncoder()

# Read the housing CSV; `data` is the full table and `dataDropped` keeps
# only the rows that have no missing values.
dataFilePath = "housing_filled.csv"
dataFile = pd.read_csv(dataFilePath)
data = pd.DataFrame(dataFile)
dataDropped = data.dropna()

# Logistic Regression

In [3]:
# Binary target: 1 when OverallQual is above 5, otherwise 0.
logit_y = (dataDropped['OverallQual'] > 5).map({True:1, False:0})

# Tip: dataDropped.corr()['OverallQual'] lists each numeric column's
# correlation with the target.

# First feature: whether the house sold for more than 140000.
logitdata = pd.DataFrame(dataDropped['SalePrice'] > 140000)

# Columns copied over unchanged, in the order the model sees them.
for passthrough_col in ['GarageCars', 'GrLivArea', 'FullBath', '1stFlrSF',
                        'TotalBsmtSF', 'GarageYrBlt']:
    logitdata[passthrough_col] = dataDropped[passthrough_col]

# Remodel year expressed as whole decades since 1900.
remodel_decade = (dataDropped['YearRemodAdd'] - 1900) / 10
logitdata['YearRemodAdd'] = remodel_decade.astype(int)

logitdata['GarageArea'] = dataDropped['GarageArea']

# Categorical columns: integer-encode the garage finish, and flag
# cinder-block foundations and good/excellent exterior quality.
logitdata['GarageFinish'] = enc.fit_transform(dataDropped['GarageFinish'])
logitdata['CBlock'] = dataDropped['Foundation'] == 'CBlock'
logitdata['GoodExQual'] = dataDropped['ExterQual'].isin(['Ex', 'Gd'])

In [4]:
# Evaluate logistic regression against a majority-class baseline on five fixed
# train/test splits, printing the relative accuracy improvement for each split.
# `logit_sum` accumulates the improvements so an average can be taken afterwards.
logit_sum = 0
for rand_state in [4444,912,5702,3502,1959]:
    logit_X_train, logit_X_test, logit_y_train, logit_y_test = train_test_split(
        logitdata, logit_y, test_size=0.3, random_state=rand_state)

    # baseline: always predict the most frequent class seen in the training labels.
    # Series.mode must be *called*; bool() of the bound method itself is always
    # True, which would hard-wire the baseline to class 1 regardless of the data.
    logit_majority_class = logit_y_train.mode()[0]
    logit_baseline = np.full(len(logit_y_test), logit_majority_class)

    # model
    logit = LogisticRegression()
    logit.fit(logit_X_train, logit_y_train)
    logit_pred = logit.predict(logit_X_test)

    # accuracy, and improvement of the model relative to the baseline
    logit_baseline_score = accuracy_score(y_true = logit_y_test, y_pred = logit_baseline)
    logit_model_score = accuracy_score(y_true = logit_y_test, y_pred = logit_pred)
    logit_improvement = (logit_model_score - logit_baseline_score) / logit_baseline_score
    logit_sum += logit_improvement
    print('random_state: ' + str(rand_state))
    print('improvement: ' + str(logit_improvement))
    print('\n\n')

random_state: 4444
improvement: 0.433734939759



random_state: 912
improvement: 0.435483870968



random_state: 5702
improvement: 0.429166666667



random_state: 3502
improvement: 0.424



random_state: 1959
improvement: 0.423387096774





## Explanation

We first found the correlation between OverallQual and each feature and tried forming a model using the features with the greatest correlation with OverallQual. We then tried several features that were listed as categorical data. In particular, we focused on features with the word "finish" or "quality" in their descriptions. We did not need to change the threshold, as the threshold with the best results was generally around 0.5.

# Decision Tree

In [5]:
# Target for the decision tree: the neighborhood of each house.
tree_y = dataDropped['Neighborhood']

# Predictor columns, in the order they are presented to the classifier.
tree_feature_names = ['HouseStyle', 'BldgType', 'LandSlope', 'Street',
                      'YearBuilt', 'Fence', 'SalePrice']
treedata = dataDropped[tree_feature_names].copy()

# Replace every column (numeric ones included) with integer label codes.
for i in treedata.columns:
    encoded_column = enc.fit_transform(treedata[i])
    treedata[i] = encoded_column

In [6]:
# Run 500 random train/test splits (20% train / 80% test, as given by
# test_size=0.8) and accumulate in `tree_sum` the decision tree's relative
# accuracy improvement over a "most common neighborhood" baseline.
# range(500) yields exactly 500 trials, matching the divisor used when the
# mean is printed; range(1, 500) would only run 499.
tree_sum = 0
for i in range(500):
    tree_X_train, tree_X_test, tree_y_train, tree_y_test = train_test_split(treedata, tree_y, test_size=0.8)

    #baseline: predict the most frequent training neighborhood for every test row
    neighborhood_frequencies = tree_y_train.value_counts().index
    neighborhood_mode = neighborhood_frequencies[0]
    tree_baseline = np.full(len(tree_y_test), neighborhood_mode, dtype=object)

    #model
    tree_classifier = DecisionTreeClassifier(max_leaf_nodes=15)
    tree_classifier.fit(tree_X_train, tree_y_train)
    tree_pred = tree_classifier.predict(tree_X_test)

    # measure accuracy
    tree_baseline_score = accuracy_score(y_true = tree_y_test, y_pred = tree_baseline)
    tree_model_score = accuracy_score(y_true = tree_y_test, y_pred = tree_pred)
    tree_improvement = (tree_model_score - tree_baseline_score) / tree_baseline_score
    tree_sum += tree_improvement

# Export the tree from the final trial once, instead of rewriting the same
# file on every iteration; the file left on disk is the same (the last tree).
with open("decisiontree.txt", 'w') as f:
    export_graphviz(tree_classifier, out_file=f, feature_names=list(treedata))

In [7]:
# Mean relative improvement of the tree over the baseline. The divisor is
# hard-coded and must equal the number of trials accumulated into tree_sum.
tree_mean_improvement = tree_sum / 500
print(tree_mean_improvement)

1.58203046675


## Explanation

We first tried features that are directly related to geography, such as LandSlope and Street. We then tried features that might be trends in some neighborhoods, such as Fence. Lastly, we tried features that might be determined by construction -- that is, we used the fact that neighborhoods are often constructed all at once, with some uniformity in house type -- such as HouseStyle, BldgType, YearBuilt, and SalePrice.

# Support Vector Machine

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
from scipy import stats
from scipy.stats import mode
from sklearn.metrics import roc_curve, auc

In [5]:
from sklearn.svm import SVC

In [8]:
# Reload the housing table for the SVM section and recreate the shared
# label encoder. `dataDropped` is the table without rows containing NaN.
dataFilePath = "housing_filled.csv"
dataFile = pd.read_csv(dataFilePath)
data = pd.DataFrame(dataFile)
dataDropped = data.dropna()

enc = LabelEncoder()

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Numeric predictors for the SVM, taken from the full `data` table
# (not the NaN-dropped copy).
svm_feature_names = ['LotArea', 'OverallQual', 'YearRemodAdd', '1stFlrSF',
                     '2ndFlrSF', 'LowQualFinSF', 'BsmtUnfSF']
X_svm = data[svm_feature_names].copy()

# Target: the house style label.
y_svm = data['HouseStyle']

# Hold out 30% of the rows for testing.
svm_split = train_test_split(X_svm, y_svm, test_size=0.30)
X_svm_train, X_svm_test, y_svm_train, y_svm_test = svm_split

In [11]:
# Baseline

# Labels ordered from most to least frequent in the training split.
houseStyle_frequencies = y_svm_train.value_counts(0).index
houseStyle_mode = houseStyle_frequencies[0]

# Column vector (one row per test sample) that always predicts the most
# common training house style.
svm_baseline = np.full([len(y_svm_test), 1], houseStyle_mode, dtype=object)

In [12]:
# RBF-kernel support vector classifier; gamma is deliberately tiny (1e-10).
svm = SVC(
    C=100,
    kernel='rbf',
    gamma=1e-10,
)

In [13]:
# Train the classifier on the training split; fit() returns the estimator,
# so its parameter summary is displayed as this cell's output.
svm.fit(X_svm_train, y_svm_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1e-10, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
# Predict a HouseStyle label for every row of the held-out test split.
predictions = svm.predict(X_svm_test)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
# Per-class precision / recall / F1 for the test-split predictions.
# Uncomment the next line to also print the raw confusion matrix.
#print(confusion_matrix(y_svm_test, predictions))

print(classification_report(y_svm_test, predictions))

             precision    recall  f1-score   support

     1.5Fin       0.50      0.02      0.04        54
     1.5Unf       0.00      0.00      0.00         5
     1Story       0.86      1.00      0.92       221
     2.5Fin       0.00      0.00      0.00         2
     2.5Unf       0.00      0.00      0.00         2
     2Story       0.71      1.00      0.83       129
     SFoyer       0.00      0.00      0.00         8
       SLvl       0.00      0.00      0.00        17

avg / total       0.71      0.80      0.72       438



  'precision', 'predicted', average, warn_for)


In [17]:
# Accuracy of the fitted SVM on the held-out test split
# (last expression of the cell, so the value is displayed as its output).
svm.score(X_svm_test,y_svm_test)

0.79908675799086759

## Explanation

We constrained our features to numerical ones. We focused on features related to the size or quality of the house, since the size would relate to the number of stories in the house and the quality of the house would potentially relate to whether the house was finished. Then, after deciding on features, we adjusted the parameters, in particular the gamma parameter, until we obtained more than 75% baseline accuracy.