In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("./Housing.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
def find_unique(df):
    """Print the distinct values of every object-dtype column of ``df``.

    For each column whose dtype compares equal to ``object`` this prints a
    header line containing the column's ``nunique()`` count, followed by the
    array returned by ``unique()``. Columns of any other dtype are skipped.
    Nothing is returned.
    """
    is_text = df.dtypes == object
    # Boolean-mask the dtype series with itself to keep only the text columns,
    # preserving their original column order.
    for col_name in is_text[is_text].index:
        distinct_count = df[col_name].nunique()
        print("\nthere are " + str(distinct_count) +" different types in the column "+ col_name +":\n")
        print(df[col_name].unique())

In [None]:
find_unique(df)

In [None]:
#eliminate the non residential source
df = df[df.SOURCE == 'Residential']

In [None]:
df.isna().sum()

### Based on the missing values, let's eliminate some columns to help fill in the dataset
- get rid of cmplx_num and living_gba
  - the number on the building probably doesn't hold much predictive value, and we have more values in gba as well
- get rid of the number of units - we have no missing values in rooms, bedrooms, bathrooms, or kitchens, so this should be fine
- get rid of full address, city, state - none of these matter in prediction
- get rid of AYB - this information is better captured in EYB, which has no missing values
- get rid of sale number and building number
- get rid of national grid, longitude and latitude

In [None]:
df.drop("CMPLX_NUM", inplace=True, axis=1)

In [None]:
df.drop(["NUM_UNITS","AYB","SALEDATE","Unnamed: 0"], inplace=True, axis=1)

In [None]:
df.drop(["SALE_NUM","BLDG_NUM","LIVING_GBA","FULLADDRESS","CITY", "STATE","NATIONALGRID","X","Y","SOURCE","STORIES"], inplace=True, axis=1)

In [None]:
#lets also get rid of census block, that should be convered well enough by census tract
#roof, exterior wall, interior wall, have too many values missing to really be helpful
# honestly how many houses will have more than one kitchen
# more than half of the year remodel missing won't be a helpful predictor
df.drop(["EXTWALL","ROOF","INTWALL","KITCHENS","CENSUS_BLOCK", "YR_RMDL" ], inplace=True, axis=1)
df.isna().sum()

In [None]:
#subneighborhood seems unimportant and has several missing values
#stories and quadrant are missing values and seem to be small enough to eliminate but still have a
#good amount of data

df.drop(["ASSESSMENT_SUBNBHD", "QUADRANT"], inplace=True, axis=1)

# lets get rid of the entries where we only have one value
# only keep the rows if zipcode is not null
df = df[df.ZIPCODE.notnull()]

In [None]:
df.isna().sum()

## Price is the most important data we want to fill in
### Find a similar spread in another graph and match based on bins or quantiles
### We should also be able to eliminate other less useful pieces of information, like fireplaces

# Part A: Cleaning the data

## Implement a sensible approach for dealing with missing data
### After that
- change all the values in the dataset into numerical values
- substitute dummy columns for categorical variables
- eliminate any columns that may not be relevant to the objectives

In [None]:
df.drop(["QUALIFIED", "HEAT","USECODE","LATITUDE","LONGITUDE","CENSUS_TRACT"], inplace=True, axis=1)

In [None]:
df["PRICE"].plot()

In [None]:
# Add a combined room count: full baths + half baths + rooms + bedrooms.
# Plain column arithmetic produces the same per-row sums as the previous
# row-wise apply, without calling a Python lambda once per row.
df['ALL_RM'] = df['BATHRM'] + df['HF_BATHRM'] + df['ROOMS'] + df['BEDRM']

In [None]:
df.plot(x='ROOMS', y='PRICE', style='o')

In [None]:
df.plot(x='ALL_RM', y='PRICE', style='o')
#that results in the same distribution, so lets get rid of that

In [None]:
df.plot(x='EYB', y='PRICE', style='o')

In [None]:
#Lets look at ward - it should have 8 categories, perhaps we can bin that and get an average price
# trim all the values 
df["WARD"] = df["WARD"].str[-1]

In [None]:
df['WARD'] = df['WARD'].astype(int)

In [None]:
w_df = df['WARD'].value_counts()
print(w_df)
w_options = df['WARD'].value_counts().keys().to_list()
w_count = df['WARD'].value_counts().to_list()
print(w_options)
print(w_count)

In [None]:
plt.bar(w_options, w_count, width = 0.3, color="blue")
plt.ylabel('Number of houses')
plt.xlabel('WARD')
plt.show()

In [None]:
test = df.groupby('WARD', as_index=False)['PRICE'].mean()
test.head()

In [None]:
test.plot(x="WARD", y="PRICE", style = 'o')

In [None]:
df.plot(x='GBA', y='PRICE', style='o')

In [None]:
df.plot(x='FIREPLACES', y='PRICE', style='o')

In [None]:
z_df = df['ZIPCODE'].value_counts()
print(z_df)
z_options = df['ZIPCODE'].value_counts().keys().to_list()
z_count = df['ZIPCODE'].value_counts().to_list()
print(z_options)
print(z_count)

In [None]:
print(len(z_options))

In [None]:
df.plot(x='ZIPCODE', y='PRICE', style='o')

In [None]:
test = df.groupby('ZIPCODE', as_index=False)['PRICE'].mean()
test.head()

In [None]:
df.plot(x='ROOMS', y='BEDRM', style='o')

In [None]:
df.plot(x='ROOMS', y='ALL_RM', style='o')

In [None]:
df.plot(x='ROOMS', y='BATHRM', style='o')

In [None]:
df.plot(x='ROOMS', y='HF_BATHRM', style='o')

In [None]:
df.plot(x='HF_BATHRM', y='PRICE', style='o')

In [None]:
ac_df = df['AC'].value_counts()
print(ac_df)
ac_options = df['AC'].value_counts().keys().to_list()
ac_count = df['AC'].value_counts().to_list()
print(ac_options)
print(ac_count)

plt.bar(ac_options, ac_count, width = 0.3, color="blue")
plt.ylabel('Number of houses')
plt.xlabel('AC')
plt.show()

In [None]:
ac = df.groupby('AC', as_index=False)['PRICE'].mean()
ac.head()

ac.plot(x="AC", y="PRICE", style = 'o')

In [None]:
c_df = df['CNDTN'].value_counts()
print(c_df)
c_options = df['CNDTN'].value_counts().keys().to_list()
c_count = df['CNDTN'].value_counts().to_list()
print(c_options)
print(c_count)

plt.bar(c_options, c_count, width = 0.3, color="blue")
plt.ylabel('Number of houses')
plt.xlabel('CNDTN')
plt.show()

In [None]:
c = df.groupby('CNDTN', as_index=False)['PRICE'].mean()

c.plot(x="CNDTN", y="PRICE", style = 'o')

# Lets fill in price based on the average from the ward
## there are 8 wards and we can match up the average from that

In [None]:
# Build the [ward, mean price] lookup directly from the WARD grouping.
# `test` was last reassigned to the per-ZIPCODE means, so reusing it here
# would pair zip codes (not wards) with prices and no ward would ever match.
ward_op = df.groupby('WARD', as_index=False)['PRICE'].mean().values.tolist()

In [None]:
print(ward_op)

In [None]:
print(ward_op[0][0])
print(ward_op[0][1])
print(len(ward_op))

In [None]:
def ward_price_lookup(cols, lookup=None):
    """Return the row's price, substituting the ward's mean price when missing.

    Parameters
    ----------
    cols : sequence-like (e.g. a row produced by ``DataFrame.apply(axis=1)``)
        First element is the ward, second element is the price.
    lookup : list of ``[ward, mean_price]`` pairs, optional
        Table to fill from. When omitted, the notebook-level ``ward_op`` list
        is used, exactly as before.

    Returns
    -------
    The original price when it is not null. Otherwise the mean price of the
    first table entry whose ward equals the row's ward. A ward that matches no
    entry falls back to the table's last entry, mirroring the final ``else``
    of the previous hard-coded eight-way chain.
    """
    # Read the two fields by position. Integer keys on a label-indexed Series
    # are deprecated in pandas, so prefer .iloc when the row offers it and
    # fall back to plain indexing for lists/tuples.
    if hasattr(cols, 'iloc'):
        ward, price = cols.iloc[0], cols.iloc[1]
    else:
        ward, price = cols[0], cols[1]

    if not pd.isnull(price):
        return price

    table = ward_op if lookup is None else lookup
    for entry in table:
        if ward == entry[0]:
            return entry[1]
    # No ward matched: keep the historical behaviour of using the last entry.
    return table[-1][1]

In [None]:
df['PRICE'] = df[['WARD','PRICE']].apply(ward_price_lookup,axis=1)

In [None]:
df.isna().sum()

## Now that we have all the price values filled in, lets delete other irrelevant columns and convert everything to numerical values and create dummies for the categorical variables

In [None]:
df.info()

### Ward and zipcode should cover the general area, so get rid of nbhd and square
### ALL_RM was a variable I made, so delete it - there was a linear correlation between the number of rooms and all rooms, so eliminate everything but rooms
### Structure has fewer options and should be able to fill in for style
### There are fewer conditions than grades, so eliminate grade
### EYB should cover what the last mod does
### Fireplaces does not have much correlation or relevance
### GBA should be enough to go off of, so eliminate land area
### AC is unhelpful since 0 means nothing

In [None]:
df.drop(['ASSESSMENT_NBHD','SQUARE', 'ALL_RM','STYLE','GIS_LAST_MOD_DTTM', 'FIREPLACES','ZIPCODE','LANDAREA','AC'], inplace=True, axis=1)

In [None]:
#keep only bathrooms and rooms since there is a pretty linear relationship with both
df.drop(['HF_BATHRM', 'BEDRM','BATHRM'], inplace=True, axis=1)

In [None]:
df.drop(['CNDTN'], inplace=True, axis=1)
df.info()

In [None]:
## in the end we have kept bathrooms bedrooms EYB price GBA struct condition and ward

In [None]:
objectColumns = df.dtypes[df.dtypes == object]
numberColumns = df.dtypes[df.dtypes != object]
text_to_num = list(objectColumns.index)

In [None]:
X = pd.get_dummies(df, columns=text_to_num, drop_first=True) 
X = X.astype('int')
print(X.shape)

# Part B: Dimensionality Reduction

## Use SVD and PCA
- in each case show eigenvectors and eigenvalues




In [None]:
X.drop(['PRICE'], inplace=True, axis=1)
X = X.T
y = df['PRICE']
y = y.astype('int')
print(X.shape)
print(y.shape)

Using PCA:

In [None]:
# Scatter matrix of the (features x samples) data.
# NOTE(review): X is not mean-centred before this product, so this is the raw
# second-moment matrix rather than a covariance matrix - confirm that is intended.
C = np.dot(X, X.T)
# C is symmetric, so use eigh: it returns real eigenvalues in ascending order.
# np.linalg.eig gives no ordering guarantee, which would make the cumulative
# curve in the following cells meaningless as a scree curve.
v, w = np.linalg.eigh(np.asarray(C, dtype=float))
# Reorder to descending eigenvalue, keeping eigenvector columns aligned.
order = np.argsort(v)[::-1]
v, w = v[order], w[:, order]
print('Eigen-values: v=\n', v,'\n')
print('Eigen-vectors: w=\n', w)

In [None]:
print(X.shape)
print(w.shape)

In [None]:
sv = np.cumsum(v)/sum(v)

In [None]:
#sv = np.insert(sv, 0, 0)
plt.step(list(range(len(sv))), sv)
plt.show()
print('sv =', sv)

Use SVD

In [None]:
U, s, VT = np.linalg.svd(X, full_matrices=False)
print('Eigen-vectors: U=\n', U, U.shape,'\n')
print('Eigen-values: s=', s, s.shape)
print('Eigen-vectors: VT=', VT.shape,'\n')

In [None]:
# Cumulative share of variance explained by the leading components.
# The variance along a component is proportional to the SQUARED singular
# value (the eigenvalues of X X^T are s**2), so square before accumulating;
# accumulating raw singular values understates how fast the curve saturates.
sv2 = np.cumsum(s**2)/np.sum(s**2)
plt.step(list(range(len(sv2))), sv2)
plt.show()
print('sv2 =', sv2)

## Show Scree Plot

### Answer the question
- How many dimensions of the dataset should be used to retain over 90% of the data variance?

PCA converges around 4
SVD converges at 2

# Part C
## Divide dataset
- Use Linear regression model to predict

In [None]:
#Linear Regression
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [None]:
X = pd.get_dummies(df, columns=text_to_num, drop_first=True) 
X = X.astype('int')
X.drop(['PRICE'], inplace=True, axis=1)
y = df['PRICE']
y = y.astype('int')
print(X.shape)
print(y.shape)
X.head()

In [None]:
from sklearn.model_selection import train_test_split
# StandardScaler was used below without ever being imported (NameError on a
# fresh kernel), so bring it into scope here.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row before the split, so
# test-set statistics influence the training features. Xstd is reused by a
# later cell, so the structure is kept as is - consider fitting on X_train only.
Xstd = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(Xstd, y, test_size=0.25, random_state=45) 

In [None]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
print('coefficient', lm.coef_)
print("intercept:",lm.intercept_)

In [None]:
predictions = lm.predict(X_test)
print(len(X_test))
print(type(predictions))
print(predictions.shape)
print(X_test.shape)
print(y_test.shape)
print(len(predictions))
print(len(y_test))
print(lm.score(X_test, y_test))


## Convert Price into categorical value

In [None]:
def convert_price_category(cols):
    """Map the first price found in ``cols`` to an ordinal price band (0-4).

    Band upper bounds are inclusive: 0 for <= 30000, 1 for <= 60000,
    2 for <= 90000, 3 for <= 130000 and 4 for anything else (including values
    that compare False against every bound, such as NaN). Only the first
    element of ``cols`` is examined; an empty ``cols`` yields ``None``.
    """
    upper_bounds = (30000, 60000, 90000, 130000)
    for price in cols:
        for band, limit in enumerate(upper_bounds):
            if price <= limit:
                return band
        # Above every bound: the open-ended top band.
        return len(upper_bounds)

In [None]:
df['PRICE'] = df[['PRICE']].apply(convert_price_category,axis=1)

In [None]:
y = df['PRICE']
plt.hist(y, bins=3)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xstd, y, test_size=0.25, random_state=45) 


### Use each of the following to predict
- Logistic Regression
- Neural Network
- Naive Bayes



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
log_reg = LogisticRegression()
# np.ravel flattens the labels to 1-D whether y_train is a Series or an
# ndarray; calling Series.ravel() directly is deprecated in recent pandas.
log_reg.fit(X_train, np.ravel(y_train))

In [None]:
Log_predictions = log_reg.predict(X_test)
print(Log_predictions.shape, Log_predictions.dtype)
print(y_test.shape, y_test.dtype)

In [None]:
cm = confusion_matrix(y_test, Log_predictions)
print(cm)  
score = accuracy_score(y_test, Log_predictions)
print(score)

In [None]:
def g1(x):
    """Logistic sigmoid 1 / (1 + e^-x); applied to the hidden layer in NN."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator
def g2(x):
    """Logistic sigmoid 1 / (1 + e^-x); applied to the output layer in NN."""
    decay = np.exp(-x)
    return 1. / (1 + decay)
def g1_prime(x):
    """Derivative of g1 at x, computed as g1(x) * (1 - g1(x))."""
    activation = g1(x)
    return activation * (1 - activation)

In [None]:
# experiment with number of neurons in hidden layer 
# display confusion matrix and classification results

def NN(X, y, n_h, n_y, alpha, iterations):
    """Train a one-hidden-layer network with full-batch gradient descent.

    Parameters
    ----------
    X : array of shape (n_x, m)
        Inputs with one column per sample (``m = X.shape[1]``).
    y : array broadcastable against the (n_y, m) network output
        Targets used in ``A2 - y`` and in the cross-entropy cost.
    n_h : int
        Number of units in the hidden layer.
    n_y : int
        Number of units in the output layer.
    alpha : float
        Initial learning rate; it is reduced in steps during training.
    iterations : int
        Number of gradient-descent steps.

    Returns
    -------
    W1, b1, W2, b2 : the trained weights and biases
        (shapes (n_h, n_x), (n_h, 1), (n_y, n_h), (n_y, 1)).
    cost_list : ``[[iteration, ...], [cost, ...]]`` sampled every 100 steps.

    Weights are drawn from ``np.random.randn``, so results depend on NumPy's
    global random state.
    """
    # Step-wise learning-rate decay: alpha/2 after stage_1, alpha/4 after
    # stage_2, alpha/8 after stage_3 (stage_1 < stage_2 < stage_3).
    stage_1 = iterations//2; alpha2 = alpha/2.
    stage_2 = iterations//1.75; alpha3 = alpha/4.
    stage_3 = iterations//1.5; alpha4 = alpha/8.
    cost_list = [[],[]]
    m = X.shape[1]
    n_x = X.shape[0]
    W1 = np.random.randn(n_h, n_x)
    b1 = np.random.randn(n_h, 1)
    W2 = np.random.randn(n_y, n_h)
    b2 = np.random.randn(n_y, 1)
    for i in range(iterations):
        # Forward pass.
        Z1 = np.dot(W1, X) + b1
        A1 = g1(Z1)
        Z2 = np.dot(W2, A1) + b2
        A2 = g2(Z2)
        if i%100 == 0:
            # Summed (not averaged) binary cross-entropy, recorded for plotting.
            cost = -np.sum(y*np.log(A2) + (1-y)*np.log(1-A2))
            cost_list[0].append(i)
            cost_list[1].append(cost)
        # Backward pass.
        dZ2 = A2 - y
        dW2 = (1/m) * np.dot(dZ2, A1.T)
        # Sum over samples only (axis=1) so each unit gets its own bias
        # gradient; a full np.sum collapsed every unit into one shared scalar.
        db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)
        dZ1 = np.dot(W2.T, dZ2) * g1_prime(Z1)
        dW1 = (1/m) * np.dot(dZ1, X.T)
        db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)
        # Parameter update.
        W2 = W2 - alpha * dW2
        b2 = b2 - alpha * db2
        W1 = W1 - alpha * dW1
        b1 = b1 - alpha * db1
        # Test the latest stage first: with the smallest threshold checked
        # first, the later (smaller) learning rates could never be selected.
        if i>stage_3: alpha = alpha4
        elif i>stage_2: alpha = alpha3
        elif i>stage_1: alpha = alpha2
    return W1, b1, W2, b2, cost_list

In [None]:
# n_h is the number of units in the single hidden layer (it sizes W1 as
# n_h x n_x inside NN) - this is what we will play with
# n_y is the number of units in the output layer - we are only looking for 1 output
# NOTE(review): n_h2 is set to the same value as n_h, so the two training runs
# below use the same architecture - confirm whether a different width was intended.
n_h = 2; n_y = 1; n_h2=2
alpha = 0.05
iterations = 50000

In [None]:
y_train_N = y_train.values.reshape(-1,1)
print(y_train_N.shape)

In [None]:
print(X_train.shape, y_train_N.shape)
print(X_train.T.shape, y_train_N.T.shape)

In [None]:
W1, b1, W2, b2, cost_list = NN(X_train.T, y_train_N.T, n_h, n_y, alpha, iterations)

In [None]:
W1_2, b1_2, W2_2, b2_2, cost_list_2 = NN(X_train.T, y_train_N.T, n_h2, n_y, alpha, iterations)

print('W1=',W1, '\n', 'b1=',b1, '\n', 'W2=',W2, '\n', 'b2=',b2)
plt.plot(cost_list[0][1:], cost_list[1][1:], 'go')

In [None]:
print('With two hidden layers:\n', 'W1=',W1_2, '\n', 'b1=',b1_2, '\n', 'W2=',W2_2, '\n', 'b2=',b2_2)
plt.plot(cost_list_2[0][1:], cost_list_2[1][1:], 'go')

In [None]:
# Forward pass of the trained network on the test features.
Z1 = np.dot(W1, X_test.T) + b1
A1 = g1(Z1)
Z2 = np.dot(W2, A1) + b2
A2 = g2(Z2)

# Threshold the sigmoid output into hard 0/1 labels. Using >= assigns an
# output of exactly 0.5 to class 1; the previous pair of strict comparisons
# left such a value at 0.5, which is not a valid class label for the metrics.
# NOTE(review): the network emits only 0/1 while y_test holds the price bands
# 0-4 from convert_price_category - confirm the intended target encoding.
NN_predictions = (A2 >= 0.5).astype(int)
print('A2=',A2)
print('predictions=',NN_predictions)

print(y_test.shape, NN_predictions.T.shape)

from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, NN_predictions.T))  
print(classification_report(y_test, NN_predictions.T))
NNscore = accuracy_score(y_test, NN_predictions.T)
print(NNscore)

In [None]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by class label.

    The label is the last element of each row. Returns a dict mapping each
    label to the list of rows carrying it, with rows kept in dataset order.
    """
    groups = {}
    for position in range(len(dataset)):
        row = dataset[position]
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

def mean(numbers):
    """Arithmetic mean of ``numbers``; an empty input raises ZeroDivisionError."""
    total = sum(numbers)
    return total / float(len(numbers))

def stdev(numbers):
    """Sample standard deviation of ``numbers`` (divides by n - 1)."""
    centre = mean(numbers)
    squared_deviations = [pow(value - centre, 2) for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

def summarize(dataset):
    """Return a ``(mean, std)`` pair for every column of ``dataset`` but the last.

    Rows are transposed into columns with ``zip``; statistics come from
    ``np.mean`` and ``np.std`` with their default arguments. The entry for the
    final column (the one separateByClass treats as the class label) is
    discarded.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((np.mean(column), np.std(column)))
    stats.pop()  # drop the entry computed for the last column
    return stats

In [None]:
def summarizeByClass(dataset):
    """Return ``{class_label: summarize(rows_of_that_class)}`` for ``dataset``.

    Rows are grouped with separateByClass, then each group is reduced to its
    per-feature (mean, std) list by summarize.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }


import math
def calculateProbability(x, mean, stdev):
    """Gaussian probability density of ``x`` for the given mean and stdev.

    Returns 0 when ``stdev`` is 0 instead of dividing by zero.
    """
    if stdev == 0:
        return 0
    two_variance = 2*math.pow(stdev,2)
    exponent = math.exp(-(math.pow(x-mean,2)/two_variance))
    normaliser = math.sqrt(2*math.pi) * stdev
    return (1 / normaliser) * exponent

In [None]:
def calculateClassProbabilities(summaries, inputVector):
    """Return ``{class_label: product of per-feature densities}`` for one row.

    ``summaries`` maps each class to its list of (mean, stdev) pairs;
    feature ``i`` of ``inputVector`` is scored against pair ``i`` with
    calculateProbability and the scores are multiplied together.
    """
    likelihoods = {}
    for label, feature_stats in summaries.items():
        running_product = 1
        for idx, (mu, sigma) in enumerate(feature_stats):
            running_product *= calculateProbability(inputVector[idx], mu, sigma)
        likelihoods[label] = running_product
    return likelihoods

In [None]:
def predict(summaries, inputVector):
    """Return the class label with the highest probability for ``inputVector``.

    Ties keep the label encountered first; ``None`` is returned when
    ``summaries`` yields no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner, top_score = None, -1
    for label in scores:
        candidate = scores[label]
        # Skip unless this is the first label or it strictly beats the leader.
        if winner is not None and not (candidate > top_score):
            continue
        winner, top_score = label, candidate
    return winner

def getPredictions(summaries, testSet):
    """Return the list of predict() results, one per row of ``testSet``, in order."""
    return [predict(summaries, testSet[row_idx]) for row_idx in range(len(testSet))]

In [None]:
def getAccuracy(testSet, predictions):
    """Percentage (0-100) of rows whose last element equals the matching prediction.

    An empty ``testSet`` raises ZeroDivisionError.
    """
    hits = sum(
        1
        for idx in range(len(testSet))
        if testSet[idx][-1] == predictions[idx]
    )
    return (hits/float(len(testSet))) * 100.0

import random
def splitDataset(dataset, splitRatio):
    """Randomly split ``dataset`` into ``[train_rows, remaining_rows]``.

    ``int(len(dataset) * splitRatio)`` rows are drawn without replacement
    using the ``random`` module's global generator; the input itself is not
    modified because the draw works on a list copy.
    """
    target_size = int(len(dataset) * splitRatio)
    remaining = list(dataset)
    chosen = []
    for _ in range(target_size):
        pick = random.randrange(len(remaining))
        chosen.append(remaining.pop(pick))
    return [chosen, remaining]

In [None]:
print(X.shape)
# Reshape the labels into a column vector. -1 lets NumPy infer the row count
# instead of hard-coding 106695, and np.asarray keeps the cell re-runnable
# once y has already been converted to an array.
y = np.asarray(y).reshape(-1, 1)
print(y.shape)

In [None]:
# Append the label column to the feature matrix (axis 1), so the class label
# is the last element of every row - the layout the helper functions expect.
data = np.append(X, y, 1)
print(data.shape)
# Split data into training and testing
# NOTE(review): splitDataset draws from the unseeded `random` module, so the
# split (and the accuracy below) changes from run to run.
splitRatio = 0.67
trainingSet, testSet = splitDataset(data, splitRatio)
print(len(trainingSet))
print(len(testSet))
# Apply the Naive Bayes Algorithm
summaries = summarizeByClass(trainingSet)
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('Accuracy:', accuracy)

In [None]:
print('Train:', X_train.shape, y_train.shape)
print('Test:', X_test.shape, y_test.shape)

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, MultinomialNB
model = GaussianNB()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))

In [None]:
# MultinomialNB only accepts non-negative features, but X_train / X_test come
# from the StandardScaler output (zero-mean, so roughly half the values are
# negative) and fit() rejects them. Rescale to [0, 1] using the training
# range before fitting this model.
from sklearn.preprocessing import MinMaxScaler
nonneg_scaler = MinMaxScaler().fit(X_train)
X_train_nonneg = nonneg_scaler.transform(X_train)
# Test values below the training minimum would map below 0, so clip them.
X_test_nonneg = np.clip(nonneg_scaler.transform(X_test), 0, None)
model = MultinomialNB()
model.fit(X_train_nonneg, y_train)
predicted = model.predict(X_test_nonneg)
print(metrics.accuracy_score(y_test, predicted))

# Finally, validate the models with testing data