## 0. Read File and Split

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

cleanedFile = "4.10.24 Duplicates Removed.csv"
DF = pd.read_csv(cleanedFile)

DF.dropna(axis='index', how='any', inplace=True)

train_set, test_set = train_test_split(DF, test_size=0.3, random_state=0)
print(DF.keys)

X = DF[['latitude', 'longitude', 'floorSize']]
y = DF['price']

X_train = train_set[['latitude', 'longitude', 'floorSize']]
y_train = train_set['price']

X_test = test_set[['latitude', 'longitude', 'floorSize']]
y_test = test_set['price']

<bound method NDFrame.keys of       latitude  longitude  floorSize   price  \
0    39.252922 -94.571526       1656  330000   
1    39.286930 -94.595634       1649  376700   
2    39.052692 -94.545950       1600  185000   
3    39.211430 -94.638050       1698  319000   
4    39.293766 -94.464226       1649  373600   
..         ...        ...        ...     ...   
401  39.142340 -94.649720       1366  295000   
402  39.147373 -94.832670       2565  590159   
403  39.112940 -94.631050       1512  195000   
404  39.136390 -94.644430       1570  145000   
405  39.148520 -94.832760       2257  574635   

                                                   url  
0    https://www.zillow.com/homedetails/701-NE-88th...  
1    https://www.zillow.com/community/holly-farms/3...  
2    https://www.zillow.com/homedetails/4014-Colleg...  
3    https://www.zillow.com/homedetails/6407-N-Kirk...  
4    https://www.zillow.com/community/somerbrook/34...  
..                                                 

## 1. Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import silhouette_score


KMC = KMeans(n_clusters=2)


KMC.fit(X_train)
y_train_predict = KMC.predict(X_train)
matrixTrain = confusion_matrix(y_train, y_train_predict)
print(matrixTrain)
print("Accuracy", accuracy_score(y_train, y_train_predict))
print("F1", f1_score(y_train ,y_train_predict, average='weighted'))

y_test_predict = KMC.predict(X_test)
matrixTest = confusion_matrix(y_test, y_test_predict)
print(matrixTest)
silhouette_avg = silhouette_score(X_train, y_train_predict)
print(f"Silhouette Score for training data: {silhouette_avg}")

silhouette_avg_test = silhouette_score(X_test, y_test_predict)
print(f"Silhouette Score for test data: {silhouette_avg_test}")

In [None]:
from sklearn.model_selection import StratifiedKFold

spliter = StratifiedKFold(n_splits = 5, shuffle=True, random_state= 23)

for train_indices, validate_indices in spliter.split(X, y):
    KMC = KMeans(n_clusters=2, random_state=3)
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    KMC.fit(X_train)
    X_validate = X.iloc[validate_indices]
    y_validate = y.iloc[validate_indices]
    y_predicted = KMC.predict(X_validate)
    matrix = confusion_matrix(y_validate, y_predicted)
    print(matrix)
    print(f"Silhouette Score for training data: {silhouette_avg}")

    silhouette_avg_test = silhouette_score(X_validate, y_predicted)
    print(f"Silhouette Score for test data: {silhouette_avg_test}")
    print()

## 2. Dimensional Analysis

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import silhouette_score

scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

pca = PCA(n_components=3)

X_train_pca = pca.fit_transform(X_train_standardized)
X_test_pca = pca.transform(X_test_standardized)

KMC = KMeans(n_clusters=2)


KMC.fit(X_train_pca)
y_train_predict = KMC.predict(X_train_pca)

y_test_predict = KMC.predict(X_test_pca)
matrixTrain = confusion_matrix(y_train, y_train_predict)
print(matrixTrain)
print( accuracy_score(y_train, y_train_predict))
print(f1_score(y_train, y_train_predict, average='weighted'))

matrixTest = confusion_matrix(y_test, y_test_predict)
print(matrixTest)
print(accuracy_score(y_test, y_test_predict))
print(f1_score(y_test, y_test_predict, average='weighted'))

## 3. Visualize and Narrative

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score


RFC = RandomForestClassifier(max_depth=1)


RFC.fit(X_train, y_train)
y_train_predict = RFC.predict(X_train)
matrixTrain = confusion_matrix(y_train, y_train_predict)
print(matrixTrain)
print("Accuracy", accuracy_score(y_train, y_train_predict))
print("F1", f1_score(y_train ,y_train_predict, average='weighted'))

y_test_predict = RFC.predict(X_test)
matrixTest = confusion_matrix(y_test, y_test_predict)
print(matrixTest)
print("Accuracy", accuracy_score(y_test, y_test_predict))
print("F1", f1_score(y_test, y_test_predict, average='weighted'))



## 4. Anomalous Data

## 5. Random Forest & Neural Net

#### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score


RFC = RandomForestClassifier(max_depth=10)


RFC.fit(X_train, y_train)
y_train_predict = RFC.predict(X_train)
matrixTrain = confusion_matrix(y_train, y_train_predict)
print(matrixTrain)
print("Accuracy", accuracy_score(y_train, y_train_predict))
print("F1", f1_score(y_train ,y_train_predict, average='weighted'))

y_test_predict = RFC.predict(X_test)
matrixTest = confusion_matrix(y_test, y_test_predict)
print(matrixTest)
print("Accuracy", accuracy_score(y_test, y_test_predict))
print("F1", f1_score(y_test, y_test_predict, average='weighted'))

[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 1]]
Accuracy 0.9964788732394366
F1 0.9953051643192486
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Accuracy 0.04918032786885246
F1 0.0430327868852459


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

spliter = StratifiedKFold(n_splits = 5)

for train_indices, validate_indices in spliter.split(X, y):
    RFC = RandomForestClassifier(max_depth=8)
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    RFC.fit(X_train,y_train)
    X_validate = X.iloc[validate_indices]
    y_validate = y.iloc[validate_indices]
    y_predicted = RFC.predict(X_validate)
    matrix = confusion_matrix(y_validate, y_predicted)
    print(matrix)
    print("Accuracy", accuracy_score(y_validate,y_predicted))
    print("F1", f1_score(y_validate, y_predicted, average='weighted'))
    print()

#### Neural Net

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score


MLP = MLPClassifier()


MLP.fit(X_train, y_train)
y_train_predict = MLP.predict(X_train)
matrixTrain = confusion_matrix(y_train, y_train_predict)
print(matrixTrain)
print("Accuracy", accuracy_score(y_train, y_train_predict))
print("F1", f1_score(y_train ,y_train_predict, average='weighted'))

y_test_predict = RFC.predict(X_test)
matrixTest = confusion_matrix(y_test, y_test_predict)
print(matrixTest)
print("Accuracy", accuracy_score(y_test, y_test_predict))
print("F1", f1_score(y_test, y_test_predict, average='weighted'))

In [None]:
from sklearn.model_selection import StratifiedKFold

spliter = StratifiedKFold(n_splits = 5)

for train_indices, validate_indices in spliter.split(X, y):
    MLP = MLPClassifier()
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    MLP.fit(X_train,y_train)
    X_validate = X.iloc[validate_indices]
    y_validate = y.iloc[validate_indices]
    y_predicted = MLP.predict(X_validate)
    matrix = confusion_matrix(y_validate, y_predicted)
    print(matrix)
    print("Accuracy", accuracy_score(y_validate,y_predicted))
    print("F1", f1_score(y_validate, y_predicted, average='weighted'))
    print()