In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import KMeans

lmodel = LinearRegression()

# Read in CSV's to pandas
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
train_labels = train["SalePrice"]

model_data = pd.concat([train,test],ignore_index=True)
model_data = model_data.drop("SalePrice", 1)

In [137]:
# Let's take a peak at data with pandas
corrs = train.corr()["SalePrice"].abs()
corrs.sort_values(ascending=False)[1:] # Sort from highest to lowest c

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
KitchenAbvGr     0.135907
EnclosedPorch    0.128578
ScreenPorch      0.111447
PoolArea         0.092404
MSSubClass       0.084284
OverallCond      0.077856
MoSold           0.046432
3SsnPorch        0.044584
YrSold           0.028923
LowQualFinSF     0.025606
Id               0.021917
MiscVal          0.021190
BsmtHalfBath     0.016844
BsmtFinSF2       0.011378
Name: SalePr

In [110]:
########## KMeans to modify nan values ##########
########## inputs:
########## label of col to update, indices in that col that have nan value,
########## data for clustering (including nan col), k number of clusters

########## output:
########## dictionary of {index : new value to replace nan}
def KMeans_nan_replacement(nan_col_label, nan_indices, k_data, k):

    # first iteration of KMeans
    kmeans = KMeans(n_clusters=k, random_state=0).fit(k_data)

    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    prev_labels = []
    converged = False

    # Run KMeans, update previously nan values to converge on their centroids,
    # Until the clusters stop changing
    while(converged == False):
        for nan_index in nan_indices:
            # Set LotFrontage value that was previously nan to
            # centroid for the k_data point at that same index
            k_data.loc(nan_index)[0]["LotFrontage"] = centroids[labels[nan_index]][0]

        # Rerun KMeans
        kmeans = KMeans(n_clusters=k, random_state=0).fit(k_data)
        # Store new labels
        labels = kmeans.labels_
        
        # If labels did not change in this run, algorithm has converged
        # (is this error prone if the window of distance to centroid is large?)
        if np.all(labels == prev_labels):
            converged = True

        prev_labels = labels

    # Return a dict of {index : new value}
    return dict(zip(nan_indices, k_data.oc(nan_indices)))
##############################

In [111]:
# Find cols with nan
all_nan_cols = model_data.columns[pd.isnull(model_data).any()].tolist()

# one-hot vector for categorical features
model_data = pd.get_dummies(model_data)

print all_nan_cols

['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'TotalBsmtSF']


In [112]:
# Store id's for LotFrontage nan
nan_indices_lot_frontage = model_data['LotFrontage'].index[model_data['LotFrontage'].apply(np.isnan)]

# Set all nan cols to col mean
k_data = model_data.fillna(model_data.mean())

# Run KMeans to fill nan for LotFrontage
lot_frontage_indices_and_new_values = KMeans_nan_replacement("LotFrontage", nan_indices_lot_frontage, k_data, 9)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


AttributeError: 'DataFrame' object has no attribute 'oc'