# Clustering using Zillow Data

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# general
import pandas as pd
import numpy as np

# explore/ stat
import scipy.stats as stats

# visuals
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# default pandas decimal display formatting:
# floats are shown 20 characters wide, with thousands separators and 2 decimals
pd.options.display.float_format='{:20,.2f}'.format

# support modules
from env import host, user, password
import acquire
import summarize
import prepare
import explore

In [None]:
# load the Zillow dataset through the project-local acquire helper
df = acquire.get_zillow_data()

In [None]:
# frequency of each unit count; dropna=False also counts missing values
df.unitcnt.value_counts(dropna=False)

In [None]:
# Remove rows whose unit count is two or three.
# NOTE: the earlier filter `(unitcnt != 2) | (unitcnt != 3)` was true for every
# row (no value equals both 2 and 3), so it never removed anything.
# Rows with a missing unitcnt are kept, because isin() is False for NaN.
df = df[~df.unitcnt.isin([2, 3])]

In [None]:
# preview the first rows of the filtered data
df.head()

In [None]:
# (rows, columns) before handling missing values
df.shape

In [None]:
# summary statistics, transposed so each column of df is a row
df.describe().T

In [None]:
# project helper from summarize.py; presumably prints an overview of df — confirm there
summarize.df_summary(df)

In [None]:
# project helper; by its name, reports missing values per column — confirm in summarize.py
summarize.nulls_by_col(df)

In [None]:
# project helper; by its name, reports missing values per row — confirm in summarize.py
summarize.nulls_by_row(df)

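**This function deals with missing values in the columns and the rows. With the arguments used below, a column must be at least 50% populated (`prop_required_column = .5`) and a row must be at least 75% populated (`prop_required_row = .75`); columns and rows that fall short of these thresholds are removed.**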

In [None]:
# Drop sparsely populated columns and rows.
# Going by the argument names, each value is the minimum proportion of non-null
# entries required to keep a column (.5) or a row (.75) — confirm in prepare.py.
df = prepare.handle_missing_values(df, prop_required_column = .5, prop_required_row = .75)

After using this function, we didn't lose any rows, but we lost a lot of columns.

In [None]:
# signature: summarize.report_remain_nulls_by_col(df, num_rows_of_interest)
# 200 is passed as num_rows_of_interest; presumably the null-count threshold
# for a column to be reported — confirm in summarize.py
summarize.report_remain_nulls_by_col(df, 200)

In [None]:
# Drop the columns that still have many missing values and that it does not
# make sense to impute. ('propertyzoningdesc' was previously listed twice.)
df = df.drop(
    columns=[
        'finishedsquarefeet12',
        'buildingqualitytypeid',
        'heatingorsystemtypeid',
        'heatingorsystemdesc',
        'propertyzoningdesc',
        'unitcnt',
    ]
)
# now we can start imputing and handling outliers

In [None]:
# check the remaining columns, their dtypes and non-null counts
df.info()

In [None]:
# derive the age as the difference between 2017 and the year built
# (2017 is hard-coded; presumably the year the data covers — confirm)
df["age"] = 2017 - df["yearbuilt"]

In [None]:
# visualize the columns with boxplots (project helper from explore.py)
# so that outliers can be identified visually
explore.df_feature_box(df)

The 'heatingorsystemtypeid', 'heatingorsystemdesc' and 'propertyzoningdesc' columns have significantly fewer values than the other columns, but not so few that they were removed by the function. We will just take note of this in case we need to reference it later on.

In [None]:
def df_feature_dist(df):
    """Plot the distribution of every numeric column of ``df``.

    For each column selected by ``df.select_dtypes(np.number)``, the non-null
    values are passed to ``sns.distplot`` and the figure is shown immediately,
    so each column gets its own plot.

    Note: ``sns.distplot`` is deprecated in recent seaborn releases; the call is
    kept as-is here because the installed seaborn version is not known.
    """
    numeric_columns = df.select_dtypes(np.number).columns
    for column_name in list(numeric_columns):
        # nulls are removed before plotting
        non_null_values = df[column_name].dropna()
        sns.distplot(non_null_values)
        plt.show()

In [None]:
# plot the distribution of each numeric column
df_feature_dist(df)