In [None]:
import numpy as np

import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import zipfile
import os

# Notebook-wide plot defaults: figure size and base font size
plt.rcParams.update({
    'figure.figsize': (12, 9),
    'font.size': 12,
})

In [None]:
# Unpack every member of the data archive into the current working directory
with zipfile.ZipFile('data/cook_county_data.zip') as archive:
    archive.extractall()

In [None]:
# Load the training table, using the CSV's 'Unnamed: 0' column as the row index
initial_data = pd.read_csv(
    "cook_county_train.csv",
    index_col='Unnamed: 0',
)

In [None]:
def plot_distribution(data, label):
    """
    Plot the distribution of one column as a histogram/density plot stacked
    directly above a box plot, with both panels sharing the same x-range.

    Input:
      data (DataFrame): the table holding the column to plot
      label (string): name of the numeric column to plot; also used in the title

    Output:
      None. The plot is drawn on a newly created matplotlib figure.
    """
    fig, axs = plt.subplots(nrows=2)

    # NOTE(review): seaborn documents distplot as deprecated in favour of
    # histplot/displot; it is kept here so the rendered figure is unchanged.
    sns.distplot(
        data[label], 
        ax=axs[0]
    )
    sns.boxplot(
        x=data[label],
        width=0.3, 
        ax=axs[1],
        showfliers=False,
    )

    # Align axes. The padding is 5% of the magnitude of the largest value;
    # abs() keeps it non-negative so the limits still widen (rather than
    # clip the data) when every value in the column is negative.
    spacer = np.abs(np.max(data[label])) * 0.05
    xmin = np.min(data[label]) - spacer
    xmax = np.max(data[label]) + spacer
    axs[0].set_xlim((xmin, xmax))
    axs[1].set_xlim((xmin, xmax))

    # Remove some axis text
    axs[0].xaxis.set_visible(False)
    axs[0].yaxis.set_visible(False)
    axs[1].yaxis.set_visible(False)

    # Put the two plots together
    plt.subplots_adjust(hspace=0)
    fig.suptitle("Distribution of " + label)

# Distribution of the raw Sale Price column, before any filtering
plot_distribution(initial_data, label='Sale Price')

In [None]:
# Keep only rows whose Sale Price is at least 500. The explicit .copy() makes
# training_data an independent frame, so adding a column below never writes
# into a slice of initial_data (the case pandas flags with SettingWithCopyWarning).
training_data = initial_data[initial_data["Sale Price"] >= 500].copy()
# np.log is vectorised, so it is applied to the whole column in one call
training_data["Log Sale Price"] = np.log(training_data["Sale Price"])

In [None]:
# Distribution of the log-transformed sale prices in the filtered data
plot_distribution(training_data, label='Log Sale Price');

In [None]:
def remove_outliers(data, variable, lower=-np.inf, upper=np.inf):
    """
    Keep only the rows whose value in `variable` lies strictly between
    `lower` and `upper`.

    Input:
      data (DataFrame): the table to be filtered
      variable (string): the column with numerical outliers
      lower (numeric): observations with values lower than or equal to this will be removed
      upper (numeric): observations with values higher than or equal to this will be removed
    
    Output:
      a DataFrame with outliers removed
      
    Note: This function does not mutate the contents of data. Rows whose
    value is missing (NaN) fail both comparisons and are removed as well.
    With the default bounds every finite value is kept.
    """  
    values = data[variable]
    # Strict inequalities: a value equal to either bound counts as an outlier,
    # exactly as the parameter descriptions above specify.
    in_range = (values > lower) & (values < upper)
    return data[in_range]

import re

def add_total_bedrooms(data):
    """
    Parse the bedroom count out of each row's free-text description.

    Input:
      data (DataFrame): a DataFrame containing at least the Description column.

    Output:
      a Dataframe with a new column "Bedrooms" containing ints. Rows whose
      description is missing or does not contain the phrase
      "<number> of which are bedrooms" get 0. The input is not modified.

    """
    with_rooms = data.copy()
    patt = r'(\d+) of which are bedrooms'
    # str.extract evaluates the regex once per row and yields a missing value
    # where there is no match or the description itself is missing, so a
    # missing description no longer raises.
    bedrooms = with_rooms['Description'].str.extract(patt, expand=False)
    # Fill with the string "0" (not the int 0) so the column stays all-string
    # until the single conversion to integers.
    with_rooms['Bedrooms'] = bedrooms.fillna("0").astype("int64")

    return with_rooms

# Add the Bedrooms column parsed from each row's Description
training_data = add_total_bedrooms(training_data)

In [None]:
# One violin of log sale prices per bedroom count
ax = sns.violinplot(x="Bedrooms", y="Log Sale Price", data=training_data)

# Set labels and title
ax.set_xlabel("Number of Bedrooms")
ax.set_ylabel("Sale Price (Log)")
ax.set_title("Association between Bedrooms and Log Sale Price")

# Show the plot
plt.show()

In [None]:
# Number of distinct neighborhood codes (a missing code, if present, is counted too)
num_neighborhoods = training_data['Neighborhood Code'].nunique(dropna=False)
num_neighborhoods

In [None]:
# Feel free to create a cell below this and run plot_categorical(training_data) if you want to see what this function outputs.
def plot_categorical(neighborhoods):
    """
    Draw two stacked panels keyed by 'Neighborhood Code': box plots of
    'Log Sale Price' on top and a bar chart of row counts, labelled with the
    counts, underneath.

    Input:
      neighborhoods (DataFrame): must contain 'Neighborhood Code' and
        'Log Sale Price' columns

    Note: the dotted red reference line is the median 'Log Sale Price' of the
    notebook-level `training_data` frame, not of `neighborhoods`.
    """
    fig, (price_ax, count_ax) = plt.subplots(nrows=2)

    sns.boxplot(
        data=neighborhoods,
        x='Neighborhood Code',
        y='Log Sale Price',
        ax=price_ax,
    )
    sns.countplot(
        data=neighborhoods,
        x='Neighborhood Code',
        ax=count_ax,
    )

    # Draw median price
    reference_median = training_data['Log Sale Price'].median()
    price_ax.axhline(y=reference_median, color='red', linestyle='dotted')

    # Label the bars with counts: the bar's top edge is its count, and the
    # label is centred between the bar's left and right edges.
    for bar in count_ax.patches:
        corners = bar.get_bbox().get_points()
        bar_top = corners[1, 1]
        bar_center = corners[:, 0].mean()
        count_ax.annotate(f'{int(bar_top)}', (bar_center, bar_top), ha='center', va='bottom')

    # Format x-axes: rotate the shared labels on the lower panel, hide the upper ones
    count_ax.set_xticklabels(count_ax.xaxis.get_majorticklabels(), rotation=90)
    price_ax.xaxis.set_visible(False)

    # Narrow the gap between the plots
    plt.subplots_adjust(hspace=0.01)

In [None]:
# value_counts() sorts by frequency, so its first 20 labels are the 20 most common codes
neighborhood_counts = training_data["Neighborhood Code"].value_counts()
top_20_neighborhood_codes = neighborhood_counts.index[:20]

# Restrict the table to rows in those neighborhoods and plot them
is_top_20 = training_data["Neighborhood Code"].isin(top_20_neighborhood_codes)
in_top_20_neighborhoods = training_data[is_top_20]
plot_categorical(neighborhoods=in_top_20_neighborhoods)

In [None]:
def find_expensive_neighborhoods(data, n=3, metric=np.median):
    """
    Rank neighborhoods by an aggregate of their log sale prices and return
    the codes of the highest-ranked ones.

    Input:
      data (DataFrame): should contain at least an int-valued 'Neighborhood Code'
        and a numeric 'Log Sale Price' column
      n (int): the number of top values desired
      metric (function): function used for aggregating the data in each neighborhood.
        for example, np.median for median prices
    
    Output:
      a list of the neighborhood codes of the top n highest-priced neighborhoods 
      as measured by the metric function
    """
    # One aggregated price per neighborhood, indexed by neighborhood code
    price_by_code = data.groupby("Neighborhood Code").agg({"Log Sale Price": metric})
    ranked = price_by_code.sort_values("Log Sale Price", ascending=False)
    top_codes = ranked.head(n).index
    # Convert to the built-in Python int rather than NumPy's integer types.
    return list(map(int, top_codes))

# Codes of the three neighborhoods with the highest median log sale price
expensive_neighborhoods = find_expensive_neighborhoods(training_data, 3, np.median)
expensive_neighborhoods

def add_in_expensive_neighborhood(data, expensive_neighborhoods):
    """
    Flag the rows that belong to one of the given expensive neighborhoods.

    Input:
      data (DataFrame): a DataFrame containing a 'Neighborhood Code' column with values
        found in the codebook
      expensive_neighborhoods (list of ints): ints should be the neighborhood codes of
        neighborhoods pre-identified as expensive
    Output:
      new DataFrame identical to the input with the addition of a binary
      in_expensive_neighborhood column (1 if the row's code is in
      expensive_neighborhoods, otherwise 0). The input is not modified.
    """
    # Work on a copy so the caller's DataFrame is left untouched, matching
    # how add_total_bedrooms builds its result.
    with_flag = data.copy()
    with_flag['in_expensive_neighborhood'] = (
        with_flag["Neighborhood Code"].isin(expensive_neighborhoods).astype("int")
    )
    return with_flag

# Find the three neighborhoods with the highest median log sale price, then
# flag every row of training_data by membership in them
expensive_neighborhoods = find_expensive_neighborhoods(
    training_data, n=3, metric=np.median
)
training_data = add_in_expensive_neighborhood(
    training_data, expensive_neighborhoods
)


In [None]:
def substitute_roof_material(data):
    """
    Swap the numeric roof-material codes for their descriptive names.

    Input:
      data (DataFrame): a DataFrame containing a 'Roof Material' column.  Its values
                         should be limited to those found in the codebook
    Output:
      new DataFrame identical to the input except with a refactored 'Roof Material' column
    """
    # Code -> name lookup; values that are not keys here pass through unchanged
    roof_names = {
        1: "Shingle/Asphalt",
        2: "Tar & Gravel",
        3: "Slate",
        4: "Shake",
        5: "Tile",
        6: "Other",
    }
    relabeled = data.copy()
    relabeled["Roof Material"] = relabeled["Roof Material"].replace(roof_names)
    return relabeled
    
# Replace the numeric roof codes with their names, then preview the first rows
training_data_mapped = substitute_roof_material(training_data)
training_data_mapped.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

def ohe_roof_material(data):
    """
    One-hot-encodes roof material. New columns are of the form "Roof Material_MATERIAL".

    Input:
      data (DataFrame): a DataFrame containing a 'Roof Material' column
    Output:
      the input joined with one indicator column per distinct roof material
    """
    roof_column = data[["Roof Material"]]
    encoder = OneHotEncoder()
    # Fit and encode in one step; toarray() turns the encoder's result into a dense array
    encoded = encoder.fit_transform(roof_column).toarray()
    # Reuse the input's index so the join below lines up row for row
    indicators = pd.DataFrame(
        data=encoded,
        index=data.index,
        columns=encoder.get_feature_names_out(),
    )
    return data.join(indicators)

training_data_ohe = ohe_roof_material(training_data_mapped)
# This line of code will display only the one-hot-encoded columns in training_data_ohe that
# have names that begin with "Roof Material_"
training_data_ohe.filter(regex='^Roof Material_').head(10)