***Exploration A: zip code and living-area square footage***

In [1]:
import csv
import numpy as np
import copy
import import_clean_functions
import matplotlib.pyplot
from collections import defaultdict

In [2]:
# Read every row of the residential-building extract into a list of dicts
# keyed by the CSV header. newline='' hands newline handling to the csv
# module, which its documentation requires so that quoted fields containing
# line breaks are parsed correctly.
with open('EXTR_ResBldg.csv', newline='') as f:
    reader = csv.DictReader(f)
    X = list(reader)

In [3]:
def select_cols(row, cols_to_keep):
    """Return a new dict holding only the ``cols_to_keep`` entries of ``row``.

    Keys are inserted in the order given by ``cols_to_keep``. A column that is
    not present in ``row`` raises ``KeyError``. ``row`` itself is not modified.
    """
    subset = {}
    for name in cols_to_keep:
        subset[name] = row[name]
    return subset

In [4]:
# Keep only the parcel identifiers, zip code and living area of each building row.
data = [
    select_cols(building, ['Major', 'Minor', 'ZipCode', 'SqFtTotLiving'])
    for building in X
]

In [5]:
# Apply the square-footage cleaner to every row.
# NOTE(review): presumably this converts 'SqFtTotLiving' to int (the displayed
# rows show integers) - confirm against import_clean_functions.
data_clean_sf = import_clean_functions.clean_data(
    data, import_clean_functions.clean_square_footage
)

In [6]:
# Fold 'Major' and 'Minor' into a single 'major_minor' key on every row
# (the displayed rows show a (major, minor) tuple under that key).
data_zip_sf = import_clean_functions.clean_data(
    data_clean_sf, import_clean_functions.combine_major_minor
)

In [32]:
# Display the cleaned building rows. This renders the entire list;
# NOTE(review): consider slicing (e.g. data_zip_sf[:5]) to keep the output short.
data_zip_sf

[{'ZipCode': '98007',
  'SqFtTotLiving': 1460,
  'major_minor': ('022405', '9064')},
 {'ZipCode': '98007',
  'SqFtTotLiving': 1480,
  'major_minor': ('022405', '9111')},
 {'ZipCode': '98008',
  'SqFtTotLiving': 1100,
  'major_minor': ('022405', '9150')},
 {'ZipCode': '98008',
  'SqFtTotLiving': 2400,
  'major_minor': ('022405', '9203')},
 {'ZipCode': '98075',
  'SqFtTotLiving': 1980,
  'major_minor': ('022406', '9003')},
 {'ZipCode': '98027',
  'SqFtTotLiving': 2020,
  'major_minor': ('022406', '9041')},
 {'ZipCode': '98075',
  'SqFtTotLiving': 2200,
  'major_minor': ('022406', '9136')},
 {'ZipCode': '98075',
  'SqFtTotLiving': 2120,
  'major_minor': ('022406', '9138')},
 {'ZipCode': '98075',
  'SqFtTotLiving': 1930,
  'major_minor': ('022406', '9158')},
 {'ZipCode': '98075',
  'SqFtTotLiving': 2250,
  'major_minor': ('022406', '9168')},
 {'ZipCode': '98117',
  'SqFtTotLiving': 2050,
  'major_minor': ('022503', '9092')},
 {'ZipCode': '98117',
  'SqFtTotLiving': 1620,
  'major_minor': (

***Exploration B: sale price and sale year***

In [7]:
# Read every row of the sales extract into a list of dicts keyed by the CSV
# header. newline='' hands newline handling to the csv module, which its
# documentation requires so that quoted fields containing line breaks are
# parsed correctly.
with open('EXTR_RPSale.csv', newline='') as f:
    reader = csv.DictReader(f)
    Y = list(reader)

In [8]:
def select_cols(row, cols_to_keep):
    """Return a new dict holding only the ``cols_to_keep`` entries of ``row``.

    Keys are inserted in the order given by ``cols_to_keep``. A column that is
    not present in ``row`` raises ``KeyError``. ``row`` itself is not modified.

    NOTE(review): a function with this name and behaviour is already defined
    earlier in this notebook; this redefinition is redundant.
    """
    subset = {}
    for name in cols_to_keep:
        subset[name] = row[name]
    return subset

In [9]:
# Keep only the parcel identifiers, document date and sale price of each sale row.
data_y = [
    select_cols(sale, ['Major', 'Minor', 'DocumentDate', 'SalePrice'])
    for sale in Y
]

In [10]:
def clean_data(data_y, cleaning_function):
    """Apply ``cleaning_function`` to each row of ``data_y`` and collect the results.

    Returns a new list with one entry per input row, in input order. Whatever
    ``cleaning_function`` returns (including ``None``) is stored as-is.
    """
    cleaned_rows = []
    for record in data_y:
        cleaned_rows.append(cleaning_function(record))
    return cleaned_rows

In [11]:
def clean_sales_price(row):
    """Return a copy of ``row`` whose 'SalePrice' value has been passed through ``int``.

    Operates on a single dictionary so it can be handed to clean_data.
    The input dictionary is left unmodified.
    """
    converted = row.copy()
    converted.update(SalePrice=int(converted['SalePrice']))
    return converted

In [12]:
def clean_sp_zeros(row):
    """Return a copy of ``row``, or ``None`` when its 'SalePrice' equals 0.

    Operates on a single dictionary so it can be handed to clean_data; the
    caller is expected to discard the ``None`` results afterwards. The price
    is compared with the integer 0, so a string such as '0' is not treated
    as zero.
    """
    candidate = row.copy()
    if candidate['SalePrice'] == 0:
        return None
    return candidate

In [23]:
def clean_date(row):
    """Return a copy of ``row`` with 'DocumentDate' cut down to its last two characters.

    Operates on a single dictionary. The notebook output shows results such as
    '98' and '09', i.e. a two-digit year (assuming the raw string ends with the
    year). The input dictionary is left unmodified.

    Raises ``KeyError`` if a non-empty ``row`` has no 'DocumentDate' key.
    """
    output = row.copy()
    # Slice exactly once. An empty row has nothing to clean and is returned
    # as an empty copy rather than raising.
    if output:
        output['DocumentDate'] = output['DocumentDate'][-2:]
    return output

In [24]:
# Shorten 'DocumentDate' on every sale row, using the notebook's clean_data
# helper instead of a manual index loop.
data_clean_date = clean_data(data_y, clean_date)

In [25]:
# Display the date-cleaned sale rows. This renders the entire list;
# NOTE(review): consider slicing (e.g. data_clean_date[:5]) to keep the output short.
data_clean_date

[{'Major': '330405',
  'Minor': '100',
  'DocumentDate': '98',
  'SalePrice': '215000'},
 {'Major': '868146', 'Minor': '30', 'DocumentDate': '09', 'SalePrice': '0'},
 {'Major': '258190', 'Minor': '265', 'DocumentDate': '03', 'SalePrice': '0'},
 {'Major': '334330', 'Minor': '1343', 'DocumentDate': '06', 'SalePrice': '0'},
 {'Major': '663990',
  'Minor': '40',
  'DocumentDate': '06',
  'SalePrice': '690576'},
 {'Major': '32103',
  'Minor': '230',
  'DocumentDate': '09',
  'SalePrice': '2340000'},
 {'Major': '32103',
  'Minor': '240',
  'DocumentDate': '09',
  'SalePrice': '2340000'},
 {'Major': '32103',
  'Minor': '180',
  'DocumentDate': '09',
  'SalePrice': '2340000'},
 {'Major': '32103',
  'Minor': '60',
  'DocumentDate': '09',
  'SalePrice': '2340000'},
 {'Major': '32103',
  'Minor': '130',
  'DocumentDate': '09',
  'SalePrice': '2340000'},
 {'Major': '32103',
  'Minor': '160',
  'DocumentDate': '09',
  'SalePrice': '2340000'},
 {'Major': '32103',
  'Minor': '220',
  'DocumentDate': 

In [26]:
# Convert 'SalePrice' to int on every row, using the notebook's clean_data
# helper instead of a manual index loop.
data_clean_sp = clean_data(data_clean_date, clean_sales_price)

In [27]:
# Map zero-price rows to None (clean_sp_zeros returns None for them), using the
# notebook's clean_data helper instead of a manual index loop.
data_clean_sp_zero = clean_data(data_clean_sp, clean_sp_zeros)

In [28]:
# Drop the None placeholders left behind for zero-price sales. Identity
# comparison (`is not None`) is the recommended way to test for None.
data_clean_none = [sale for sale in data_clean_sp_zero if sale is not None]

In [29]:
def combine_major_minor(row):
    """Return a copy of ``row`` in which 'Major' and 'Minor' are replaced by one
    'major_minor' entry holding the tuple ``(Major, Minor)``.

    Operates on a single dictionary so it can be handed to clean_data.
    The input dictionary is left unmodified; a missing 'Major' or 'Minor'
    raises ``KeyError``.
    """
    merged = row.copy()
    # pop() reads and removes each part in one step; 'Major' is taken first.
    merged['major_minor'] = (merged.pop('Major'), merged.pop('Minor'))
    return merged

In [30]:
# Fold 'Major' and 'Minor' into one 'major_minor' tuple on every remaining sale row.
data_test = clean_data(
    data_clean_none,
    combine_major_minor,
)

In [33]:
# Alias under a more descriptive name: this binds the same list object, it does not copy it.
data_price_date = data_test

In [34]:
# Display the final price/date rows. This renders the entire list;
# NOTE(review): consider slicing (e.g. data_price_date[:5]) to keep the output short.
data_price_date

[{'DocumentDate': '98', 'SalePrice': 215000, 'major_minor': ('330405', '100')},
 {'DocumentDate': '06', 'SalePrice': 690576, 'major_minor': ('663990', '40')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '230')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '240')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '180')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '60')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '130')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '160')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '220')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '10')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '20')},
 {'DocumentDate': '09', 'SalePrice': 2340000, 'major_minor': ('32103', '70')},
 {'DocumentDate': '09', 'SalePrice': 2340000,