## Description:
1. Load and aggregate all of the saved housing data from python PKL files
2. Clean and build features based on the raw data

### Header

In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.display import Audio
import pandas as pd
import numpy as np
import re
import pickle
import os
import ast
import json

## Main:
Cleans and utilizes the functions listed below to convert python Pickled dictionaries saved locally into Pandas Dataframes

In [2]:
data_dict_path = 'C:/Users/austi/Documents/Github_Repos/Imperial_Applied_Project/Detroit/Saved_Data/Test_Data_Dict/'
#saved_data_html_path = 'C:/Users/austi/Documents/Github_Repos/Imperial_Applied_Project/Detroit/Saved_Data/HTML/'
os.chdir(data_dict_path)

csv_data_path = 'C:/Users/austi/Documents/Github_Repos/Imperial_Applied_Project/Detroit/Saved_Data/CSV_Data/'
file_name = 'CLEANED_Aggregated_Home_Data.csv'
#pd.DataFrame.to_csv(data, path_or_buf= csv_data_path + file_name)
#data = get_aggregatedData(os.listdir(data_dict_path)[:-1])
data = pd.read_csv(csv_data_path+file_name)

data = data.set_index('House_Id')
data = data.drop(axis=1, columns=['Full_Bathrooms', 'Price_Cut_Date', 'Parcel_Number', 'Has_Hoa_Fee'])

#data = data.drop(axis=1, columns=['Year_Built:', 'Living_Area', 'Lot:', 'Rent_Control'])

temp = list(data.columns)
for i in range(0, len(temp)):
    temp[i] = temp[i].replace(':', '').replace('/', '_')
data.columns = temp

data.Time_On_Zillow = force_numeric(data.Time_On_Zillow.str.replace('\sdays', '', regex=True), fill_value=0)
data.Lot_Size = force_numeric(data.Lot_Size.str.replace('\sacres', '', regex=True), fill_value=None)
data.Price_sqft = force_numeric(data.Price_sqft.str.replace('\$', '', regex=True), fill_value=0)
data.Total_Interior_Livable_Area = force_numeric(data.Total_Interior_Livable_Area.str.replace('\ssqft', '', regex=True).replace('\,', '', regex=True), fill_value=0)
data.Hoa_Fee = force_numeric(data.Hoa_Fee.str.replace((','), ('')).str.replace(('$'), ('')).str.replace(('/mo'), ('')), fill_value=0)
data.Tax_Assessed_Value = force_numeric(data.Tax_Assessed_Value.str.replace((','), ('')).str.replace(('$'), ('')), fill_value=None)
data.Annual_Tax_Amount = force_numeric(data.Annual_Tax_Amount.str.replace((','), ('')).str.replace(('$'), ('')), fill_value=None)

data.Bedrooms = force_numeric(data.Bedrooms, fill_value=0)
data.Bathrooms = force_numeric(data.Bathrooms, fill_value=0)
data.Stories = force_numeric(data.Stories, fill_value=0)
data.Year_Built = force_numeric(data.Year_Built, fill_value=None)
data.Garage_Spaces = force_numeric(data.Garage_Spaces, fill_value=0)

data.Price = force_numeric(data.Price, fill_value=None)
data['Price_Unknown_Binary'] = pd.isna(data.Price)
data['Calculated_Price_Sqft'] = (data.Price/data.Total_Interior_Livable_Area).replace([np.inf, -np.inf], np.nan)
data.Price_Cut = force_numeric(data.Price_Cut, fill_value=0)
data.New_Construction =data.New_Construction.replace(('Yes', 'No'), (1, 0)).fillna(0)

data.Home_Type.loc[(data.Bathrooms == 0) & (data.Bedrooms == 0) & (data.Year_Built.isna()) & (data.Garage_Spaces ==0) & (data.Home_Type.isna())] = 'VACANT_LOT'
data.Home_Type.loc[data.Home_Type == 'MANUFACTURED'] = 'SINGLE_FAMILY'
data.Annual_Tax_Amount = data.Annual_Tax_Amount.fillna(0)
data = data[~data.Price_Unknown_Binary]
data = data[~data.Home_Type.isin(['VACANT_LOT'])]
data.head().T


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


House_Id,2096667469_zpid,2090771520_zpid,2085043646_zpid,88380061_zpid,2083788163_zpid
Bedrooms,2,2,1,2,3
Heating_Features,Forced air,Forced air,,Forced air,Forced air
Total_Interior_Livable_Area,1182,1250,840,2300,2450
Number_Of_Appliances,8,8,0,0,4
Bathrooms,2,2,1,3,4
Exterior_Features,"Brick, Vinyl","Brick, Vinyl",,Other,Other
Parking_Features,Attached Garage,Attached Garage,,Attached Garage,Attached Garage
Stories,1,1,0,0,0
Home_Type,TOWNHOUSE,TOWNHOUSE,CONDO,CONDO,TOWNHOUSE
Year_Built,1905,1905,1920,2002,2019


###### Notes
1. Appliances: get the number of appliances included in sale, not the listing
2. Get the binary value of if price cut or not
3. Get Percentage away from listing from zestimate
4. Have Dataframe and do a merge from the main dataframe with the created data frames from the dict
    * Make a separate dictionary for each
    * If not listed initially, go forward and look for these:
        1. Zestimate
        2. Parking
        3. Heating / Cooling
        4. Home Type
5. Loop through Zip Code PKL files and find the top 50% - 75% of Dictionary Keys listed
    * Gather the top 50% - 75% and find the top values across the board again
6. Gather the Descriptions for each home and match them for Sentiment Analysis
    * Base the sentiment analysis on the price of the home or price per square foot, or price/Lot Size

## Functions

In [27]:
def load_Dictionary(path, file_name):
    with open(path + str(file_name), "rb") as input_file:
        data_dict = pickle.load(input_file)
    return data_dict
def make_series(name, series_name):
    return pd.Series(data=name, name=series_name)
def force_numeric(column, fill_value):
    if fill_value == None:
        return pd.to_numeric(column, errors='coerce')
    return pd.to_numeric(column, errors='coerce').fillna(fill_value)

def get_commonKeys(zip_code_file_list, percent_threshold):
    '''
    Return the keys that are at least covered in over half of the samples
    '''
    data_dict_path = 'C:/Users/austi/Documents/Github_Repos/Imperial_Applied_Project/Detroit/Saved_Data/Test_Data_Dict/'
    Overall_keys = []
    Interior_Details =[]
    Property_Details = []
    Construction_Details = []
    Building_Details = []
    Community_And_Neighborhood_Details = []
    Hoa_And_Financial_Details = []
    Other = []
    total_homes = 0
        
    for zip_code_file_name in zip_code_file_list:
        
        zip_code_dict = load_Dictionary(data_dict_path, zip_code_file_name)
        zip_code_keys = zip_code_dict.keys()
        num_homes = len(zip_code_keys)
        total_homes += num_homes
        for house in zip_code_keys:
            first_layer = zip_code_dict[house].keys()
            for val in first_layer: Overall_keys.append(val)
            if 'Interior_Details' in first_layer:
                for val in zip_code_dict[house]['Interior_Details'].keys(): Interior_Details.append(val)
            if 'Property_Details' in first_layer:
                for val in zip_code_dict[house]['Property_Details'].keys(): Property_Details.append(val)
            if 'Construction_Details' in first_layer:
                for val in zip_code_dict[house]['Construction_Details'].keys(): Construction_Details.append(val)
            if 'Building_Details' in first_layer:
                for val in zip_code_dict[house]['Building_Details'].keys(): Building_Details.append(val)
            if 'Community_And_Neighborhood_Details' in first_layer:
                for val in zip_code_dict[house]['Community_And_Neighborhood_Details'].keys(): Community_And_Neighborhood_Details.append(val)
            if 'Hoa_And_Financial_Details' in first_layer:
                for val in zip_code_dict[house]['Hoa_And_Financial_Details'].keys(): Hoa_And_Financial_Details.append(val)
            if 'Other' in first_layer:
                for val in zip_code_dict[house]['Other'].keys(): Other.append(val)
    series_list = [
        Overall_keys,
        Interior_Details,
        Property_Details,
        Construction_Details,
        Building_Details,
        Community_And_Neighborhood_Details,
        Hoa_And_Financial_Details,
        Other
    ]
    detail_keys = [
    'Overall_Keys',
    'Interior_Details',
    'Property_Details',
    'Construction_Details',
    'Building_Details',
    'Community_And_Neighborhood_Details',
    'Hoa_And_Financial_Details',
    'Other'
    ]
    #for i in range(0, len(series_list)):
    print(f'---------------- Total Homes Analyzed {total_homes} ----------------')
    for i in range(0, len(series_list)):
        temp = make_series(series_list[i], series_name=detail_keys[i])
        print('----------------'  + '----------------')
        temp = temp.value_counts().loc[temp.value_counts().values > total_homes* percent_threshold]
        display(temp.name, temp, temp.index)
        #series_list[i] = temp.value_counts().loc[temp.value_counts().values > num_homes* percent_threshold]
    #return series_list

def get_standardData(data_dict, house):
    standard_data_cols = [
    'Time_On_Zillow', 'Living_Area','Price', 'Zestimate','Price_Cut_Date',
    'Number_Of_Photos', 'Address', 'Views', 'Longitude','Latitude',
    'Saves', 'Price_Cut', 'Cooling:','Parking:', 'Heating:',
    'Year_Built:', 'Type:', 'Price/sqft:', 'Lot:'
    ]
    d = {k: data_dict[house].get(k, None) for k in standard_data_cols}
    
    return pd.DataFrame(d, index=[house])

def get_schoolData(data_dict, house):
    school_data = data_dict[house]['Schools']
    if school_data:
        school_data_listed = True
        average_rating = []
        average_distance = []
        for school in school_data.keys():
            rating = school_data[school]['Rating']
            distance = school_data[school]['Distance:']
            if rating == 'NA':
                rating = 0
            else:
                rating = np.array(int(re.match("\d", string=rating)[0]))
            distance = np.array(float(re.match("\d*\.?\d*", string=distance)[0]))
            average_rating.append(rating)
            average_distance.append(distance)
        average_rating = np.mean(np.array(average_rating))
        average_distance = np.mean(np.array(average_distance))
    else:
        school_data_listed = False
        average_rating = 0
        average_distance = 0
    d = {'Average_School_Rating': average_rating, 'Average_School_Distance': average_distance, 'School_Listed_Binary': school_data_listed}
    return pd.DataFrame(d, index=[house])
    
def get_aggregatedData(file_list):
    sub_DataColumns = {
    'Interior_Details': [
        'Total_Interior_Livable_Area', 'Heating_Features', 'Bathrooms',
        'Basement', 'Bedrooms', 'Full_Bathrooms', 'Has_Cooling', 'Number_Of_Appliances'
    ],
    'Property_Details': [
        'Exterior_Features', 'Lot_Size', 'Parcel_Number', 'Stories',
        'Parking_Features', 'Garage_Spaces'
    ],
    'Construction_Details': [
        'New_Construction', 'Home_Type', 'Year_Built', 'Roof', 'Foundation',
        'Construction_Materials', 'Architectural_Style','Major_Remodel_Year'
    ],
    'Building_Details': [
        'Rent_Control', 'Structure_Type'
    ],
    'Hoa_And_Financial_Details': [
        'Has_Hoa_Fee', 'Tax_Assessed_Value', 'Annual_Tax_Amount'
    ]
    }
    sub_detail_keys = [
        'Interior_Details',
        'Property_Details',
        'Construction_Details',
        'Building_Details',
        'Hoa_And_Financial_Details'
    ]
    cols = [
    'Time_On_Zillow', 'Living_Area', 'Average_School_Rating', 'Average_School_Distance',
    'Description_Score', 'Price', 'Zestimate', 'Price_Cut_Date','Hoa_Fee',
    'Number_Of_Photos', 'Address', 'Views', 'Longitude', 'Saves', 'Number_Of_Appliances',
    'Latitude', 'Price_Cut', 'Cooling:','Parking:', 'Heating:','Year_Built:', 'Type:', 'Price/sqft:', 'Lot:',
    'Other', 'Total_Interior_Livable_Area','Heating_Features', 'Bathrooms','Basement', 'Bedrooms', 'Full_Bathrooms', 'Has_Cooling',
    'Exterior_Features', 'Lot_Size', 'Parcel_Number', 'Stories','Parking_Features', 'Garage_Spaces',
    'New_Construction', 'Home_Type', 'Year_Built', 'Roof', 'Foundation', 'Construction_Materials',
    'Architectural_Style', 'Rent_Control', 'Structure_Type', 'Has_Hoa_Fee', 'Tax_Assessed_Value', 'Annual_Tax_Amount'
    ]
    data_dict_path = 'C:/Users/austi/Documents/Github_Repos/Imperial_Applied_Project/Detroit/Saved_Data/Test_Data_Dict/'
    total_value = pd.DataFrame()
    total_value.index.name = 'House_Id'
    no_key_count = 0
    
    for file in file_list:
        data_dict = load_Dictionary(data_dict_path, file_name=file)
        print(f'Working on: {file}')
        for house in list(data_dict.keys()):
            row_value = pd.DataFrame(index=[house])
            row_value.index.name = 'House_Id'
            for sub_detail_key in sub_detail_keys:
                #print(sub_detail_key)
                #print(house)
                if sub_detail_key not in data_dict[house].keys():
                    no_key_count+=1
                    #print(f'---------------------------- {house} Has No Key: {sub_detail_key} ----------------------------')
                    temp_data = pd.DataFrame(columns=sub_DataColumns[sub_detail_key], index=[house])

                if sub_detail_key in data_dict[house].keys():
                    sub_data_values = data_dict[house][sub_detail_key]
                    temp_data = pd.DataFrame.from_dict(sub_data_values, orient='index').T
                    temp_data.index = [house]
                
                temp_data.index.name = 'House_Id'
                if sub_detail_key == 'Interior_Details':
                    if 'Appliances_Included_In_Sale' in temp_data.columns:
                        temp_data['Number_Of_Appliances'] = len(temp_data['Appliances_Included_In_Sale'].values[0].split(','))
                    else:
                        temp_data['Number_Of_Appliances'] = 0

                if sub_detail_key == 'Hoa_And_Financial_Details':
                    if 'Hoa_Fee' not in temp_data.columns:
                        temp_data['Hoa_Fee'] = 0
                #temp_data[sub_DataColumns[sub_detail_key]]
                intersection_cols = list(set.intersection(set(cols), set(temp_data.columns)))
                #display(temp_data[intersection_cols])
                #data = pd.merge(data, temp_data[intersection_cols], how='outer')
                #display(pd.merge(data, temp_data[intersection_cols], how='outer'))
                row_value = pd.concat([row_value, temp_data[intersection_cols]], axis=1, sort=False)
            #display(row_value.T)
            #print(f'House Complete: {house}')
            row_value = pd.concat([row_value, get_schoolData(data_dict, house)], axis=1, sort=False)
            row_value = pd.concat([row_value, get_standardData(data_dict, house)], axis=1, sort=False)
            total_value = pd.concat([total_value, row_value], axis=0, sort=False)
            #display(total_value)
    print(f'Number of missing keys: {no_key_count}')
    return total_value

IndentationError: expected an indented block (<ipython-input-27-b1548629ce18>, line 3)