# Zillow Dataset Cleaning (housing data)

In [44]:
import pandas as pd


# Import datasets

def _read_zillow(path):
    """Read one Zillow CSV file, decoding it as ISO-8859-1."""
    return pd.read_csv(path, encoding='iso-8859-1')


# Houses Data (one table per bedroom count)
house_bd1_df = _read_zillow('datasets/zillow/Housing/City_Zhvi_1bedroom.csv')
house_bd2_df = _read_zillow('datasets/zillow/Housing/City_Zhvi_2bedroom.csv')
house_bd3_df = _read_zillow('datasets/zillow/Housing/City_Zhvi_3bedroom.csv')
house_bd4_df = _read_zillow('datasets/zillow/Housing/City_Zhvi_4bedroom.csv')
house_bd5_df = _read_zillow('datasets/zillow/Housing/City_Zhvi_5bedroomOrMore.csv')

# Rentals Data (one table per bedroom count)
rental_bd1_df = _read_zillow('datasets/zillow/Rentals/City_MedianRentalPrice_1Bedroom.csv')
rental_bd2_df = _read_zillow('datasets/zillow/Rentals/City_MedianRentalPrice_2Bedroom.csv')
rental_bd3_df = _read_zillow('datasets/zillow/Rentals/City_MedianRentalPrice_3Bedroom.csv')
rental_bd4_df = _read_zillow('datasets/zillow/Rentals/City_MedianRentalPrice_4Bedroom.csv')
rental_bd5_df = _read_zillow('datasets/zillow/Rentals/City_MedianRentalPrice_5BedroomOrMore.csv')

# Census Data (read with pandas' default encoding)
census_df = pd.read_csv('datasets/all_census_formated.csv')

# Report (rows, columns) for the housing tables first, then the rental tables
for _label, _frame in (
    ('Shape of house_bd1_df:', house_bd1_df),
    ('Shape of house_bd2_df:', house_bd2_df),
    ('Shape of house_bd3_df:', house_bd3_df),
    ('Shape of house_bd4_df:', house_bd4_df),
    ('Shape of house_bd5_df:', house_bd5_df),
    ('Shape of rental_bd1_df:', rental_bd1_df),
    ('Shape of rental_bd2_df:', rental_bd2_df),
    ('Shape of rental_bd3_df:', rental_bd3_df),
    ('Shape of rental_bd4_df:', rental_bd4_df),
    ('Shape of rental_bd5_df:', rental_bd5_df),
):
    print(_label, _frame.shape)

Shape of house_bd1_df: (11106, 149)
Shape of house_bd2_df: (19552, 149)
Shape of house_bd3_df: (21749, 149)
Shape of house_bd4_df: (21863, 149)
Shape of house_bd5_df: (17569, 149)
Shape of rental_bd1_df: (803, 123)
Shape of rental_bd2_df: (1263, 123)
Shape of rental_bd3_df: (989, 123)
Shape of rental_bd4_df: (317, 123)
Shape of rental_bd5_df: (53, 112)


## Format city and state names

In [45]:
def format_names(X):
    """
    Add a ``full_name`` column made of the region name and the state.

    The new column is ``RegionName + ' ' + State`` for every row. The
    concatenation is done column-wise, so it is aligned on the frame's own
    index and works for any index (not only the default 0..n-1 labels that
    a per-row ``X.RegionName[x]`` lookup would require).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that has ``RegionName`` and ``State`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, modified in place with the extra
        ``full_name`` column.
    """
    # insert full_name column
    X['full_name'] = X['RegionName'] + ' ' + X['State']

    return X

In [46]:
# Run format_names over every housing and rental table, in the same order
# as they were loaded, and rebind each name to the returned frame.

(
    house_bd1_df,
    house_bd2_df,
    house_bd3_df,
    house_bd4_df,
    house_bd5_df,
    rental_bd1_df,
    rental_bd2_df,
    rental_bd3_df,
    rental_bd4_df,
    rental_bd5_df,
) = [
    format_names(frame)
    for frame in (
        house_bd1_df,
        house_bd2_df,
        house_bd3_df,
        house_bd4_df,
        house_bd5_df,
        rental_bd1_df,
        rental_bd2_df,
        rental_bd3_df,
        rental_bd4_df,
        rental_bd5_df,
    )
]

## Drop cities not in largest dataset

In [50]:
def drop_cities(df, reference_df=None):
    """
    Drop the cities that are not included in the reference dataset.

    A row is kept when its ``full_name`` value also appears in the
    ``full_name`` column of the reference frame. The check is a single
    vectorised membership test instead of rebuilding the reference list for
    every row and re-filtering the frame once per missing city.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``full_name`` column.
    reference_df : pandas.DataFrame, optional
        Frame whose ``full_name`` column lists the cities to keep. When
        omitted, the module-level ``house_bd4_df`` (the largest dataset)
        is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` in their original order and with their
        original index labels. ``df`` itself is not modified.
    """
    if reference_df is None:
        # Looked up at call time so the current module-level frame is used.
        reference_df = house_bd4_df

    # NOTE(review): rows with a missing full_name are matched by isin only
    # when the reference also has a missing value - confirm if NaNs can occur.
    in_reference = df['full_name'].isin(reference_df['full_name'])

    return df[in_reference]

In [None]:
# Apply drop_cities function to all dataframes
# Each name is rebound to the filtered frame returned by drop_cities, and a
# progress line is printed after each table so a slow run can be followed.
# NOTE: drop_cities reads the module-level house_bd4_df as its reference, so
# the house_bd4_df call below filters that frame against its own city list,
# and every call after it uses the rebound house_bd4_df.

house_bd1_df = drop_cities(house_bd1_df)
print("Finished Processing house_bd1_df...")
house_bd2_df = drop_cities(house_bd2_df)
print("Finished Processing house_bd2_df...")
house_bd3_df = drop_cities(house_bd3_df)
print("Finished Processing house_bd3_df...")
house_bd4_df = drop_cities(house_bd4_df)
print("Finished Processing house_bd4_df...")
house_bd5_df = drop_cities(house_bd5_df)
print("Finished Processing house_bd5_df...")

# Rental tables are filtered against the same housing reference
rental_bd1_df = drop_cities(rental_bd1_df)
print("Finished Processing rental_bd1_df...")
rental_bd2_df = drop_cities(rental_bd2_df)
print("Finished Processing rental_bd2_df...")
rental_bd3_df = drop_cities(rental_bd3_df)
print("Finished Processing rental_bd3_df...")
rental_bd4_df = drop_cities(rental_bd4_df)
print("Finished Processing rental_bd4_df...")
rental_bd5_df = drop_cities(rental_bd5_df)
print("Finished Processing rental_bd5_df...")

Finished Processing house_bd1_df...
Finished Processing house_bd2_df...
Finished Processing house_bd3_df...
Finished Processing house_bd4_df...


In [None]:
# Report (rows, columns) for each table: housing tables first, then rentals
for _label, _frame in (
    ('Shape of house_bd1_df:', house_bd1_df),
    ('Shape of house_bd2_df:', house_bd2_df),
    ('Shape of house_bd3_df:', house_bd3_df),
    ('Shape of house_bd4_df:', house_bd4_df),
    ('Shape of house_bd5_df:', house_bd5_df),
    ('Shape of rental_bd1_df:', rental_bd1_df),
    ('Shape of rental_bd2_df:', rental_bd2_df),
    ('Shape of rental_bd3_df:', rental_bd3_df),
    ('Shape of rental_bd4_df:', rental_bd4_df),
    ('Shape of rental_bd5_df:', rental_bd5_df),
):
    print(_label, _frame.shape)