# Zillow Dataset Cleaning (housing data)

In [1]:
import pandas as pd


# Import datasets

# Houses Data
house_bd1_df = pd.read_csv('datasets/zillow/Housing/City_Zhvi_1bedroom.csv', encoding = 'iso-8859-1')
house_bd2_df = pd.read_csv('datasets/zillow/Housing/City_Zhvi_2bedroom.csv', encoding = 'iso-8859-1')
house_bd3_df = pd.read_csv('datasets/zillow/Housing/City_Zhvi_3bedroom.csv', encoding = 'iso-8859-1')
house_bd4_df = pd.read_csv('datasets/zillow/Housing/City_Zhvi_4bedroom.csv', encoding = 'iso-8859-1')
house_bd5_df = pd.read_csv('datasets/zillow/Housing/City_Zhvi_5bedroomOrMore.csv', encoding = 'iso-8859-1')

# Rentals Data
rental_bd1_df = pd.read_csv('datasets/zillow/Rentals/City_MedianRentalPrice_1Bedroom.csv', encoding = 'iso-8859-1')
rental_bd2_df = pd.read_csv('datasets/zillow/Rentals/City_MedianRentalPrice_2Bedroom.csv', encoding = 'iso-8859-1')
rental_bd3_df = pd.read_csv('datasets/zillow/Rentals/City_MedianRentalPrice_3Bedroom.csv', encoding = 'iso-8859-1')
rental_bd4_df = pd.read_csv('datasets/zillow/Rentals/City_MedianRentalPrice_4Bedroom.csv', encoding = 'iso-8859-1')
rental_bd5_df = pd.read_csv('datasets/zillow/Rentals/City_MedianRentalPrice_5BedroomOrMore.csv', encoding = 'iso-8859-1')

# Census Data
census_df = pd.read_csv('datasets/all_census_formated.csv')

# Shapes of housing data
print('Shape of house_bd1_df:', house_bd1_df.shape)
print('Shape of house_bd2_df:', house_bd2_df.shape)
print('Shape of house_bd3_df:', house_bd3_df.shape)
print('Shape of house_bd4_df:', house_bd4_df.shape)
print('Shape of house_bd5_df:', house_bd5_df.shape)

# Shapes of Rentals data
print('Shape of rental_bd1_df:', rental_bd1_df.shape)
print('Shape of rental_bd2_df:', rental_bd2_df.shape)
print('Shape of rental_bd3_df:', rental_bd3_df.shape)
print('Shape of rental_bd4_df:', rental_bd4_df.shape)
print('Shape of rental_bd5_df:', rental_bd5_df.shape)

  interactivity=interactivity, compiler=compiler, result=result)


Shape of house_bd1_df: (11106, 149)
Shape of house_bd2_df: (19552, 149)
Shape of house_bd3_df: (21749, 149)
Shape of house_bd4_df: (21863, 149)
Shape of house_bd5_df: (17569, 149)
Shape of rental_bd1_df: (803, 123)
Shape of rental_bd2_df: (1263, 123)
Shape of rental_bd3_df: (989, 123)
Shape of rental_bd4_df: (317, 123)
Shape of rental_bd5_df: (53, 112)


## Format city and state names

In [2]:
def format_names(X):
    
    full_name = []
    for x in range(0,len(X.RegionName)):
        full_name.append(X.RegionName[x] + ' ' + X.State[x])
    
    #insert full_name column
    X['full_name'] = full_name
    
    return X    

In [3]:
# Apply format_names function to all dataframes

house_bd1_df = format_names(house_bd1_df)
house_bd2_df = format_names(house_bd2_df)
house_bd3_df = format_names(house_bd3_df)
house_bd4_df = format_names(house_bd4_df)
house_bd5_df = format_names(house_bd5_df)

rental_bd1_df = format_names(rental_bd1_df)
rental_bd2_df = format_names(rental_bd2_df)
rental_bd3_df = format_names(rental_bd3_df)
rental_bd4_df = format_names(rental_bd4_df)
rental_bd5_df = format_names(rental_bd5_df)

## Drop cities not in largest dataset

In [4]:
def drop_cities(df):
    """
    This function drops cities not included in the largest dataset, "house_bd4_df"
    """
    ""
    cities = []
    for city in df['full_name']:
        if city not in list(house_bd4_df['full_name']):
            cities.append(city)
            
    # drop cities
    for city in cities:
        df = df[df['full_name'] != city]
    
    return df

In [5]:
# Apply drop_cities function to all dataframes

house_bd1_df = drop_cities(house_bd1_df)
print("Finished Processing house_bd1_df...")
house_bd2_df = drop_cities(house_bd2_df)
print("Finished Processing house_bd2_df...")
house_bd3_df = drop_cities(house_bd3_df)
print("Finished Processing house_bd3_df...")
house_bd4_df = drop_cities(house_bd4_df)
print("Finished Processing house_bd4_df...")
house_bd5_df = drop_cities(house_bd5_df)
print("Finished Processing house_bd5_df...")

rental_bd1_df = drop_cities(rental_bd1_df)
print("Finished Processing rental_bd1_df...")
rental_bd2_df = drop_cities(rental_bd2_df)
print("Finished Processing rental_bd2_df...")
rental_bd3_df = drop_cities(rental_bd3_df)
print("Finished Processing rental_bd3_df...")
rental_bd4_df = drop_cities(rental_bd4_df)
print("Finished Processing rental_bd4_df...")
rental_bd5_df = drop_cities(rental_bd5_df)
print("Finished Processing rental_bd5_df...")

Finished Processing house_bd1_df...
Finished Processing house_bd2_df...
Finished Processing house_bd3_df...
Finished Processing house_bd4_df...
Finished Processing house_bd5_df...
Finished Processing rental_bd1_df...
Finished Processing rental_bd2_df...
Finished Processing rental_bd3_df...
Finished Processing rental_bd4_df...
Finished Processing rental_bd5_df...


In [6]:
# Shapes of housing data
print('Shape of house_bd1_df:', house_bd1_df.shape)
print('Shape of house_bd2_df:', house_bd2_df.shape)
print('Shape of house_bd3_df:', house_bd3_df.shape)
print('Shape of house_bd4_df:', house_bd4_df.shape)
print('Shape of house_bd5_df:', house_bd5_df.shape)

# Shapes of Rentals data
print('Shape of rental_bd1_df:', rental_bd1_df.shape)
print('Shape of rental_bd2_df:', rental_bd2_df.shape)
print('Shape of rental_bd3_df:', rental_bd3_df.shape)
print('Shape of rental_bd4_df:', rental_bd4_df.shape)
print('Shape of rental_bd5_df:', rental_bd5_df.shape)

Shape of house_bd1_df: (10314, 150)
Shape of house_bd2_df: (17787, 150)
Shape of house_bd3_df: (18852, 150)
Shape of house_bd4_df: (21863, 150)
Shape of house_bd5_df: (16562, 150)
Shape of rental_bd1_df: (795, 124)
Shape of rental_bd2_df: (1245, 124)
Shape of rental_bd3_df: (963, 124)
Shape of rental_bd4_df: (314, 124)
Shape of rental_bd5_df: (52, 113)


In [7]:
# Rename house dataset columns to differentiate between bedrooms for merging datasets

for index in range(6,148):
    house_bd1_df.rename(columns = {house_bd1_df.columns[index]:'bd1-'+ house_bd1_df.columns[index]}, inplace = True)
    
for index in range(6,148):
    house_bd2_df.rename(columns = {house_bd2_df.columns[index]:'bd2-'+ house_bd2_df.columns[index]}, inplace = True)
    
for index in range(6,148):
    house_bd3_df.rename(columns = {house_bd3_df.columns[index]:'bd3-'+ house_bd3_df.columns[index]}, inplace = True)
    
for index in range(6,148):
    house_bd4_df.rename(columns = {house_bd4_df.columns[index]:'bd4-'+ house_bd4_df.columns[index]}, inplace = True)
    
for index in range(6,148):
    house_bd5_df.rename(columns = {house_bd5_df.columns[index]:'bd5'+ house_bd5_df.columns[index]}, inplace = True)

In [8]:
# Rename rental dataset columns to differentiate between bedrooms for merging datasets

for index in range(5,122):
    rental_bd1_df.rename(columns = {rental_bd1_df.columns[index]:'bd1-'+ rental_bd1_df.columns[index]}, inplace = True)
    
for index in range(5,122):
    rental_bd2_df.rename(columns = {rental_bd2_df.columns[index]:'bd2-'+ rental_bd2_df.columns[index]}, inplace = True)

for index in range(5,122):
    rental_bd3_df.rename(columns = {rental_bd3_df.columns[index]:'bd3-'+ rental_bd3_df.columns[index]}, inplace = True)
    
for index in range(5,122):
    rental_bd4_df.rename(columns = {rental_bd4_df.columns[index]:'bd4-'+ rental_bd4_df.columns[index]}, inplace = True)
    
for index in range(5,111):
    rental_bd5_df.rename(columns = {rental_bd5_df.columns[index]:'bd5-'+ rental_bd5_df.columns[index]}, inplace = True)

In [49]:
# Merge datasets

# Add housing data
new_df = pd.concat([house_bd4_df,house_bd1_df], axis = 1)
new_df = pd.concat([new_df,house_bd2_df], axis = 1)
new_df = pd.concat([new_df,house_bd3_df], axis = 1)
new_df = pd.concat([new_df,house_bd5_df], axis = 1)

# Add rental data
new_df = pd.concat([new_df,rental_bd1_df], axis = 1)
new_df = pd.concat([new_df,rental_bd2_df], axis = 1)
new_df = pd.concat([new_df,rental_bd3_df], axis = 1)
new_df = pd.concat([new_df,rental_bd4_df], axis = 1)
new_df = pd.concat([new_df,rental_bd5_df], axis = 1)

# Reset index
new_df = new_df.reset_index()

new_df.shape

(21863, 1360)

In [54]:
# Find these cities and confirm their names

city_missing = ['Anchorage AK', 'Athensv GA', 'Augusta GA', 'Lexington KY', 'Louisville KY']

check = house_bd4_df.full_name.values
def find_missing(city):
    for i in check:
        if city in i:
            print(i)

for x in city_missing:
    find_missing(x)

Anchorage AK
Augusta GA
Lexington KY
Louisville KY


In [76]:
# Create a function to create true/false columns in the census data for each city in each dataset 
# True will mean that the data for the city is within the dataset
# False will mean that the data for the city is not within the dataset


def create_contain_column(X):
    """
    returns a dataframe
    X should be the Census Data
    """
    # Create empty lists
    house_bd1 = []
    house_bd2 = []
    house_bd3 = []
    house_bd4 = []
    house_bd5 = []
    
    rental_bd1 = []
    rental_bd2 = []
    rental_bd3 = []
    rental_bd4 = []
    rental_bd5 = []
    
    # House datasets - Check to see if the city names in the census data is in the House datasets
    
    for name in X.full_name:
        if name in house_bd1_df.full_name.values:
            house_bd1.append(True)
        else:
            house_bd1.append(False)
    
    for name in X.full_name:
        if name in house_bd2_df.full_name.values:
            house_bd2.append(True)
        else:
            house_bd2.append(False)    
            
    for name in X.full_name:
        if name in house_bd3_df.full_name.values:
            house_bd3.append(True)
        else:
            house_bd3.append(False)       
            
    for name in X.full_name:
        if name in house_bd4_df.full_name.values:
            house_bd4.append(True)
        else:
            house_bd4.append(False)            
            
    for name in X.full_name:
        if name in house_bd5_df.full_name.values:
            house_bd5.append(True)
        else:
            house_bd5.append(False)              
    
    
    # Rental datasets - Check to see if the city names in the census data is in the rental datasets
    
    for name in X.full_name:
        if name in rental_bd1_df.full_name.values:
            rental_bd1.append(True)
        else:
            rental_bd1.append(False)   
            
    for name in X.full_name:
        if name in rental_bd2_df.full_name.values:
            rental_bd2.append(True)
        else:
            rental_bd2.append(False)   
            
    for name in X.full_name:
        if name in rental_bd3_df.full_name.values:
            rental_bd3.append(True)
        else:
            rental_bd3.append(False)  
            
    for name in X.full_name:
        if name in rental_bd4_df.full_name.values:
            rental_bd4.append(True)
        else:
            rental_bd4.append(False)   
            
            
    for name in X.full_name:
        if name in rental_bd5_df.full_name.values:
            rental_bd5.append(True)
        else:
            rental_bd5.append(False)   
            
            
    print(len(house_bd1))        
            
    
    # Add columns from lists to census dataframe
    X['zillow_house_bd1'] = house_bd1
    X['zillow_house_bd2'] = house_bd2
    X['zillow_house_bd3'] = house_bd3
    X['zillow_house_bd4'] = house_bd4
    X['zillow_house_bd5'] = house_bd5
    X['zillow_rental_bd1'] = rental_bd1
    X['zillow_rental_bd2'] = rental_bd2
    X['zillow_rental_bd3'] = rental_bd3
    X['zillow_rental_bd4'] = rental_bd4
    X['zillow_rental_bd5'] = rental_bd5
    
    
    return X

In [None]:
census_df_zillow = create_contain_column(census_df)