### Import Dependencies & Load Data

In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import plotly.express as px

# For GEOPY (alternate)
from geopy.geocoders import Nominatim

# Import the time library and the datetime module from the datetime library 
import time
from datetime import datetime

In [None]:
# Load the real estate listings CSV into a dataframe and preview the first rows
real_estate_df = pd.read_csv(filepath_or_buffer="Resources/Real_Estate_Data.csv")
real_estate_df.head(n=5)

In [None]:
# Load the Trader Joe's store list CSV into a dataframe and preview the first rows
trader_joes_df = pd.read_csv(filepath_or_buffer="Resources/Trader_Joes_Stores.csv")
trader_joes_df.head(n=5)

In [None]:
# Load the Walmart store list CSV into a dataframe and preview the first rows
walmart_df = pd.read_csv(filepath_or_buffer="Resources/Walmart_Stores.csv")
walmart_df.head(n=5)

### Clean Real_Estate_Data.csv

In [None]:
# Number of rows in the raw real estate dataframe
real_estate_df.shape[0]

In [None]:
# Count the missing values per column
real_estate_df.isna().sum()

In [None]:
# List the distinct values that appear in the status column
real_estate_df["status"].unique()

In [None]:
# Remove the status, street, and sold_date columns, then preview the result
unused_columns = ["status", "street", "sold_date"]
real_estate_df = real_estate_df.drop(columns=unused_columns)
real_estate_df.head(n=5)

In [None]:
# Treat a missing lot size as 0 acres so these rows survive the later dropna
acre_lot_filled = real_estate_df['acre_lot'].fillna(value=0)
real_estate_df['acre_lot'] = acre_lot_filled

In [None]:
# Discard every row that still has a missing value in any column
real_estate_df = real_estate_df.dropna(axis=0, how="any")

In [None]:
# Show the first rows of the dataframe
real_estate_df.head(n=5)

In [None]:
# Row count once the rows with missing values have been dropped
real_estate_df.shape[0]

In [None]:
# List the distinct values that appear in the state column
real_estate_df["state"].unique()

In [None]:
# Exclude listings from states / territories that do not have a TJs

states_without_tjs = ["Puerto Rico", "Virgin Islands", "Wyoming", "West Virginia"]
in_excluded_state = real_estate_df["state"].isin(states_without_tjs)
real_estate_df = real_estate_df[~in_excluded_state]
real_estate_df.head(n=5)

In [None]:
# Inspect the column datatypes; the one of interest is zip_code, which the
# next cell converts to a zero-padded string
real_estate_df.dtypes

In [None]:
# Store zip_code as a 5-character string: cast to int first, then left-pad
# the text form with zeros
zip_as_int = real_estate_df['zip_code'].astype(int)
real_estate_df['zip_code'] = zip_as_int.astype(str).str.zfill(5)
real_estate_df

In [None]:
# Remove apartment / unit designators from addresses (necessary for geocoding)

# Patterns for "Unit <id>" / "Apt <id>" together with any leading whitespace.
# The keyword must start a word and be followed either by whitespace or
# directly by a digit, so "Unit 4B", "Unit4" and "Apt 12" are stripped while
# words that merely begin with the keyword ("Unity", "United", "Aptos") are
# left alone.
form_unit = r'\s*\bUnit(?:\s+|(?=\d))[a-zA-Z0-9]+'
form_apt = r'\s*\bApt(?:\s+|(?=\d))[a-zA-Z0-9]+'

# Remove apt and units from full address.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every address
# unchanged.
real_estate_df['street_address'] = (
    real_estate_df['full_address']
    .str.replace(form_unit, '', regex=True)
    .str.replace(form_apt, '', regex=True)
)
real_estate_df.head()

In [None]:
# Row count after the state filtering and address cleanup
real_estate_df.shape[0]

### Clean Trader_Joes_Stores.csv

In [None]:
# Remove the phone and website columns, then preview the result
contact_columns = ["phone", "website"]
trader_joes_df = trader_joes_df.drop(columns=contact_columns)
trader_joes_df.head(n=5)

In [None]:
# Store zip as a 5-character string: cast to int first, then left-pad the
# text form with zeros
tj_zip_as_int = trader_joes_df['zip'].astype(int)
trader_joes_df['zip'] = tj_zip_as_int.astype(str).str.zfill(5)
trader_joes_df.head(n=5)

In [None]:
# List the distinct values that appear in the Trader Joe's state column
trader_joes_df["state"].unique()

In [None]:
# Dropping states with TJs that do not have real estate sales

# State abbreviations to keep
trader_joes_states = ['VT', 'ME', 'RI', 'NH', 'CT', 'PA', 'DE', 'NJ', 'MA', 'NY']

# Keep the stores from every listed state in a single pass. Filtering one
# state per loop iteration would overwrite the result each time and leave
# only the last state in the list.
new_trader_joes_df = trader_joes_df[trader_joes_df.state.isin(trader_joes_states)]
new_trader_joes_df.head()

### Clean Walmart_Stores.csv

In [None]:
# Remove the URL, contact, opening-hours and social-media columns, then
# preview the result
walmart_unused_columns = [
    "url",
    "phone_number_1",
    "phone_number_2",
    "fax_1",
    "fax_2",
    "email_1",
    "email_2",
    "website",
    "open_hours",
    "facebook",
    "twitter",
    "instagram",
    "pinterest",
    "youtube",
]
walmart_df = walmart_df.drop(columns=walmart_unused_columns)
walmart_df.head(n=5)

### Using Latitude & Longitude Values to Calculate the Distance to Trader Joe's Stores

In [None]:
# Add empty-string placeholder columns for the geocoded latitude, longitude
# and resolved address
for placeholder_column in ('location_lat', 'location_long', 'location_address'):
    real_estate_df[placeholder_column] = ""
real_estate_df.head(n=5)

In [None]:
# Getting lat / long for real estate addresses with GeoPy (Nominatim)

# RateLimiter spaces out the requests and retries transient service errors
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="myApp")

# Nominatim's public service allows at most one request per second. Once its
# retries are exhausted RateLimiter returns None instead of raising, so a
# failed lookup is handled exactly like an address with no match.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Progress is reported in sets of this many records
records_per_set = 1000

# Use API to add lat and long
for position, i in enumerate(real_estate_df.index):

    # Derive the counters from the row's position rather than its index
    # label: the labels have gaps after the earlier dropna / state filtering,
    # so a label-based modulo does not roll over every 1000 records.
    set_count, record_count = divmod(position, records_per_set)
    set_count += 1
    record_count += 1

    # Log the record and set numbers.
    print(f"Processing Record {record_count} of Set {set_count}")

    # Look up the cleaned (apt/unit-free) street address
    location = geocode(real_estate_df.loc[i, 'street_address'])

    if location is None:
        # No match (or the service kept failing): keep the empty-string
        # placeholders for this row
        real_estate_df.loc[i, 'location_lat'] = ""
        real_estate_df.loc[i, 'location_long'] = ""
        real_estate_df.loc[i, 'location_address'] = ""
        continue

    # Store the coordinates and the address Nominatim resolved the query to
    real_estate_df.loc[i, 'location_lat'] = location.latitude
    real_estate_df.loc[i, 'location_long'] = location.longitude
    real_estate_df.loc[i, 'location_address'] = location.address


# Indicate that Data Loading is complete.
print("-----------------------------")
print("Data Retrieval Complete      ")
print("-----------------------------")

In [None]:
# # FOR ZIP CODE 
# # Find whether the zip code of the address has a Trader Joe's 

# # Insert TJs binary column in df
# real_estate_df["TJs_store"] = ""

# # Create empty list for TJ stores
# zip_code_matching = []

# # For loop to check matching TJs zip code
# for zip_code in range(len(real_estate_df)):
    
#     if zip_code in trader_joes_df['zip']:
#         zip_code_matching.append('Yes')
#     else:
#         zip_code_matching.append('No')
       
# real_estate_df["TJs_store"] = zip_code_matching

# real_estate_df.sample(10)

In [None]:
# # Find whether the zip code of the address has a Walmart

# # Insert Walmart binary column in df
# real_estate_df["Walmart_store"] = ""

# # Create empty list for Walmart stores
# walmart_zip_code_matching = []

# # For loop to check matching Walmart zip code
# for zip_code in range(len(real_estate_df)):
    
#     if zip_code in walmart_df['zip_code']:
#         walmart_zip_code_matching.append('Yes')
#     else:
#         walmart_zip_code_matching.append('No')
       
# real_estate_df["Walmart_store"] = walmart_zip_code_matching

# real_estate_df.sample(10)

In [None]:
# # Single address lat/long test
# gmaps_key = googlemaps.Client(key=g_key)
# geocode_obj = gmaps_key.geocode("23 Moore St, Agawam, MA, 01001")
# geocode_obj

In [None]:
# Finding lat / long for all addresses with the Google Maps Geocoding API

# Create the API client.
# NOTE(review): neither `googlemaps` nor `g_key` is imported or defined in any
# visible cell of this notebook, so this cell raises NameError unless they
# are provided elsewhere - confirm where the client library and the API key
# come from.
gmaps_key = googlemaps.Client(key=g_key)

# Create columns to store longitude and latitude
real_estate_df["longitude"] = None
real_estate_df["latitude"] = None

# Progress is reported in sets of this many records
records_per_set = 1000

# Use API to add lat and long
for position, i in enumerate(real_estate_df.index):

    # Derive the counters from the row's position rather than its index
    # label: the labels have gaps after the earlier dropna / state filtering,
    # so a label-based modulo does not roll over every 1000 records.
    # Counting starts at 1, matching the GeoPy cell.
    set_count, record_count = divmod(position, records_per_set)
    set_count += 1
    record_count += 1

    # Log the record and set numbers.
    print(f"Processing Record {record_count} of Set {set_count}")

    geocode_obj = gmaps_key.geocode(real_estate_df.loc[i, "full_address"])

    try:
        # The first result is used as the match for the address
        coordinates = geocode_obj[0]['geometry']['location']
        lat = coordinates['lat']
        lon = coordinates['lng']
    except (IndexError, KeyError, TypeError):
        # Empty or malformed response: leave latitude / longitude as None
        continue

    real_estate_df.loc[i, 'latitude'] = lat
    real_estate_df.loc[i, 'longitude'] = lon

# Indicate that Data Loading is complete.
print("-----------------------------")
print("Data Retrieval Complete      ")
print("-----------------------------")

In [None]:
# Count the missing values per column, including the new lat / long columns
real_estate_df.isna().sum()

In [None]:
# # Finding the distance from house address to Trader Joes
# # https://stackoverflow.com/questions/51793928/distance-between-two-points-in-pandas-csv-data-frame

# from math import sin, cos, sqrt, atan2, radians
# def calculate_distance(lat1, lon1, lat2, lon2):
#     R = 6373.0

#     lat1 = radians(lat1)
#     lon1 = radians(lon1)
#     lat2 = radians(lat2)
#     lon2 = radians(lon2)

#     dlon = lon2 - lon1
#     dlat = lat2 - lat1

#     a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
#     c = 2 * atan2(sqrt(a), sqrt(1 - a))

#     return R * c

# df['distance'] = [calculate_distance(**df[['lat1', 'lon1', 'lat2', 'lon2']].iloc[i].to_dict()) for i in range(df.shape[0])]

### Inspect Real_Estate_Data.csv Data

In [None]:
# Pairwise correlation between the numeric columns.
# TODO: look at the documentation for this and add other variables
# https://www.geeksforgeeks.org/python-pandas-dataframe-corr/
#
# The dataframe also holds text columns (e.g. state and the zero-padded
# zip_code). Since pandas 2.0 DataFrame.corr() raises on non-numeric columns
# instead of silently skipping them, so select the numeric columns explicitly.
real_estate_df.select_dtypes(include="number").corr()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) to inspect the
# distribution of the columns
real_estate_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Histogram of listing prices before outlier removal
fig = px.histogram(data_frame=real_estate_df, x='price')
fig.show()

In [None]:
# Box plot of listing prices before outlier removal
fig = px.box(data_frame=real_estate_df, y='price')
fig.show()

In [None]:
# Scatter plot of price (x axis) against number of bedrooms (y axis)
price_values = real_estate_df['price']
bed_values = real_estate_df['bed']
fig = px.scatter(x=price_values, y=bed_values)
fig.show()

In [None]:
# Find outliers for price
def find_outliers_IQR(real_estate_df):
    """Return the values lying outside the 1.5 * IQR fences.

    The argument may be a pandas Series or DataFrame of numeric data. The
    fences are the first quartile minus 1.5 * IQR and the third quartile
    plus 1.5 * IQR, computed per column for a DataFrame.

    For a Series only the outlying entries are returned. For a DataFrame the
    result keeps the original shape, with every non-outlying cell set to NaN.
    """
    lower_quartile = real_estate_df.quantile(0.25)
    upper_quartile = real_estate_df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread

    # True wherever a value falls strictly below / above the fences
    is_outlier = (real_estate_df < lower_fence) | (real_estate_df > upper_fence)
    return real_estate_df[is_outlier]

# Flag the price outliers and report how many there are and their range
outliers = find_outliers_IQR(real_estate_df['price'])
outlier_count = len(outliers)
largest_outlier = outliers.max()
smallest_outlier = outliers.min()
print('number of outliers: ' + str(outlier_count))
print('max outlier value: ' + str(largest_outlier))
print('min outlier value: ' + str(smallest_outlier))

In [None]:
# Flag outliers in each numeric feature column. Non-outlying cells come back
# as NaN, and dropna(thresh=2) keeps only rows with at least two non-NaN
# cells, i.e. rows that are outliers in two or more of these columns.
outlier_columns = ['price','bed', 'bath', 'acre_lot', 'house_size']
flagged_values = find_outliers_IQR(real_estate_df[outlier_columns])
outliers = flagged_values.dropna(thresh=2)
outliers

In [None]:
# Expected row count once the flagged outlier rows are removed
real_estate_df.shape[0] - outliers.shape[0]

In [None]:
# Keep only the rows whose index label is not among the flagged outliers
is_flagged_row = real_estate_df.index.isin(outliers.index)
clean_real_estate_df = real_estate_df.loc[~is_flagged_row]
clean_real_estate_df.head(n=5)

In [None]:
# Actual row count of the outlier-free dataframe
clean_real_estate_df.shape[0]

In [None]:
# Remove the $160 mil house by keeping listings priced at or below 160,000,000
# NOTE(review): the comparison is inclusive, so a listing priced at exactly
# 160,000,000 is kept - confirm that the house meant here is priced above it
within_price_cap = clean_real_estate_df["price"].le(160_000_000)
clean_real_estate_df = clean_real_estate_df[within_price_cap]

In [None]:
# Histogram of listing prices after outlier removal
fig = px.histogram(data_frame=clean_real_estate_df, x='price')
fig.show()

In [None]:
# Box plot of listing prices after outlier removal
fig = px.box(data_frame=clean_real_estate_df, y='price')
fig.show()

In [None]:
# Scatter plot of price (x axis) against number of bedrooms (y axis) for the
# cleaned data
clean_price_values = clean_real_estate_df['price']
clean_bed_values = clean_real_estate_df['bed']
fig = px.scatter(x=clean_price_values, y=clean_bed_values)
fig.show()

In [None]:
# Write the cleaned dataframe (without the index) as a CSV inside a zip archive.
# The archive member name includes the "Resources/" prefix, so the CSV sits
# under that folder inside the zip file.
compression_opts = {
    'method': 'zip',
    'archive_name': 'Resources/Clean_Real_Estate.csv',
}
clean_real_estate_df.to_csv(
    'Resources/Clean_Real_Estate.zip',
    index=False,
    compression=compression_opts,
)