# DC AREA RENT ANALYSIS -- Group 6
## DNSC 6211: PROGRAMMING FOR ANALYTICS

* Dmitry Chudinovskikh
* Patrick Steeves 
* Daniel Swart 
* Szuying Yang 
* Xin Yuan

# INTRODUCTION

### OBJECTIVE: 
- Predicting the rent using the predictors below.


### PREDICTORS: 
- Num. of bedrooms 
- Website image count 
- Income by zip code 
- Amenities offered (air conditioning, washer/dryer, pets allowed, parking, dishwasher) 
- Distance to the nearest metro station


# Obtain Data and Data Formatting

## Data Sources

- Apartments.com for DC, Arlington, Alexandria, Bethesda, Silver Spring
- Individual Income Tax Statistics by ZIP Code Data  
- WMATA JSON data for metro station locations 

## Method 

- Web scraping in Python
- Pandas DataFrame

In [None]:
def getData(url):
    """
    Input: Base URL (string) of the search results; the page number and a
           trailing '/' are appended to it to request result pages 1 to 28
    Output: Nothing is written to disk by this function
    Returns: List of dictionaries, one per listing, with the keys 'Name',
             'Address', 'City', 'State', 'Zip', 'Last update',
             'Image count', 'Rent', 'Num bedrooms' and 'Pets Allowed'
    """
    # Prefix of the <article> class attribute for each kind of placard.
    # Within a page, listings are collected tier by tier in this order.
    tiers = ('diamond', 'platinum', 'gold', 'silver', 'prosumer', 'basic')

    apartments = []
    for page in range(1, 29):   # Get results one page at a time
        time.sleep(0.1)   # short pause between consecutive requests
        request = requests.get(url + str(page) + '/')
        soup = bs(request.text, "html.parser")
        for tier in tiers:
            for result in soup.findAll('article', {"class" : tier + " placard"}):
                apartments.append(_parseListing(result))
    return apartments


def _parseListing(result):
    """
    Input: One <article> placard tag from a result page
    Returns: Dictionary describing that listing (see getData for the keys)
    """
    apt = {}

    name = result.findAll('a', {"class" : "placardTitle js-placardTitle"})[0].getText()
    apt['Name'] = name

    # The location text is comma separated; three or more parts are read as
    # street / city / "state zip", otherwise as city / "state [zip]".
    location = result.findAll('div', {"class" : "location"})[0].getText()
    splits = location.split(',')
    if len(splits) > 2:
        state_zip = splits[2].split()
        apt['Address'] = splits[0]
        apt['City'] = splits[1].strip()   # stripped, like the branch below
        apt['State'] = state_zip[0]
        apt['Zip'] = state_zip[1] if len(state_zip) > 1 else ''
    else:
        state_zip = splits[1].split()
        apt['Address'] = name   # no street part, the listing name stands in
        apt['City'] = splits[0].strip()
        apt['State'] = state_zip[0]
        apt['Zip'] = int(state_zip[1]) if len(state_zip) > 1 else ''

    # A missing freshness tag yields '' instead of an IndexError.
    last_update = result.findAll('span', {"class" : "listingFreshness"})
    apt['Last update'] = last_update[0].getText().strip() if last_update else ''

    images = result.findAll('span', {"class" : "js-spnImgCount"})
    apt['Image count'] = images[0].getText() if images else 'Not specified'

    # Rent: for a "low - high" range the part after the '-' is kept.
    # A missing/blank price or a "Call ..." text is stored as ''.
    price = result.findAll('span', {"class" : "altRentDisplay"})
    rent = price[0].getText() if price else ''
    if '-' in rent:
        rent = rent.split('-')[1].replace('$','').strip()
    else:
        rent = rent.replace('$','')
    if not rent.strip() or 'Call' in rent:
        apt['Rent'] = ''
    else:
        apt['Rent'] = int(rent.replace(',',''))

    # Bedrooms: placards carrying a "unitLabel propertyStyle" span get 'N/A'.
    if result.findAll('span', {"class" : "unitLabel propertyStyle"}):
        apt['Num bedrooms'] = 'N/A'
    else:
        style = result.findAll('span', {"class" : "unitLabel"})
        if not style:
            apt['Num bedrooms'] = 'N/A'   # no unit label at all
        else:
            label = style[0].getText()
            words = label.split()
            if 'Studio' in label:
                # The third token is taken as the bedroom count (presumably
                # labels look like "Studio - 2 BR" -- confirm against the
                # site); shorter labels count as a plain studio, 0 bedrooms.
                apt['Num bedrooms'] = words[2] if len(words) > 2 else 0
            elif '-' in label:
                # "a-b ..." range: keep the part after the '-'
                apt['Num bedrooms'] = words[0].split('-')[1]
            else:
                apt['Num bedrooms'] = words[0]

    # Pets: without an amenities list the policy is unknown.  Otherwise any
    # dog or cat icon means pets are allowed (the previous "> 1" test needed
    # two icons, so listings with a single icon were labelled 'No').
    if result.findAll('ul', {"class" : "amenities"}):
        dogs = result.findAll('li', {"class" : "petIcon"})
        cats = result.findAll('li', {"class" : "catIcon"})
        apt['Pets Allowed'] = 'Yes' if (dogs or cats) else 'No'
    else:
        apt['Pets Allowed'] = 'Contact Management'

    return apt

In [None]:
import pandas as pd
# concatenate the 2 csv files that show the US national income status, save it to df_income
df1 = pd.read_csv("2014_incometax_part1.csv", index_col=False)
df2 = pd.read_csv("2014_incometax_part2.csv", index_col=False)
df_income = pd.concat([df1, df2], axis=0)

# select and save only the unique zip codes as referential list of zip
# (kept in "zip_codes" so that the builtin zip() is not shadowed)
apt_df = pd.read_csv("apartments.csv", index_col=False)
apt_df = apt_df.Zip.unique()
zip_codes = apt_df.tolist()

# select income based on zipcodes in the "zip_codes" list and save as df_match
# NOTE(review): if the income files hold several rows per ZIPCODE (e.g. one per
# income bracket), the merge below repeats every apartment once per row --
# confirm the files have a single row per zip code.
df_match = df_income[df_income['ZIPCODE'].isin(zip_codes)]
df_match = df_match[['A00100','ZIPCODE','N1','STATE']]
df_match = df_match.rename(columns = {'ZIPCODE':'Zip','A00100':'Income','N1':'Num_of_Households'})
df_match = df_match.astype({'Income': int, 'Zip': int, 'Num_of_Households': int})

#join df_match to df with "Zip" as the key to get df_full and save to csv file
df = pd.read_csv('apartments_full.csv', index_col = False, encoding='ISO-8859-1')
# 'apartments_full.csv' may already contain Income / Num_of_Households columns
# (writeData puts them in its header, and this cell writes them back to the
# same file).  Merging on top of them would create Income_x / Income_y style
# columns and break the astype line below, so stale copies are dropped first.
df_without_income = df.drop(columns=['Income', 'Num_of_Households'], errors='ignore')
df_full = pd.merge(df_without_income, df_match[['Income','Zip','Num_of_Households']], on = ["Zip"], how = 'inner')
df_full[['Zip','Num_of_Households']] = df_full[['Zip','Num_of_Households']].astype(int)
df_full.to_csv('apartments_full.csv', index = False)

### Generating coordinates (latitude, longitude) according to address, using https://geocod.io/

### Downloading the station lists of the metro lines (JSON files) from https://developer.wmata.com/

In [None]:
# define function to get all json files in the current directory
def getFileNames():
    """
    Input = None
    Returns = alphabetically sorted list with the names of all regular files
              in the current directory whose name ends with '.json'
    """
    import os
    # The leading dot matters: a bare 'json' suffix would also accept names
    # such as 'notjson'.  str.endswith takes a tuple of alternatives.
    included_extensions = ('.json',)
    return sorted(
        fn for fn in os.listdir(os.getcwd())
        # names are relative to the current directory, so isfile resolves them
        if fn.endswith(included_extensions) and os.path.isfile(fn)
    )

# read json files and load data
def readMetroData(file):
    """
    Input = path of a JSON file
    Returns = the decoded content of that file
    """
    import json
    # JSON text is UTF-8; stating it explicitly keeps the result independent
    # of the platform's locale encoding.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

# select only coordinates of stations on different lines from data
def getCoords(metro_data):
    """
    Input = dictionary whose values each hold a 'Stations' list of station
            records with 'Lat' and 'Lon' entries
    Returns = one flat list of (Lat, Lon) tuples covering every station of
              every entry, in iteration order (a station listed under
              several entries appears once per entry)
    """
    return [
        (station['Lat'], station['Lon'])
        for line_name in metro_data
        for station in metro_data[line_name]['Stations']
    ]

def readCSV(filename):
    """
    Input = path of a CSV file whose first row holds the column names
    Returns = list with one dictionary per data row (column name -> text)
    """
    import csv
    # newline='' lets the csv module do its own line-ending handling, so a
    # line break inside a quoted field is returned unchanged.
    with open(filename, 'r', newline='') as f:
        return list(csv.DictReader(f))

def calculateDistance(coord1, coord2):
    """
    Input = two (latitude, longitude) pairs; the values are converted with
            math.radians, i.e. they are treated as degrees
    Returns = straight-line distance between the two points on a flat
              projection: the longitude difference is scaled by the cosine
              of the mean latitude and the result is multiplied by
              R = 6371000 (presumably the Earth's radius in metres)
    """
    import math
    R = 6371000

    phi_a = math.radians(coord1[0])
    phi_b = math.radians(coord2[0])
    d_phi = math.radians(coord1[0] - coord2[0])
    d_lambda = math.radians(coord1[1] - coord2[1])

    # east-west component shrinks with latitude, north-south does not
    east = d_lambda * math.cos((phi_a + phi_b) / 2)
    north = d_phi
    return math.sqrt(east*east + north*north) * R

def writeData(apartment_data, filename='apartments_full.csv'):
    """
    Input = iterable of dictionaries (one per apartment) and, optionally,
            the path of the csv file to create (default 'apartments_full.csv')
    Output = csv file with a header row and one row per apartment; keys
             missing from a dictionary are written as empty cells, keys that
             are not in the column list make csv.DictWriter raise ValueError
    Returns = Nothing
    """
    import csv
    fieldnames = ['Name','Address','City','State','Zip','Style','Last update','Image count','Num bedrooms','Rent','Pets Allowed','Latitude','Longitude','Metro Distance','Income','Num_of_Households']
    with open(filename, 'w', newline = '') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(apartment_data)

       
    
#codes to use functions defined above
files = getFileNames()

#classify station lists in dictionaries of different metro lines
#(key = file name without the 'Metro_' prefix and the '.json' extension)
metro_lines = {}
for file in files:
    metro_lines[file.replace('Metro_','').replace('.json','')] = readMetroData(file)
metro_coords = getCoords(metro_lines)

# Apartments whose nearest station is farther away than this get an empty
# 'Metro Distance'.  Same unit as the value returned by calculateDistance.
MAX_METRO_DISTANCE = 9041

#get all the coordinates of apartments to compare with that of metro stations and calculate distances
apartments = readCSV('apartments_geocoded.csv')
for apt in apartments:
    try:
        # converted once per apartment instead of once per station
        apt_coord = (float(apt['Latitude']), float(apt['Longitude']))
    except ValueError:
        # blank or non-numeric coordinates: leave the distance empty rather
        # than abort the whole run
        apt['Metro Distance'] = ''
        continue
    nearest = min(calculateDistance(apt_coord, station) for station in metro_coords)
    apt['Metro Distance'] = '' if nearest > MAX_METRO_DISTANCE else nearest

#write the apartments, now with a Metro Distance column, to 'apartments_full.csv'
writeData(apartments)

In [None]:
# drop unrelated rows and rows with missing values, save the result as 'apartments_nonull.csv'
# Every condition is combined into one mask over df.  Filtering df again and
# again while overwriting df1 would keep only the effect of the last filter.
to_drop = ['CT', 'MI']
keep_rows = (
    ~df['State'].isin(to_drop)                      # states outside the study area
    & df['Num bedrooms'].notnull()
    & df['Rent'].notnull()
    & df['Image count'].notnull()
    & ~df['Image count'].isin(['Not specified'])    # placeholder for a missing image count
)
df1 = df[keep_rows]
df1.to_csv('apartments_nonull.csv', index = False)