In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census

# Census API Key
from config import census_key


c = Census(census_key, year=2013)

In [2]:
file = 'zipcodes_stl2.csv'
df = pd.read_csv(file)
df_small = df.iloc[0:5,:]
df_small

Unnamed: 0,ZIP Code,County,Latitude,Longitude
0,63101,Saint Louis City,38.631551,-90.193
1,63102,Saint Louis City,38.6352,-90.18702
2,63103,Saint Louis City,38.631451,-90.21415
3,63104,Saint Louis City,38.610701,-90.21362
4,63105,Saint Louis,38.645484,-90.32888


In [3]:
df_census = pd.DataFrame()

for zip in df_small['ZIP Code']:
    census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                                  "B19301_001E",
                                  "B17001_002E"), {'for': 'zip code tabulation area:'+str(zip)})
#     print(census_data)
    df_new = pd.DataFrame(census_data)
    df_census = df_census.append(df_new)
# df_census.head()
# Column Reordering
df_census = df_census.rename(columns={"B01003_001E": "Population",
                                      "B01002_001E": "Median Age",
                                      "B19013_001E": "Household Income",
                                      "B19301_001E": "Per Capita Income",
                                      "B17001_002E": "Poverty Count",
                                      "NAME": "Name", "zip code tabulation area": "ZIP Code"})

# Add in Poverty Rate (Poverty Count / Population)
df_census["Poverty Rate"] = 100 * \
    df_census["Poverty Count"].astype(
        int) / df_census["Population"].astype(int)

# Final DataFrame
df_census = df_census[["ZIP Code", "Population", "Median Age", "Household Income",
                       "Per Capita Income", "Poverty Count","Poverty Rate"]]

df_census['ZIP Code'] = pd.to_numeric(df_census['ZIP Code'])
df_census.head()

Unnamed: 0,ZIP Code,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate
0,63101,2613.0,30.4,54417.0,42701.0,610.0,23.344814
0,63102,2216.0,33.6,54018.0,32318.0,142.0,6.407942
0,63103,6824.0,30.4,34719.0,30274.0,1340.0,19.636577
0,63104,19520.0,31.2,45498.0,29780.0,5417.0,27.751025
0,63105,17361.0,33.8,86031.0,57408.0,1206.0,6.946604


In [4]:
df_merge = pd.merge(df_census, df_small, on=('ZIP Code'))
df_merge
# df_merge.dtypes

Unnamed: 0,ZIP Code,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,County,Latitude,Longitude
0,63101,2613.0,30.4,54417.0,42701.0,610.0,23.344814,Saint Louis City,38.631551,-90.193
1,63102,2216.0,33.6,54018.0,32318.0,142.0,6.407942,Saint Louis City,38.6352,-90.18702
2,63103,6824.0,30.4,34719.0,30274.0,1340.0,19.636577,Saint Louis City,38.631451,-90.21415
3,63104,19520.0,31.2,45498.0,29780.0,5417.0,27.751025,Saint Louis City,38.610701,-90.21362
4,63105,17361.0,33.8,86031.0,57408.0,1206.0,6.946604,Saint Louis,38.645484,-90.32888


In [5]:
#New Dependencies

import json
import pprint
# import requests
# import sys
import urllib
from urllib.parse import quote
    
from config import yelp_key

In [6]:
API_KEY=yelp_key

API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
MATCH_PATH = '/v3/businesses/matches'
SEARCH_PHONE_PATH = '/v3/businesses/search/phone'
BUSINESS_PATH = '/v3/businesses/'  # Business ID will come after slash.
# REVIEW_PATH = BUSINESS_PATH + business_id + 'reviews'
# GET https://api.yelp.com/v3/businesses/{id}/reviews
    
# Defaults for our simple example.
# DEFAULT_TERM = 'dinner'
# DEFAULT_LOCATION = 'San Francisco, CA'
# SEARCH_LIMIT = 5

In [7]:
def request(host, path, api_key, url_params=None):
    url_params = url_params or {}
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
#     print(u'Querying {0} ...'.format(url))
#     print(headers)

    response = requests.request('GET', url, headers=headers, params=url_params)
#     my_url=url
    return response.json() #, my_url

In [8]:
df_merge.dtypes
# df_merge['Latitude'] = df_merge['Latitude'].astype(str)
# df_merge['Longitude'] = df_merge['Longitude'].astype(str)
# df_merge.dtypes

df_businesses = pd.DataFrame()
df_merge.dtypes

ZIP Code               int64
Population           float64
Median Age           float64
Household Income     float64
Per Capita Income    float64
Poverty Count        float64
Poverty Rate         float64
County                object
Latitude             float64
Longitude            float64
dtype: object

In [9]:
len(df_small)
for index, row in df_small.iterrows():
        row_zip = row[0]
        row_county = row[1]
        row_lat = row[2]
        row_long = row[3]
        print(index, row_zip, row_lat, row_long)

0 63101 38.631551 -90.193
1 63102 38.6352 -90.18701999999999
2 63103 38.631451 -90.21415
3 63104 38.610701 -90.21361999999999
4 63105 38.645484 -90.32888


In [10]:
df_bus_total = pd.DataFrame()
prices = []
dict_total = {}

for index, row in df_merge.iterrows():
    row_zip = row[0]
    row_long = row[9]
    row_lat = row[8]
    print(index, row_zip, row_lat, row_long)
#     print(index, row[0])
#     url_params={"city":"St. Louis County", 'zip_code':'63131','latitude':'38.618582','longitude':'-90.43643'}
    url_params={"city":row_lat, 'zip_code':row_zip,'latitude':row_lat,'longitude':row_long}
    dict_businesses = request(API_HOST, SEARCH_PATH, API_KEY, url_params)

    df_businesses['business_id']=[business["id"] for business in dict_businesses['businesses']]
    df_businesses['name']=[business["name"]for business in dict_businesses['businesses']]
    df_businesses['rating']=[business["rating"]for business in dict_businesses['businesses']]
    df_businesses['zip_code']=[business['location']["zip_code"]for business in dict_businesses['businesses']]
    df_businesses['review_count']=[business["review_count"]for business in dict_businesses['businesses']]
#     try:
#         df_businesses['price']=[business["price"]for business in dict_businesses['businesses']]
#     except:
#         df_businesses['price']='not found'
    dict_total.update(dict_businesses)


    df_bus_total = df_bus_total.append(df_businesses)
df_businesses
# pprint.pprint(dict_businesses)

0 63101 38.631551 -90.193
1 63102 38.6352 -90.18701999999999
2 63103 38.631451 -90.21415
3 63104 38.610701 -90.21361999999999
4 63105 38.645484 -90.32888


Unnamed: 0,business_id,name,rating,zip_code,review_count
0,YSgcojsb2kWHXNLWaGCuBA,Pastaria,4.0,63105,847
1,R8t9g5nvi7VFyS8zsgmj8Q,Salt + Smoke,4.5,63130,1432
2,TXg82zeFo2MpX5BzZXpJUg,I Fratellini,4.5,63105,208
3,Fok0BLJP0OMxbbl3l6QmZA,Sauce on the Side,4.5,63105,302
4,b4YPBSnqU5_L6TUz147Dow,City Coffeehouse & Crêperie,4.0,63105,415
5,YPTYOQO8Lg9BtHsRwYBY7g,Mai Lee,4.0,63144,840
6,OZcl2HOUXAHuICrh5d2WjQ,Half & Half,4.0,63105,726
7,4mGnHfvwjE6P-vuKowHzCQ,Taco Buddha,4.5,63130,188
8,OXDo1mHlp2Io3CM6a1aB1w,Seoul Taco,4.0,63130,635
9,XvjJflV_0I1CS0_dckuV-A,801 Chophouse - St Louis,4.0,63105,256


In [11]:
df_bus_total

Unnamed: 0,business_id,name,rating,zip_code,review_count
0,8YDJraW_cg5IPTPisPfB-A,City Museum,4.5,63103,1244
1,iRIHK8-EwpeffwvoO4nzIA,Broadway Oyster Bar,4.5,63102,1737
2,Mr7Aov2n7wPCpwaUxk8lCw,Mango,4.0,63101,901
3,2BMk_drsikKWslJCXmQtjQ,Rooster,4.0,63101,1754
4,WCdSajl5Q0qywpv7K5jHdQ,Sugarfire Smoke House,4.5,63101,882
...,...,...,...,...,...
15,KdAWjL9MKjpJzEeI902qBA,Corner 17,4.0,63112,496
16,73ZXjIfdHJs59XOzKHQ45g,The Crossing,4.5,63105,183
17,oC91lprbbfzw7DOYiYln7Q,5 Star Burgers,4.0,63105,374
18,I8cL6l-Yu3CyHPXlFmOhtQ,Louie's Wine Dive & Clayton Kitchen,4.0,63105,219


In [12]:
for b in dict_total['businesses']:
    try:
        my_price = b['price']
        prices.append(my_price)
    except:
        prices.append('no price')
# df_businesses['price'] = prices
# df_businesses

# df_bus_total['price'] = prices
df_bus_total.head(5)
# prices

Unnamed: 0,business_id,name,rating,zip_code,review_count
0,8YDJraW_cg5IPTPisPfB-A,City Museum,4.5,63103,1244
1,iRIHK8-EwpeffwvoO4nzIA,Broadway Oyster Bar,4.5,63102,1737
2,Mr7Aov2n7wPCpwaUxk8lCw,Mango,4.0,63101,901
3,2BMk_drsikKWslJCXmQtjQ,Rooster,4.0,63101,1754
4,WCdSajl5Q0qywpv7K5jHdQ,Sugarfire Smoke House,4.5,63101,882


In [13]:
df_bus_total = df_bus_total.sort_values('business_id')
df_bus_total = df_bus_total.reset_index(drop=True)
df_bus_unique = df_bus_total.drop_duplicates(['business_id'], keep='first')
# df_bus_unique = df_bus_unique.set_index('business_id')
df_bus_unique.head(5)
df_bus_unique_small = df_bus_unique[1:5]
df_bus_unique_small

Unnamed: 0,business_id,name,rating,zip_code,review_count
1,0MhxvTys2ADYa3sOO91ldQ,Egg,4.5,63104,563
2,1Mc_Reqnxhs0eHQFAhXVUQ,Citygarden,4.5,63101,137
4,2BMk_drsikKWslJCXmQtjQ,Rooster,4.0,63101,1754
7,37dCsVwCe8yWPkW7TXCLCA,Melo's Pizzeria,5.0,63104,103


In [44]:
# business_id = '0MhxvTys2ADYa3sOO91ldQ'
dict_rev_total = {}
dict_reviews = {}
df_reviews = pd.DataFrame()
df_rev_total = pd.DataFrame()

for index, row in df_bus_unique_small.iterrows():
    row_bus = row[0]
#     print(row_bus)
    REVIEW_PATH = BUSINESS_PATH + row_bus + '/reviews'
    dict_reviews = request(API_HOST, REVIEW_PATH, API_KEY)
#     pprint.pprint(dict_reviews)
    df_reviews['business_id']= [row_bus for review in dict_reviews['reviews']]
    df_reviews['review_id']=[review["id"]for review in dict_reviews['reviews']]
    df_reviews['rating']=[review["rating"]for review in dict_reviews['reviews']]
    df_reviews['name']=[review['user']["name"]for review in dict_reviews['reviews']]
    df_rev_total = df_rev_total.append(df_reviews)
#     dict_rev_total.update(dict_reviews)
# # print(REVIEW_PATH)
# dict_reviews
df_rev_total


Unnamed: 0,business_id,review_id,rating,name
0,0MhxvTys2ADYa3sOO91ldQ,thOMsGuRwXtItKm3Pcr7iw,5,Ian S.
1,0MhxvTys2ADYa3sOO91ldQ,7YZdEcrGSHXzWrnVpqSZZA,4,Emily T.
2,0MhxvTys2ADYa3sOO91ldQ,AhoVT9lYWlq9bO1F-_56xg,5,Scott S.
0,1Mc_Reqnxhs0eHQFAhXVUQ,X4y4tN9fXDIO6T0PZxNtqA,5,Danielle W.
1,1Mc_Reqnxhs0eHQFAhXVUQ,B2vtv4rpEdfSTlxbPfhh2g,5,Brittany S.
2,1Mc_Reqnxhs0eHQFAhXVUQ,3CavzAk67fE0QaLLddWHBg,4,Dave D.
0,2BMk_drsikKWslJCXmQtjQ,yjWuxHY4MQDDKUxHa7kJvA,4,Jamie B.
1,2BMk_drsikKWslJCXmQtjQ,u8w-JcWPKRCpEo_5n9xDpg,2,Trevor P.
2,2BMk_drsikKWslJCXmQtjQ,BQsywEAiZWSQuRXUQNGLFw,4,michael d.
0,37dCsVwCe8yWPkW7TXCLCA,w_VSvmGwHGk1fF4SAjOq7g,5,Dan J.


In [48]:
import re

# df_rev_total.reset_index(inplace=True)

df_rev_total['short name']=""
short_name = []
for index, row in df_rev_total.iterrows():
    short_name = re.split("( )",row[4])
    print(short_name)
    df_rev_total.iloc[index, 5] = short_name[0].lower()

df_rev_total
# re.split("( )","I'm a test")

['Ian', ' ', 'S.']
['Emily', ' ', 'T.']
['Scott', ' ', 'S.']
['Danielle', ' ', 'W.']
['Brittany', ' ', 'S.']
['Dave', ' ', 'D.']
['Jamie', ' ', 'B.']
['Trevor', ' ', 'P.']
['michael', ' ', 'd.']
['Dan', ' ', 'J.']
['Zack', ' ', 'G.']
['Micah', ' ', 'G.']


Unnamed: 0,index,business_id,review_id,rating,name,short name
0,0,0MhxvTys2ADYa3sOO91ldQ,thOMsGuRwXtItKm3Pcr7iw,5,Ian S.,ian
1,1,0MhxvTys2ADYa3sOO91ldQ,7YZdEcrGSHXzWrnVpqSZZA,4,Emily T.,emily
2,2,0MhxvTys2ADYa3sOO91ldQ,AhoVT9lYWlq9bO1F-_56xg,5,Scott S.,scott
3,0,1Mc_Reqnxhs0eHQFAhXVUQ,X4y4tN9fXDIO6T0PZxNtqA,5,Danielle W.,danielle
4,1,1Mc_Reqnxhs0eHQFAhXVUQ,B2vtv4rpEdfSTlxbPfhh2g,5,Brittany S.,brittany
5,2,1Mc_Reqnxhs0eHQFAhXVUQ,3CavzAk67fE0QaLLddWHBg,4,Dave D.,dave
6,0,2BMk_drsikKWslJCXmQtjQ,yjWuxHY4MQDDKUxHa7kJvA,4,Jamie B.,jamie
7,1,2BMk_drsikKWslJCXmQtjQ,u8w-JcWPKRCpEo_5n9xDpg,2,Trevor P.,trevor
8,2,2BMk_drsikKWslJCXmQtjQ,BQsywEAiZWSQuRXUQNGLFw,4,michael d.,michael
9,0,37dCsVwCe8yWPkW7TXCLCA,w_VSvmGwHGk1fF4SAjOq7g,5,Dan J.,dan


In [59]:
file = 'gender names.csv'
df = pd.read_csv(file)
df = df.set_index('first_name')
df.head(30)

Unnamed: 0_level_0,nameprimary,gender,count
first_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,,Not Known,1
0lufemi,0LUFEMI,Not Known,1
18991995,18991995,Not Known,1
19101988,19101988,Not Known,1
19101998,19101998,Not Known,1
19121982,19121982,Not Known,1
19461995,19461995,Not Known,1
19511995,19511995,Not Known,1
211;scar,211;SCAR,Not Known,1
3585167,3585167,Not Known,1


In [67]:
df_rev_total['gender']=""
gender = []
for index, row in df_rev_total.iterrows():
    gender = df.loc[str(row[5]),'gender']
#     print(gender)
    df_rev_total.iloc[index, 6] = gender

df_rev_total

Unnamed: 0,index,business_id,review_id,rating,name,short name,gender
0,0,0MhxvTys2ADYa3sOO91ldQ,thOMsGuRwXtItKm3Pcr7iw,5,Ian S.,ian,Male
1,1,0MhxvTys2ADYa3sOO91ldQ,7YZdEcrGSHXzWrnVpqSZZA,4,Emily T.,emily,Female
2,2,0MhxvTys2ADYa3sOO91ldQ,AhoVT9lYWlq9bO1F-_56xg,5,Scott S.,scott,Male
3,0,1Mc_Reqnxhs0eHQFAhXVUQ,X4y4tN9fXDIO6T0PZxNtqA,5,Danielle W.,danielle,Female
4,1,1Mc_Reqnxhs0eHQFAhXVUQ,B2vtv4rpEdfSTlxbPfhh2g,5,Brittany S.,brittany,Female
5,2,1Mc_Reqnxhs0eHQFAhXVUQ,3CavzAk67fE0QaLLddWHBg,4,Dave D.,dave,Male
6,0,2BMk_drsikKWslJCXmQtjQ,yjWuxHY4MQDDKUxHa7kJvA,4,Jamie B.,jamie,Male
7,1,2BMk_drsikKWslJCXmQtjQ,u8w-JcWPKRCpEo_5n9xDpg,2,Trevor P.,trevor,Male
8,2,2BMk_drsikKWslJCXmQtjQ,BQsywEAiZWSQuRXUQNGLFw,4,michael d.,michael,Male
9,0,37dCsVwCe8yWPkW7TXCLCA,w_VSvmGwHGk1fF4SAjOq7g,5,Dan J.,dan,Male
