In [581]:
# Objectives:

# FOR RELEASE 1 MVP
# - Only 2019 data
# - Rename each 2019 data column (population, rent, crime)
# - Create column with city, state (abbreviation?)
# - Column with state
# - Make table for each csv, or just join with pandas
# - Join 3 df/tables on city, state (abbreviation?) column
# - Check for duplicate cities and drop (already drop with inner join)
# - Drop any cities that lack pop, rent, crime data (already drop with inner join, but still drop NaN values and fill later)
# - Bin population, rent, and crime data
# - Categorize population, rent, and crime data

# MACHINE LEARNNG (in upcoming notebook)
# - Vectorize data
# - Train nearest neighbors model on city/state, pop, rent, and crime data
# - Make model into a function
# - Use function to make a recommendation of Location based on population, rent, crime rate
# - Check to see if recommendation matches well with data. If so:
# - Pickle the model and it is ready to be put into API and tested with Web/iOS
# - Once these steps are completed and working, we will also incorporate walkability and livability score in Release 2.
# - When walkability and livability scores are also included and working well in the model, we welcome and further additions to the model, granted the data is from 2019 (otherwise we can include a disclaimer, or we push all of the data used back to 2018, for example, as long as the data all comes from the same year)

# STRETCH GOALS
# - add more data that fit team's user stories
# - attempt forecasting using data from 2010-2020
# - try fb prophet model among other time series models and techniques
# - be in conversation with engineers

In [582]:
# Imports

import pandas as pd
import numpy as np

In [583]:
# Load an inspect population data

population = pd.read_csv("../data/population.csv")
population

Unnamed: 0,Location,Census,Estimates Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019 Population
0,"Abbeville city, Alabama",2688,2705,2699,2694,2643,2628,2608,2600,2584,2575,2571,2560
1,"Adamsville city, Alabama",4522,4506,4500,4493,4471,4449,4420,4390,4356,4327,4308,4281
2,"Addison town, Alabama",758,754,751,750,743,742,739,734,731,726,723,718
3,"Akron town, Alabama",356,356,355,347,347,343,338,339,333,332,331,328
4,"Alabaster city, Alabama",30352,31112,31209,31375,31684,31980,32182,32772,33017,33275,33413,33487
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19497,"Wamsutter town, Wyoming",451,451,450,453,462,487,508,499,493,483,474,467
19498,"Wheatland town, Wyoming",3627,3625,3622,3620,3626,3622,3642,3645,3587,3549,3527,3462
19499,"Worland city, Wyoming",5487,5487,5487,5436,5419,5419,5328,5332,5263,5158,5071,5024
19500,"Wright town, Wyoming",1807,1807,1810,1812,1864,1860,1856,1885,1857,1760,1754,1753


In [584]:
# Check population data types

population.dtypes

Location           object
Census             object
Estimates Base     object
2010               object
2011               object
2012               object
2013               object
2014               object
2015               object
2016               object
2017               object
2018               object
2019 Population    object
dtype: object

In [585]:
# Drop columns from population data (or can create a copy with just location and 2019 population data)

population = population.drop(['Census', 'Estimates Base', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'], axis=1)
population

# In Location column, after the name of the town or city it contains "town" or "city". Wrangle this out.

Unnamed: 0,Location,2019 Population
0,"Abbeville city, Alabama",2560
1,"Adamsville city, Alabama",4281
2,"Addison town, Alabama",718
3,"Akron town, Alabama",328
4,"Alabaster city, Alabama",33487
...,...,...
19497,"Wamsutter town, Wyoming",467
19498,"Wheatland town, Wyoming",3462
19499,"Worland city, Wyoming",5024
19500,"Wright town, Wyoming",1753


In [586]:
# Create new column to specify if Location is city or town

population["Town or City"] = population['Location'].str.extract("(city|town)")
population

Unnamed: 0,Location,2019 Population,Town or City
0,"Abbeville city, Alabama",2560,city
1,"Adamsville city, Alabama",4281,city
2,"Addison town, Alabama",718,town
3,"Akron town, Alabama",328,town
4,"Alabaster city, Alabama",33487,city
...,...,...,...
19497,"Wamsutter town, Wyoming",467,town
19498,"Wheatland town, Wyoming",3462,town
19499,"Worland city, Wyoming",5024,city
19500,"Wright town, Wyoming",1753,town


In [587]:
# Remove city and town strings from Location column so that the column can be used to join with other data

population["Location"] = population['Location'].str.replace(' city', '', regex=False)
population["Location"] = population['Location'].str.replace(' town', '', regex=False)

population

# Consider adding just a State column, having City and State columns separate

Unnamed: 0,Location,2019 Population,Town or City
0,"Abbeville, Alabama",2560,city
1,"Adamsville, Alabama",4281,city
2,"Addison, Alabama",718,town
3,"Akron, Alabama",328,town
4,"Alabaster, Alabama",33487,city
...,...,...,...
19497,"Wamsutter, Wyoming",467,town
19498,"Wheatland, Wyoming",3462,town
19499,"Worland, Wyoming",5024,city
19500,"Wright, Wyoming",1753,town


In [588]:
# Unique number of population Locations count

population['Location'].nunique()

19491

In [589]:
# Load and inspect rental rates data

rent = pd.read_csv("../data/rental_rates.csv")
rent

Unnamed: 0,RegionID,RegionName,SizeRank,MsaName,2014-01,2014-02,2014-03,2014-04,2014-05,2014-06,...,2020-02,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11
0,61639,10025,1,"New York, NY",3012.0,3025.0,3037.0,3049.0,3062.0,3074.0,...,3268.0,3234.0,3199.0,3162.0,3125.0,3088.0,3048.0,3008.0,2968.0,2925.0
1,84654,60657,2,"Chicago, IL",1588.0,1594.0,1599.0,1605.0,1610.0,1615.0,...,1834.0,1829.0,1824.0,1818.0,1813.0,1807.0,1801.0,1795.0,1788.0,1781.0
2,61637,10023,3,"New York, NY",3114.0,3123.0,3131.0,3140.0,3148.0,3156.0,...,3307.0,3275.0,3244.0,3211.0,3178.0,3144.0,3108.0,3072.0,3035.0,2997.0
3,91982,77494,4,"Houston, TX",1759.0,1763.0,1766.0,1770.0,1773.0,1776.0,...,1775.0,1777.0,1780.0,1782.0,1785.0,1788.0,1791.0,1794.0,1796.0,1799.0
4,84616,60614,5,"Chicago, IL",1740.0,1745.0,1750.0,1755.0,1759.0,1764.0,...,2023.0,2017.0,2010.0,2003.0,1995.0,1988.0,1979.0,1970.0,1962.0,1952.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,62321,11976,9253,"New York, NY",63788.0,,,,,,...,49083.0,48860.0,48636.0,,,,,,,47911.0
3186,58624,2110,9469,"Boston, MA",4113.0,4105.0,4097.0,4089.0,,4077.0,...,4510.0,,,4350.0,4294.0,4239.0,4180.0,4121.0,4062.0,4002.0
3187,66128,20004,9592,"Washington, DC",,,2295.0,2304.0,,2323.0,...,2496.0,2494.0,2492.0,2490.0,2487.0,2484.0,2481.0,2478.0,2475.0,2471.0
3188,399647,80951,9634,"Colorado Springs, CO",,1252.0,1256.0,1260.0,1265.0,1269.0,...,,1656.0,1662.0,1669.0,1676.0,1683.0,1690.0,1697.0,1704.0,1712.0


In [590]:
# Check rent data types

rent.dtypes

RegionID        int64
RegionName      int64
SizeRank        int64
MsaName        object
2014-01       float64
               ...   
2020-07       float64
2020-08       float64
2020-09       float64
2020-10       float64
2020-11       float64
Length: 87, dtype: object

In [591]:
# Drop all columns you dont want, or just use code below to make new df

#rent = rent.drop(['RegionID', 'RegionName', 'SizeRank', '2014-01', '2014-02', '2014-03', '2014-04', ...], axis=1)

In [592]:
# Create copy of rental rates dataframe and change column names

rent = rent[['MsaName','2019-12']].copy()
rent = rent.rename(columns = {"MsaName":"Location"})
rent = rent.rename(columns = {"2019-12":"2019 Rental Rates"})
rent

Unnamed: 0,Location,2019 Rental Rates
0,"New York, NY",3311.0
1,"Chicago, IL",1838.0
2,"New York, NY",3344.0
3,"Houston, TX",1772.0
4,"Chicago, IL",2028.0
...,...,...
3185,"New York, NY",50175.0
3186,"Boston, MA",4585.0
3187,"Washington, DC",2495.0
3188,"Colorado Springs, CO",1640.0


In [593]:
# Replace state abbreviations with full state names

rent["Location"] = rent['Location'].str.replace('AK', 'Alaska', regex=False)
rent["Location"] = rent['Location'].str.replace('AL', 'Alabama', regex=False)
rent["Location"] = rent['Location'].str.replace('AR', 'Arkansas', regex=False)
rent["Location"] = rent['Location'].str.replace('AS', 'American Samoa', regex=False)
rent["Location"] = rent['Location'].str.replace('AZ', 'Arizona', regex=False)
rent["Location"] = rent['Location'].str.replace('CA', 'California', regex=False)
rent["Location"] = rent['Location'].str.replace('CO', 'Colorado', regex=False)
rent["Location"] = rent['Location'].str.replace('CT', 'Connecticut', regex=False)
rent["Location"] = rent['Location'].str.replace('DC', 'District of Columbia', regex=False)
rent["Location"] = rent['Location'].str.replace('DE', 'Delaware', regex=False)
rent["Location"] = rent['Location'].str.replace('FL', 'Florida', regex=False)
rent["Location"] = rent['Location'].str.replace('GA', 'Georgia', regex=False)
rent["Location"] = rent['Location'].str.replace('GU', 'Guam', regex=False)
rent["Location"] = rent['Location'].str.replace('HI', 'Hawaii', regex=False)
rent["Location"] = rent['Location'].str.replace('IA', 'Iowa', regex=False)
rent["Location"] = rent['Location'].str.replace('ID', 'Idaho', regex=False)
rent["Location"] = rent['Location'].str.replace('IL', 'Illinois', regex=False)
rent["Location"] = rent['Location'].str.replace('IN', 'Indiana', regex=False)
rent["Location"] = rent['Location'].str.replace('KS', 'Kansas', regex=False)
rent["Location"] = rent['Location'].str.replace('KY', 'Kentucky', regex=False)
rent["Location"] = rent['Location'].str.replace('LA', 'Louisiana', regex=False)
rent["Location"] = rent['Location'].str.replace('MA', 'Massachusetts', regex=False)
rent["Location"] = rent['Location'].str.replace('MD', 'Maryland', regex=False)
rent["Location"] = rent['Location'].str.replace('ME', 'Maine', regex=False)
rent["Location"] = rent['Location'].str.replace('MI', 'Michigan', regex=False)
rent["Location"] = rent['Location'].str.replace('MN', 'Minnesota', regex=False)
rent["Location"] = rent['Location'].str.replace('MO', 'Missouri', regex=False)
rent["Location"] = rent['Location'].str.replace('MP', 'Northern Mariana Islands', regex=False)
rent["Location"] = rent['Location'].str.replace('MS', 'Mississippi', regex=False)
rent["Location"] = rent['Location'].str.replace('MT', 'Montana', regex=False)
rent["Location"] = rent['Location'].str.replace('NA', 'National', regex=False)
rent["Location"] = rent['Location'].str.replace('NC', 'North Carolina', regex=False)
rent["Location"] = rent['Location'].str.replace('ND', 'North Dakota', regex=False)
rent["Location"] = rent['Location'].str.replace('NE', 'Nebraska', regex=False)
rent["Location"] = rent['Location'].str.replace('NH', 'New Hampshire', regex=False)
rent["Location"] = rent['Location'].str.replace('NJ', 'New Jersey', regex=False)
rent["Location"] = rent['Location'].str.replace('NM', 'New Mexico', regex=False)
rent["Location"] = rent['Location'].str.replace('NV', 'Nevada', regex=False)
rent["Location"] = rent['Location'].str.replace('NY', 'New York', regex=False)
rent["Location"] = rent['Location'].str.replace('OH', 'Ohio', regex=False)
rent["Location"] = rent['Location'].str.replace('OK', 'Oklahoma', regex=False)
rent["Location"] = rent['Location'].str.replace('OR', 'Oregon', regex=False)
rent["Location"] = rent['Location'].str.replace('PA', 'Pennsylvania', regex=False)
rent["Location"] = rent['Location'].str.replace('PR', 'Puerto Rico', regex=False)
rent["Location"] = rent['Location'].str.replace('RI', 'Rhode Island', regex=False)
rent["Location"] = rent['Location'].str.replace('SC', 'South Carolina', regex=False)
rent["Location"] = rent['Location'].str.replace('SD', 'South Dakota', regex=False)
rent["Location"] = rent['Location'].str.replace('TN', 'Tennessee', regex=False)
rent["Location"] = rent['Location'].str.replace('TX', 'Texas', regex=False)
rent["Location"] = rent['Location'].str.replace('UT', 'Utah', regex=False)
rent["Location"] = rent['Location'].str.replace('VA', 'Virginia', regex=False)
rent["Location"] = rent['Location'].str.replace('VI', 'Virgin Islands', regex=False)
rent["Location"] = rent['Location'].str.replace('VT', 'Vermont', regex=False)
rent["Location"] = rent['Location'].str.replace('WA', 'Washington', regex=False)
rent["Location"] = rent['Location'].str.replace('WI', 'Wisconsin', regex=False)
rent["Location"] = rent['Location'].str.replace('WV', 'West Virginia', regex=False)
rent["Location"] = rent['Location'].str.replace('WY', 'Wyoming', regex=False)
rent

Unnamed: 0,Location,2019 Rental Rates
0,"New York, New York",3311.0
1,"Chicago, Illinois",1838.0
2,"New York, New York",3344.0
3,"Houston, Texas",1772.0
4,"Chicago, Illinois",2028.0
...,...,...
3185,"New York, New York",50175.0
3186,"Boston, Massachusetts",4585.0
3187,"Washington, District of Columbia",2495.0
3188,"Colorado Springs, Colorado",1640.0


In [594]:
# Unique number of rent Locations count

rent['Location'].nunique()

101

In [595]:
# Get rid of duplicate locations and group them by their average rental rates

rent = rent.groupby(['Location']).mean()

rent

Unnamed: 0_level_0,2019 Rental Rates
Location,Unnamed: 1_level_1
"Akron, Ohio",678.000000
"Albany, New York",1659.000000
"Albuquerque, New Mexico",1120.272727
"Allentown, Pennsylvania",1373.333333
"Atlanta, Georgia",1520.842105
...,...
"Virginia Beach, Virginia",1356.707317
"Washington, District of Columbia",2103.435115
"Wichita, Kansas",857.000000
"Winston-Salem, North Carolina",1233.285714


In [596]:
# Load and inspect crime rates data (need to replicate state for every city)

crime = pd.read_csv("../data/crime_rates.csv")
crime

Unnamed: 0,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson
0,Alabama,Hoover,85670,114,4.0,15,27,68,1922,128,1694,100,2
1,Alaska,Anchorage,287731,3581,32.0,540,621,2388,12261,1692,9038,1531,93
2,Alaska,Bethel,6544,130,1.0,47,3,79,132,20,84,28,12
3,Alaska,Bristol Bay Borough,852,2,0.0,0,0,2,20,5,8,7,0
4,Alaska,Cordova,2150,0,0.0,0,0,0,7,1,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8100,Wyoming,Sheridan,17895,9,0.0,4,0,5,369,75,278,16,3
8101,Wyoming,Thermopolis,2830,13,0.0,0,0,13,34,7,22,5,0
8102,Wyoming,Torrington,6709,13,0.0,4,1,8,48,8,40,0,0
8103,Wyoming,Wheatland,3544,7,0.0,1,0,6,72,24,45,3,0


In [597]:
# Check crime data types

crime.dtypes

State                                    object
City                                     object
Population                               object
Violent crime                            object
Murder and nonnegligent manslaughter    float64
Rape                                     object
Robbery                                  object
Aggravated assault                       object
Property crime                           object
Burglary                                 object
Larceny-theft                            object
Motor vehicle theft                      object
Arson                                    object
dtype: object

In [598]:
# Make new Location column with City and State data, so the data can be joineed with the other data on this column

crime['Location'] = crime['City'] + ',' + ' ' + crime['State']
crime

# Consider adding Alabama crime data from 2018

Unnamed: 0,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Location
0,Alabama,Hoover,85670,114,4.0,15,27,68,1922,128,1694,100,2,"Hoover, Alabama"
1,Alaska,Anchorage,287731,3581,32.0,540,621,2388,12261,1692,9038,1531,93,"Anchorage, Alaska"
2,Alaska,Bethel,6544,130,1.0,47,3,79,132,20,84,28,12,"Bethel, Alaska"
3,Alaska,Bristol Bay Borough,852,2,0.0,0,0,2,20,5,8,7,0,"Bristol Bay Borough, Alaska"
4,Alaska,Cordova,2150,0,0.0,0,0,0,7,1,6,0,0,"Cordova, Alaska"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8100,Wyoming,Sheridan,17895,9,0.0,4,0,5,369,75,278,16,3,"Sheridan, Wyoming"
8101,Wyoming,Thermopolis,2830,13,0.0,0,0,13,34,7,22,5,0,"Thermopolis, Wyoming"
8102,Wyoming,Torrington,6709,13,0.0,4,1,8,48,8,40,0,0,"Torrington, Wyoming"
8103,Wyoming,Wheatland,3544,7,0.0,1,0,6,72,24,45,3,0,"Wheatland, Wyoming"


In [599]:
# Unique number of crime Locations count

crime['Location'].nunique()

# 19491 unique Locations in population dataset
# 101 unique Locations in rent dataset   -   (Consider looking for a different rent data set with more unique locations)
# 8105 unique Locations in crime dataset

8105

In [600]:
# When wrangling is done here, combine tables, bin data in new columns, and push data to PG DB. 
 
# Example of Joins

merged_population_rent = pd.merge(left=population, right=rent, left_on='Location', right_on='Location')
merged_population_rent

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates
0,"Birmingham, Alabama",209403,city,1249.111111
1,"Phoenix, Arizona",1680992,city,1447.437500
2,"Tucson, Arizona",548073,city,1248.714286
3,"Little Rock, Arkansas",197312,city,895.000000
4,"Bakersfield, California",384145,city,1340.000000
...,...,...,...,...
85,"Virginia Beach, Virginia",449974,city,1356.707317
86,"Seattle, Washington",753675,city,1964.097561
87,"Spokane, Washington",222081,city,1111.400000
88,"Madison, Wisconsin",259680,city,1415.166667


In [601]:
# Unique number of merged_population_rent Locations count

merged_population_rent['Location'].nunique()

# There are 11 unique Locations being lost from the dataset because the population and rent data sets do not have all of the same Locations

90

In [602]:
# Merge population, rent, and crime data which can be pushed to DB. Data can also be added to this, more data preserved, and binned.

merged_population_rent_crime = pd.merge(left=merged_population_rent, right=crime, left_on='Location', right_on='Location')
merged_population_rent_crime

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson
0,"Phoenix, Arizona",1680992,city,1447.437500,Arizona,Phoenix,1688722,11803,131.0,1139,3197,7336,55974,9471,39427,7076,201
1,"Tucson, Arizona",548073,city,1248.714286,Arizona,Tucson,548374,3775,40.0,527,1105,2103,17943,2497,13196,2250,142
2,"Little Rock, Arkansas",197312,city,895.000000,Arkansas,Little Rock,198382,3009,38.0,209,391,2371,12145,1760,9316,1069,45
3,"Bakersfield, California",384145,city,1340.000000,California,Bakersfield,388080,1766,34.0,116,701,915,16074,3888,9277,2909,470
4,"Riverside, California",331360,city,1944.444444,California,Riverside,333260,1686,17.0,139,476,1054,9790,1302,6997,1491,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Virginia Beach, Virginia",449974,city,1356.707317,Virginia,Virginia Beach,449038,581,30.0,79,196,276,7906,530,6797,579,34
72,"Seattle, Washington",753675,city,1964.097561,Washington,Seattle,763706,4471,28.0,358,1339,2746,34333,7210,23478,3645,98
73,"Spokane, Washington",222081,city,1111.400000,Washington,Spokane,220432,1520,6.0,230,311,973,13048,1743,10026,1279,45
74,"Madison, Wisconsin",259680,city,1415.166667,Wisconsin,Madison,261270,940,4.0,107,217,612,6464,1046,4873,545,9


In [603]:
# Unique number of merged_population_rent_crime Locations count

merged_population_rent_crime['Location'].nunique()

# There are 14 more unique Locations being lost from the dataset because the population and rent data sets do not have all of the same Locations

76

In [604]:
# Check data types before binning

merged_population_rent_crime.dtypes

Location                                 object
2019 Population                          object
Town or City                             object
2019 Rental Rates                       float64
State                                    object
City                                     object
Population                               object
Violent crime                            object
Murder and nonnegligent manslaughter    float64
Rape                                     object
Robbery                                  object
Aggravated assault                       object
Property crime                           object
Burglary                                 object
Larceny-theft                            object
Motor vehicle theft                      object
Arson                                    object
dtype: object

In [605]:
# Remove commas from columns and change data type to floats

merged_population_rent_crime['2019 Population'] = merged_population_rent_crime['2019 Population'].replace(',','', regex=True)
merged_population_rent_crime['Crime Population'] = merged_population_rent_crime['Population'].replace(',','', regex=True)
merged_population_rent_crime = merged_population_rent_crime.drop(['Population'], axis=1)
merged_population_rent_crime['Violent crime'] = merged_population_rent_crime['Violent crime'].replace(',','', regex=True)
merged_population_rent_crime['Murder and nonnegligent manslaughter'] = merged_population_rent_crime['Murder and nonnegligent manslaughter'].replace(',','', regex=True)
merged_population_rent_crime['Rape'] = merged_population_rent_crime['Rape'].replace(',','', regex=True)
merged_population_rent_crime['Robbery'] = merged_population_rent_crime['Robbery'].replace(',','', regex=True)
merged_population_rent_crime['Aggravated assault'] = merged_population_rent_crime['Aggravated assault'].replace(',','', regex=True)
merged_population_rent_crime['Property crime'] = merged_population_rent_crime['Property crime'].replace(',','', regex=True)
merged_population_rent_crime['Burglary'] = merged_population_rent_crime['Burglary'].replace(',','', regex=True)
merged_population_rent_crime['Larceny-theft'] = merged_population_rent_crime['Larceny-theft'].replace(',','', regex=True)
merged_population_rent_crime['Motor vehicle theft'] = merged_population_rent_crime['Motor vehicle theft'].replace(',','', regex=True)
merged_population_rent_crime['Arson'] = merged_population_rent_crime['Arson'].replace(',','', regex=True)

merged_population_rent_crime

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Crime Population
0,"Phoenix, Arizona",1680992,city,1447.437500,Arizona,Phoenix,11803,131.0,1139,3197,7336,55974,9471,39427,7076,201,1688722
1,"Tucson, Arizona",548073,city,1248.714286,Arizona,Tucson,3775,40.0,527,1105,2103,17943,2497,13196,2250,142,548374
2,"Little Rock, Arkansas",197312,city,895.000000,Arkansas,Little Rock,3009,38.0,209,391,2371,12145,1760,9316,1069,45,198382
3,"Bakersfield, California",384145,city,1340.000000,California,Bakersfield,1766,34.0,116,701,915,16074,3888,9277,2909,470,388080
4,"Riverside, California",331360,city,1944.444444,California,Riverside,1686,17.0,139,476,1054,9790,1302,6997,1491,75,333260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Virginia Beach, Virginia",449974,city,1356.707317,Virginia,Virginia Beach,581,30.0,79,196,276,7906,530,6797,579,34,449038
72,"Seattle, Washington",753675,city,1964.097561,Washington,Seattle,4471,28.0,358,1339,2746,34333,7210,23478,3645,98,763706
73,"Spokane, Washington",222081,city,1111.400000,Washington,Spokane,1520,6.0,230,311,973,13048,1743,10026,1279,45,220432
74,"Madison, Wisconsin",259680,city,1415.166667,Wisconsin,Madison,940,4.0,107,217,612,6464,1046,4873,545,9,261270


In [606]:
# Check for null values

merged_population_rent_crime.isnull().sum().sum()

9

In [607]:
# Drop all null values for now, can return to preserve rows by filling (fillna(), replace() and interpolate())

merged_population_rent_crime = merged_population_rent_crime.dropna()
merged_population_rent_crime.shape

(68, 17)

In [608]:
# Change datatype to float or int, int looks better with no decimal and trailing 0

#merged_population_rent_crime['2019 Population'].astype('float64')
#merged_population_rent_crime['2019 Rental Rates'].astype('float64')
#merged_population_rent_crime['Population'].astype('float64')
#merged_population_rent_crime['Violent crime'].astype('float64')
#merged_population_rent_crime['Murder and nonnegligent manslaughter'].astype('float64')
#merged_population_rent_crime['Rape'].astype('float64')
#merged_population_rent_crime['Robbery'].astype('float64')
#merged_population_rent_crime['Aggravated assault'].astype('float64')
#merged_population_rent_crime['Property crime'].astype('float64')
#merged_population_rent_crime['Burglary'].astype('float64')
#merged_population_rent_crime['Larceny-theft'].astype('float64')
#merged_population_rent_crime['Motor vehicle theft'].astype('float64')
#merged_population_rent_crime['Arson'].astype('float64')

merged_population_rent_crime['2019 Population'] = merged_population_rent_crime['2019 Population'].astype('int')
merged_population_rent_crime['2019 Rental Rates'] = merged_population_rent_crime['2019 Rental Rates'].astype('int') # ValueError: Cannot convert non-finite values (NA or inf) to integer
#merged_population_rent_crime['Population'] = merged_population_rent_crime['Population'].astype('int')
merged_population_rent_crime['Violent crime'] = merged_population_rent_crime['Violent crime'].astype('int')
merged_population_rent_crime['Murder and nonnegligent manslaughter'] = merged_population_rent_crime['Murder and nonnegligent manslaughter'].astype('int')
merged_population_rent_crime['Rape'] = merged_population_rent_crime['Rape'].astype('int')
merged_population_rent_crime['Robbery'] = merged_population_rent_crime['Robbery'].astype('int')
merged_population_rent_crime['Aggravated assault'] = merged_population_rent_crime['Aggravated assault'].astype('int')
merged_population_rent_crime['Property crime'] = merged_population_rent_crime['Property crime'].astype('int') # ValueError: cannot convert float NaN to integer
merged_population_rent_crime['Burglary'] = merged_population_rent_crime['Burglary'].astype('int') # ValueError: cannot convert float NaN to integer
merged_population_rent_crime['Larceny-theft'] = merged_population_rent_crime['Larceny-theft'].astype('int')
merged_population_rent_crime['Motor vehicle theft'] = merged_population_rent_crime['Motor vehicle theft'].astype('int')
merged_population_rent_crime['Arson'] = merged_population_rent_crime['Arson'].astype('int') # ValueError: cannot convert float NaN to integer

# These columns previously had NA or NaN values which had to be dropped. These columns can be revisited to preserve more data.

#merged_population_rent_crime['2019 Rental Rates']
#merged_population_rent_crime['Property crime']
#merged_population_rent_crime['Burglary']
#merged_population_rent_crime['Arson']

In [609]:
# Check data types again before binning, since changing

merged_population_rent_crime.dtypes

Location                                object
2019 Population                          int32
Town or City                            object
2019 Rental Rates                        int32
State                                   object
City                                    object
Violent crime                            int32
Murder and nonnegligent manslaughter     int32
Rape                                     int32
Robbery                                  int32
Aggravated assault                       int32
Property crime                           int32
Burglary                                 int32
Larceny-theft                            int32
Motor vehicle theft                      int32
Arson                                    int32
Crime Population                        object
dtype: object

In [610]:
# Combine crime data into new column and divide by population * 1000 (This is how crime rate is calculated)

merged_population_rent_crime['Crime Rate']=merged_population_rent_crime.iloc[:,-11:].sum(axis=1)/merged_population_rent_crime['2019 Population']*100000

#merged_population_rent_crime['Crime Rate'] = merged_population_rent_crime['Violent crime'] + merged_population_rent_crime['Murder and nonnegligent manslaughter'] + #merged_population_rent_crime['Rape'] + merged_population_rent_crime + merged_population_rent_crime['Robbery'] + ['Aggravated assault'] + merged_population_rent_crime['Property #crime'] + merged_population_rent_crime['Burglary'] + merged_population_rent_crime['Larceny-theft'] + merged_population_rent_crime['Motor vehicle theft'] + #merged_population_rent_crime['Arson']/merged_population_rent_crime['Population']*100000

#merged_population_rent_crime['Crime Rate'] = total_crimes/merged_population_rent_crime['Population']*100000

merged_population_rent_crime

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Crime Population,Crime Rate
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,7336,55974,9471,39427,7076,201,1688722,8075.886143
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,2103,17943,2497,13196,2250,142,548374,7951.130598
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,2371,12145,1760,9316,1069,45,198382,15383.250892
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,915,16074,3888,9277,2909,470,388080,9410.509053
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,1054,9790,1302,6997,1491,75,333260,6949.239498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,276,7906,530,6797,579,34,449038,3779.773942
72,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,2746,34333,7210,23478,3645,98,763706,10310.279630
73,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,973,13048,1743,10026,1279,45,220432,13139.800343
74,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,612,6464,1046,4873,545,9,261270,5705.868762


In [611]:
# Describe the data to begin binning data with pandas cut or qcut

merged_population_rent_crime['2019 Population'].describe()

count    6.800000e+01
mean     4.499816e+05
std      5.046458e+05
min      4.927100e+04
25%      1.360930e+05
50%      2.428600e+05
75%      5.909902e+05
max      2.693976e+06
Name: 2019 Population, dtype: float64

In [612]:
# Quartile cut the data, to see how it is evenly distributed

pd.qcut(merged_population_rent_crime['2019 Population'], q=6)

0      (679778.5, 2693976.0]
1     (433712.667, 679778.5]
2       (186153.0, 242860.0]
3     (242860.0, 433712.667]
4     (242860.0, 433712.667]
               ...          
71    (433712.667, 679778.5]
72     (679778.5, 2693976.0]
73      (186153.0, 242860.0]
74    (242860.0, 433712.667]
75    (433712.667, 679778.5]
Name: 2019 Population, Length: 68, dtype: category
Categories (6, interval[float64]): [(49270.999, 117422.0] < (117422.0, 186153.0] < (186153.0, 242860.0] < (242860.0, 433712.667] < (433712.667, 679778.5] < (679778.5, 2693976.0]]

In [613]:
# Bin 2019 population into Urban population by City Size Categories and ranges

# (https://data.oecd.org/popregion/urban-population-by-city-size.htm#:~:text=their%20administrative%20boundaries.-,Urban%20areas%20in%20OECD%20countries%20are%20classified%20as%3A%20large%20metropolitan,areas%20if%20their%20population%20is)

# Urban areas in OECD countries are classified as: large metropolitan areas if they have a population of 1.5 million or more; metropolitan areas if their population is between 500 000 and 1.5 million; medium-size urban areas if their population is between 200 000 and 500 000; and, small urban areas if their population is between 50 000 and 200 000. This indicator is measured as a percentage of the national population.

# Qcut

#merged_population_rent_crime['Urban Population by City Size'] = pd.qcut(merged_population_rent_crime['2019 Population'], q=4, labels = ["Small Urban Area", "Medium-size Urban Area", "Metropolitan Area", "Large Metropolitan Area"])

#merged_population_rent_crime['Population by City Size'] = pd.qcut(merged_population_rent_crime['2019 Population'], q=4)

#merged_population_rent_crime

# Consider using just cut to fit the labels above. qcut will make the bins equal, but it is difficult to categorize cities based on those numbers.

bins = [0, 50000, 200000, 500000, 1500000, 100000000]
labels = ["Town","Small Urban Area", "Medium-size Urban Area", "Metropolitan Area", "Large Metropolitan Area"]
merged_population_rent_crime['Urban Population by City Size Categories'] = pd.cut(merged_population_rent_crime['2019 Population'], bins=bins, labels=labels)

bins = [0, 50000, 200000, 500000, 1500000, 100000000]
labels = ["0-50,000", "50,000-200,000", "200,000-500,000", "500,000-1,500,000", "1,500,000 <"]
merged_population_rent_crime['Urban Population by City Size Ranges'] = pd.cut(merged_population_rent_crime['2019 Population'], bins=bins, labels=labels)


merged_population_rent_crime

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Crime Population,Crime Rate,Urban Population by City Size Categories,Urban Population by City Size Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,7336,55974,9471,39427,7076,201,1688722,8075.886143,Large Metropolitan Area,"1,500,000 <"
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,2103,17943,2497,13196,2250,142,548374,7951.130598,Metropolitan Area,"500,000-1,500,000"
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,2371,12145,1760,9316,1069,45,198382,15383.250892,Small Urban Area,"50,000-200,000"
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,915,16074,3888,9277,2909,470,388080,9410.509053,Medium-size Urban Area,"200,000-500,000"
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,1054,9790,1302,6997,1491,75,333260,6949.239498,Medium-size Urban Area,"200,000-500,000"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,276,7906,530,6797,579,34,449038,3779.773942,Medium-size Urban Area,"200,000-500,000"
72,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,2746,34333,7210,23478,3645,98,763706,10310.279630,Metropolitan Area,"500,000-1,500,000"
73,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,973,13048,1743,10026,1279,45,220432,13139.800343,Medium-size Urban Area,"200,000-500,000"
74,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,612,6464,1046,4873,545,9,261270,5705.868762,Medium-size Urban Area,"200,000-500,000"


In [614]:
# Describe the data to begin binning data with pandas cut or qcut

merged_population_rent_crime['2019 Rental Rates'].describe()

count      68.000000
mean     1448.794118
std       458.855701
min       678.000000
25%      1180.750000
50%      1369.000000
75%      1604.000000
max      3362.000000
Name: 2019 Rental Rates, dtype: float64

In [615]:
# Quartile cut the 2019 Rental Rates data, to see how it is evenly distributed

pd.qcut(merged_population_rent_crime['2019 Rental Rates'], q=4)

0       (1369.0, 1604.0]
1      (1180.75, 1369.0]
2     (677.999, 1180.75]
3      (1180.75, 1369.0]
4       (1604.0, 3362.0]
             ...        
71     (1180.75, 1369.0]
72      (1604.0, 3362.0]
73    (677.999, 1180.75]
74      (1369.0, 1604.0]
75    (677.999, 1180.75]
Name: 2019 Rental Rates, Length: 68, dtype: category
Categories (4, interval[float64]): [(677.999, 1180.75] < (1180.75, 1369.0] < (1369.0, 1604.0] < (1604.0, 3362.0]]

In [616]:
# Bin the 2019 Rental Rates data into categories and ranges

bins = [0, 581.99, 1294.5, 1549, 1891.5, 6373]
labels = ["Lowest Rent","Low Rent", "Average Rent", "Higher Rent", "Highest Rent"]
merged_population_rent_crime['Rental Rate Categories'] = pd.cut(merged_population_rent_crime['2019 Rental Rates'], bins=bins, labels=labels)

bins = [0, 581.99, 1294.5, 1549, 1891.5, 6373]
labels = ["$0-581.99","$581.99-1,294.50", "$1,294.50-1,549", "$1,549-1,891.50", "$1891.50-6,373"]
merged_population_rent_crime['Rental Rate Ranges'] = pd.cut(merged_population_rent_crime['2019 Rental Rates'], bins=bins, labels=labels)

merged_population_rent_crime

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,...,Burglary,Larceny-theft,Motor vehicle theft,Arson,Crime Population,Crime Rate,Urban Population by City Size Categories,Urban Population by City Size Ranges,Rental Rate Categories,Rental Rate Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,...,9471,39427,7076,201,1688722,8075.886143,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549"
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,...,2497,13196,2250,142,548374,7951.130598,Metropolitan Area,"500,000-1,500,000",Low Rent,"$581.99-1,294.50"
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,...,1760,9316,1069,45,198382,15383.250892,Small Urban Area,"50,000-200,000",Low Rent,"$581.99-1,294.50"
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,...,3888,9277,2909,470,388080,9410.509053,Medium-size Urban Area,"200,000-500,000",Average Rent,"$1,294.50-1,549"
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,...,1302,6997,1491,75,333260,6949.239498,Medium-size Urban Area,"200,000-500,000",Highest Rent,"$1891.50-6,373"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,...,530,6797,579,34,449038,3779.773942,Medium-size Urban Area,"200,000-500,000",Average Rent,"$1,294.50-1,549"
72,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,...,7210,23478,3645,98,763706,10310.279630,Metropolitan Area,"500,000-1,500,000",Highest Rent,"$1891.50-6,373"
73,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,...,1743,10026,1279,45,220432,13139.800343,Medium-size Urban Area,"200,000-500,000",Low Rent,"$581.99-1,294.50"
74,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,...,1046,4873,545,9,261270,5705.868762,Medium-size Urban Area,"200,000-500,000",Average Rent,"$1,294.50-1,549"


In [617]:
# Describe the data to begin binning data with pandas cut or qcut

merged_population_rent_crime['Crime Rate'].describe()

count       68.000000
mean      8700.517414
std       3229.769757
min       1972.791138
25%       6334.476970
50%       8387.700330
75%      10806.325688
max      16274.419781
Name: Crime Rate, dtype: float64

In [618]:
# Quartile cut the data, to see how it is evenly distributed

pd.qcut(merged_population_rent_crime['Crime Rate'], q=5)

0      (7904.614, 9414.012]
1      (7904.614, 9414.012]
2     (11886.314, 16274.42]
3      (7904.614, 9414.012]
4      (5645.263, 7904.614]
              ...          
71      (1972.79, 5645.263]
72    (9414.012, 11886.314]
73    (11886.314, 16274.42]
74     (5645.263, 7904.614]
75     (5645.263, 7904.614]
Name: Crime Rate, Length: 68, dtype: category
Categories (5, interval[float64]): [(1972.79, 5645.263] < (5645.263, 7904.614] < (7904.614, 9414.012] < (9414.012, 11886.314] < (11886.314, 16274.42]]

In [619]:
# Bin the 2019 Crimes Rate data into categories and ranges

bins = [1972.79, 5831.013, 7951.131, 9683.737, 11316.718, 16274.42]
labels = ["Lowest Crime", "Lower Crime", "Average Crime", "Higher Crime", "Highest Crime"]
merged_population_rent_crime['Crime Rate Categories'] = pd.cut(merged_population_rent_crime['Crime Rate'], bins=bins, labels=labels)

bins = [1972.79, 5831.013, 7951.131, 9683.737, 11316.718, 16274.42]
labels = ["1,972.79-5,831.013", "5,831.013-7,951.131", "7,951.131-9,683.737", "9,683.737-11,316.718", "11,316.718-16,274.42"]
merged_population_rent_crime['Crime Rate Ranges'] = pd.cut(merged_population_rent_crime['Crime Rate'], bins=bins, labels=labels)

merged_population_rent_crime

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,...,Motor vehicle theft,Arson,Crime Population,Crime Rate,Urban Population by City Size Categories,Urban Population by City Size Ranges,Rental Rate Categories,Rental Rate Ranges,Crime Rate Categories,Crime Rate Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,...,7076,201,1688722,8075.886143,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Average Crime,"7,951.131-9,683.737"
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,...,2250,142,548374,7951.130598,Metropolitan Area,"500,000-1,500,000",Low Rent,"$581.99-1,294.50",Lower Crime,"5,831.013-7,951.131"
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,...,1069,45,198382,15383.250892,Small Urban Area,"50,000-200,000",Low Rent,"$581.99-1,294.50",Highest Crime,"11,316.718-16,274.42"
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,...,2909,470,388080,9410.509053,Medium-size Urban Area,"200,000-500,000",Average Rent,"$1,294.50-1,549",Average Crime,"7,951.131-9,683.737"
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,...,1491,75,333260,6949.239498,Medium-size Urban Area,"200,000-500,000",Highest Rent,"$1891.50-6,373",Lower Crime,"5,831.013-7,951.131"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,...,579,34,449038,3779.773942,Medium-size Urban Area,"200,000-500,000",Average Rent,"$1,294.50-1,549",Lowest Crime,"1,972.79-5,831.013"
72,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,...,3645,98,763706,10310.279630,Metropolitan Area,"500,000-1,500,000",Highest Rent,"$1891.50-6,373",Higher Crime,"9,683.737-11,316.718"
73,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,...,1279,45,220432,13139.800343,Medium-size Urban Area,"200,000-500,000",Low Rent,"$581.99-1,294.50",Highest Crime,"11,316.718-16,274.42"
74,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,...,545,9,261270,5705.868762,Medium-size Urban Area,"200,000-500,000",Average Rent,"$1,294.50-1,549",Lowest Crime,"1,972.79-5,831.013"


In [620]:
# Export merged_population_rent_crime with binned categories for csv and beginning modeling in another notebook

merged_population_rent_crime.to_csv('../data/pop_rent_crime_bins.csv')

In [621]:
#1/24/2020 Adding walk_score and cost_of_living data for Release 2

walk = pd.read_csv("../data/walk_score.csv")
cost = pd.read_csv("../data/cost_of_living.csv")

In [622]:
# Check out walk data

walk

Unnamed: 0.1,Unnamed: 0,City,Zip Code,Walk Score,Transit Score,Bike Score,Population,State
0,0,Birmingham,35211.0,35,25,31,212237,AL
1,1,Montgomery,36109.0,26,16,38,205764,AL
2,2,Mobile,36605.0,32,--,39,195111,AL
3,3,Huntsville,35810.0,24,13,40,180105,AL
4,4,Tuscaloosa,,33,--,37,90468,AL
...,...,...,...,...,...,...,...,...
2495,1,Casper,,39,--,44,55316,WY
2496,2,Laramie,,34,--,67,30816,WY
2497,3,Gillette,,21,--,34,29087,WY
2498,4,Rock Springs,,29,--,32,23036,WY


In [623]:
# Replace state abbreviations with full state names

walk["State"] = walk['State'].str.replace('AK', 'Alaska', regex=False)
walk["State"] = walk['State'].str.replace('AL', 'Alabama', regex=False)
walk["State"] = walk['State'].str.replace('AR', 'Arkansas', regex=False)
walk["State"] = walk['State'].str.replace('AS', 'American Samoa', regex=False)
walk["State"] = walk['State'].str.replace('AZ', 'Arizona', regex=False)
walk["State"] = walk['State'].str.replace('CA', 'California', regex=False)
walk["State"] = walk['State'].str.replace('CO', 'Colorado', regex=False)
walk["State"] = walk['State'].str.replace('CT', 'Connecticut', regex=False)
walk["State"] = walk['State'].str.replace('DC', 'District of Columbia', regex=False)
walk["State"] = walk['State'].str.replace('DE', 'Delaware', regex=False)
walk["State"] = walk['State'].str.replace('FL', 'Florida', regex=False)
walk["State"] = walk['State'].str.replace('GA', 'Georgia', regex=False)
walk["State"] = walk['State'].str.replace('GU', 'Guam', regex=False)
walk["State"] = walk['State'].str.replace('HI', 'Hawaii', regex=False)
walk["State"] = walk['State'].str.replace('IA', 'Iowa', regex=False)
walk["State"] = walk['State'].str.replace('ID', 'Idaho', regex=False)
walk["State"] = walk['State'].str.replace('IL', 'Illinois', regex=False)
walk["State"] = walk['State'].str.replace('IN', 'Indiana', regex=False)
walk["State"] = walk['State'].str.replace('KS', 'Kansas', regex=False)
walk["State"] = walk['State'].str.replace('KY', 'Kentucky', regex=False)
walk["State"] = walk['State'].str.replace('LA', 'Louisiana', regex=False)
walk["State"] = walk['State'].str.replace('MA', 'Massachusetts', regex=False)
walk["State"] = walk['State'].str.replace('MD', 'Maryland', regex=False)
walk["State"] = walk['State'].str.replace('ME', 'Maine', regex=False)
walk["State"] = walk['State'].str.replace('MI', 'Michigan', regex=False)
walk["State"] = walk['State'].str.replace('MN', 'Minnesota', regex=False)
walk["State"] = walk['State'].str.replace('MO', 'Missouri', regex=False)
walk["State"] = walk['State'].str.replace('MP', 'Northern Mariana Islands', regex=False)
walk["State"] = walk['State'].str.replace('MS', 'Mississippi', regex=False)
walk["State"] = walk['State'].str.replace('MT', 'Montana', regex=False)
walk["State"] = walk['State'].str.replace('NA', 'National', regex=False)
walk["State"] = walk['State'].str.replace('NC', 'North Carolina', regex=False)
walk["State"] = walk['State'].str.replace('ND', 'North Dakota', regex=False)
walk["State"] = walk['State'].str.replace('NE', 'Nebraska', regex=False)
walk["State"] = walk['State'].str.replace('NH', 'New Hampshire', regex=False)
walk["State"] = walk['State'].str.replace('NJ', 'New Jersey', regex=False)
walk["State"] = walk['State'].str.replace('NM', 'New Mexico', regex=False)
walk["State"] = walk['State'].str.replace('NV', 'Nevada', regex=False)
walk["State"] = walk['State'].str.replace('NY', 'New York', regex=False)
walk["State"] = walk['State'].str.replace('OH', 'Ohio', regex=False)
walk["State"] = walk['State'].str.replace('OK', 'Oklahoma', regex=False)
walk["State"] = walk['State'].str.replace('OR', 'Oregon', regex=False)
walk["State"] = walk['State'].str.replace('PA', 'Pennsylvania', regex=False)
walk["State"] = walk['State'].str.replace('PR', 'Puerto Rico', regex=False)
walk["State"] = walk['State'].str.replace('RI', 'Rhode Island', regex=False)
walk["State"] = walk['State'].str.replace('SC', 'South Carolina', regex=False)
walk["State"] = walk['State'].str.replace('SD', 'South Dakota', regex=False)
walk["State"] = walk['State'].str.replace('TN', 'Tennessee', regex=False)
walk["State"] = walk['State'].str.replace('TX', 'Texas', regex=False)
walk["State"] = walk['State'].str.replace('UT', 'Utah', regex=False)
walk["State"] = walk['State'].str.replace('VA', 'Virginia', regex=False)
walk["State"] = walk['State'].str.replace('VI', 'Virgin Islands', regex=False)
walk["State"] = walk['State'].str.replace('VT', 'Vermont', regex=False)
walk["State"] = walk['State'].str.replace('WA', 'Washington', regex=False)
walk["State"] = walk['State'].str.replace('WI', 'Wisconsin', regex=False)
walk["State"] = walk['State'].str.replace('WV', 'West Virginia', regex=False)
walk["State"] = walk['State'].str.replace('WY', 'Wyoming', regex=False)

In [624]:
# Drop the Unnamed: 0 columns, which seems like an extra sort of index

walk = walk.drop(['Unnamed: 0'], axis=1)
walk

Unnamed: 0,City,Zip Code,Walk Score,Transit Score,Bike Score,Population,State
0,Birmingham,35211.0,35,25,31,212237,Alabama
1,Montgomery,36109.0,26,16,38,205764,Alabama
2,Mobile,36605.0,32,--,39,195111,Alabama
3,Huntsville,35810.0,24,13,40,180105,Alabama
4,Tuscaloosa,,33,--,37,90468,Alabama
...,...,...,...,...,...,...,...
2495,Casper,,39,--,44,55316,Wyoming
2496,Laramie,,34,--,67,30816,Wyoming
2497,Gillette,,21,--,34,29087,Wyoming
2498,Rock Springs,,29,--,32,23036,Wyoming


In [625]:
# Check dtypes and notice Transit Score is an object because values contain "--". Should these values be replaced with 0's?

walk.dtypes

City              object
Zip Code         float64
Walk Score         int64
Transit Score     object
Bike Score         int64
Population         int64
State             object
dtype: object

In [626]:
# Count null values in all of the walk columns

walk.isnull().sum(axis = 0)

City                0
Zip Code         1489
Walk Score          0
Transit Score       0
Bike Score          0
Population          0
State               0
dtype: int64

In [627]:
 # Fill the null values so that we can change Zip Code column to an integer format and maybe use it join data later
 
 walk = walk.fillna(0)

In [628]:
# Change the Zip Code column to be integer type data

walk['Zip Code'] = walk['Zip Code'].astype(int)

In [629]:
# Make new Location column with City and State data, so the data can be joined with the other data on this column

walk['Location'] = walk['City'] + ',' + ' ' + walk['State']
walk

Unnamed: 0,City,Zip Code,Walk Score,Transit Score,Bike Score,Population,State,Location
0,Birmingham,35211,35,25,31,212237,Alabama,"Birmingham, Alabama"
1,Montgomery,36109,26,16,38,205764,Alabama,"Montgomery, Alabama"
2,Mobile,36605,32,--,39,195111,Alabama,"Mobile, Alabama"
3,Huntsville,35810,24,13,40,180105,Alabama,"Huntsville, Alabama"
4,Tuscaloosa,0,33,--,37,90468,Alabama,"Tuscaloosa, Alabama"
...,...,...,...,...,...,...,...,...
2495,Casper,0,39,--,44,55316,Wyoming,"Casper, Wyoming"
2496,Laramie,0,34,--,67,30816,Wyoming,"Laramie, Wyoming"
2497,Gillette,0,21,--,34,29087,Wyoming,"Gillette, Wyoming"
2498,Rock Springs,0,29,--,32,23036,Wyoming,"Rock Springs, Wyoming"


In [630]:
# Drop City and State columns since they are redundant with Location and they already exist in first merged data

walk = walk.drop(columns=['City', 'State'])

In [631]:
# Bin the walk score according to quartiles(also Bike Score and Transit score but need to investigate and missing value denoted as  "--")

# Describe the data to begin binning data with pandas cut or qcut

walk.describe()

Unnamed: 0,Zip Code,Walk Score,Bike Score,Population
count,2500.0,2500.0,2500.0,2500.0
mean,20787.8828,36.3648,43.452,66879.73
std,32514.371193,15.711144,11.58688,215737.6
min,0.0,2.0,7.0,16522.0
25%,0.0,25.0,36.0,22467.75
50%,0.0,35.0,43.0,32627.0
75%,36675.75,44.25,50.0,58570.0
max,99336.0,96.0,96.0,8175133.0


In [632]:
# Quartile cut the Walk Score data, to see how it is evenly distributed

pd.qcut(walk['Walk Score'], q=5)

0        (31.0, 38.0]
1        (23.0, 31.0]
2        (31.0, 38.0]
3        (23.0, 31.0]
4        (31.0, 38.0]
            ...      
2495     (38.0, 48.0]
2496     (31.0, 38.0]
2497    (1.999, 23.0]
2498     (23.0, 31.0]
2499     (23.0, 31.0]
Name: Walk Score, Length: 2500, dtype: category
Categories (5, interval[float64]): [(1.999, 23.0] < (23.0, 31.0] < (31.0, 38.0] < (38.0, 48.0] < (48.0, 96.0]]

In [633]:
# Bin the Walk Score data into categories and ranges

bins = [1.999, 23.0, 31.0, 38.0, 48.0, 96.0]
labels = ["Lowest Walk Score", "Lower Walk Score", "Average Walk Score", "Higher Walk Score", "Highest Walk Score"]
walk['Walk Score Categories'] = pd.cut(walk['Walk Score'], bins=bins, labels=labels)

bins = [1.999, 23.0, 31.0, 38.0, 48.0, 96.0]
labels = ["1.999-23.0", "23,0-31.0", "31.0-38.0", "38.0-48.0", "48.0-96.0"]
walk['Walk Score Ranges'] = pd.cut(walk['Walk Score'], bins=bins, labels=labels)

walk

Unnamed: 0,Zip Code,Walk Score,Transit Score,Bike Score,Population,Location,Walk Score Categories,Walk Score Ranges
0,35211,35,25,31,212237,"Birmingham, Alabama",Average Walk Score,31.0-38.0
1,36109,26,16,38,205764,"Montgomery, Alabama",Lower Walk Score,"23,0-31.0"
2,36605,32,--,39,195111,"Mobile, Alabama",Average Walk Score,31.0-38.0
3,35810,24,13,40,180105,"Huntsville, Alabama",Lower Walk Score,"23,0-31.0"
4,0,33,--,37,90468,"Tuscaloosa, Alabama",Average Walk Score,31.0-38.0
...,...,...,...,...,...,...,...,...
2495,0,39,--,44,55316,"Casper, Wyoming",Higher Walk Score,38.0-48.0
2496,0,34,--,67,30816,"Laramie, Wyoming",Average Walk Score,31.0-38.0
2497,0,21,--,34,29087,"Gillette, Wyoming",Lowest Walk Score,1.999-23.0
2498,0,29,--,32,23036,"Rock Springs, Wyoming",Lower Walk Score,"23,0-31.0"


In [634]:
# Quartile cut the Bike Score data, to see how it is evenly distributed

pd.qcut(walk['Bike Score'], q=5)

0       (6.999, 34.0]
1        (34.0, 41.0]
2        (34.0, 41.0]
3        (34.0, 41.0]
4        (34.0, 41.0]
            ...      
2495     (41.0, 46.0]
2496     (52.0, 96.0]
2497    (6.999, 34.0]
2498    (6.999, 34.0]
2499     (41.0, 46.0]
Name: Bike Score, Length: 2500, dtype: category
Categories (5, interval[float64]): [(6.999, 34.0] < (34.0, 41.0] < (41.0, 46.0] < (46.0, 52.0] < (52.0, 96.0]]

In [635]:
# Bin the Bike Score data into categories and ranges

bins = [6.999, 34.0, 41.0, 46.0, 52.0, 96.0]
labels = ["Lowest Bike Scores", "Lower Bike Scores", "Average Bike Scores", "Higher Bike Scores", "Highest Bike Scores"]
walk['Bike Score Categories'] = pd.cut(walk['Bike Score'], bins=bins, labels=labels)

bins = [6.999, 34.0, 41.0, 46.0, 52.0, 96.0]
labels = ["6.999-34.0", "34.0-41.0", "41.0-46.0", "46.0-52.0", "52.0-96.0"]
walk['Bike Score Ranges'] = pd.cut(walk['Bike Score'], bins=bins, labels=labels)

walk

Unnamed: 0,Zip Code,Walk Score,Transit Score,Bike Score,Population,Location,Walk Score Categories,Walk Score Ranges,Bike Score Categories,Bike Score Ranges
0,35211,35,25,31,212237,"Birmingham, Alabama",Average Walk Score,31.0-38.0,Lowest Bike Scores,6.999-34.0
1,36109,26,16,38,205764,"Montgomery, Alabama",Lower Walk Score,"23,0-31.0",Lower Bike Scores,34.0-41.0
2,36605,32,--,39,195111,"Mobile, Alabama",Average Walk Score,31.0-38.0,Lower Bike Scores,34.0-41.0
3,35810,24,13,40,180105,"Huntsville, Alabama",Lower Walk Score,"23,0-31.0",Lower Bike Scores,34.0-41.0
4,0,33,--,37,90468,"Tuscaloosa, Alabama",Average Walk Score,31.0-38.0,Lower Bike Scores,34.0-41.0
...,...,...,...,...,...,...,...,...,...,...
2495,0,39,--,44,55316,"Casper, Wyoming",Higher Walk Score,38.0-48.0,Average Bike Scores,41.0-46.0
2496,0,34,--,67,30816,"Laramie, Wyoming",Average Walk Score,31.0-38.0,Highest Bike Scores,52.0-96.0
2497,0,21,--,34,29087,"Gillette, Wyoming",Lowest Walk Score,1.999-23.0,Lowest Bike Scores,6.999-34.0
2498,0,29,--,32,23036,"Rock Springs, Wyoming",Lower Walk Score,"23,0-31.0",Lowest Bike Scores,6.999-34.0


In [636]:
# Replace "--" in Transit Score data with "0"

walk["Transit Score"] = walk['Transit Score'].str.replace('--', '0', regex=False)

In [637]:
# Make Transit Score data int

walk['Transit Score'] = walk['Transit Score'].astype('int')

In [638]:
walk.describe()

Unnamed: 0,Zip Code,Walk Score,Transit Score,Bike Score,Population
count,2500.0,2500.0,2500.0,2500.0,2500.0
mean,20787.8828,36.3648,13.3976,43.452,66879.73
std,32514.371193,15.711144,17.595595,11.58688,215737.6
min,0.0,2.0,0.0,7.0,16522.0
25%,0.0,25.0,0.0,36.0,22467.75
50%,0.0,35.0,0.0,43.0,32627.0
75%,36675.75,44.25,29.0,50.0,58570.0
max,99336.0,96.0,84.0,96.0,8175133.0


In [639]:
# Quartile cut the Bike Score data, to see how it is evenly distributed

pd.qcut(walk['Transit Score'], q=1)

# Leaving out Transit Score data for now, not enough data

0       (-0.001, 84.0]
1       (-0.001, 84.0]
2       (-0.001, 84.0]
3       (-0.001, 84.0]
4       (-0.001, 84.0]
             ...      
2495    (-0.001, 84.0]
2496    (-0.001, 84.0]
2497    (-0.001, 84.0]
2498    (-0.001, 84.0]
2499    (-0.001, 84.0]
Name: Transit Score, Length: 2500, dtype: category
Categories (1, interval[float64]): [(-0.001, 84.0]]

In [640]:
# Drop Zip Code because too granular, Population because it is from previous years (search for better walk scores), and drop Transit Score because too many missing values

walk = walk.drop(columns=['Zip Code', 'Population', 'Transit Score'])

In [641]:
# Check out cost data

cost

Unnamed: 0,City,State,Cost of Living Index
0,Aberdeen,WA,95.5
1,Abilene,TX,89.6
2,Adrian,MI,89.9
3,Akron,OH,90.8
4,Alamogordo,NM,86.5
...,...,...,...
504,Vero Beach,FL,100.0
505,Weirton,WV,82.6
506,Wheeling,WV,84.5
507,New London,CT,107.3


In [642]:
# Replace state abbreviations with full state names

cost["State"] = cost['State'].str.replace('AK', 'Alaska', regex=False)
cost["State"] = cost['State'].str.replace('AL', 'Alabama', regex=False)
cost["State"] = cost['State'].str.replace('AR', 'Arkansas', regex=False)
cost["State"] = cost['State'].str.replace('AS', 'American Samoa', regex=False)
cost["State"] = cost['State'].str.replace('AZ', 'Arizona', regex=False)
cost["State"] = cost['State'].str.replace('CA', 'California', regex=False)
cost["State"] = cost['State'].str.replace('CO', 'Colorado', regex=False)
cost["State"] = cost['State'].str.replace('CT', 'Connecticut', regex=False)
cost["State"] = cost['State'].str.replace('DC', 'District of Columbia', regex=False)
cost["State"] = cost['State'].str.replace('DE', 'Delaware', regex=False)
cost["State"] = cost['State'].str.replace('FL', 'Florida', regex=False)
cost["State"] = cost['State'].str.replace('GA', 'Georgia', regex=False)
cost["State"] = cost['State'].str.replace('GU', 'Guam', regex=False)
cost["State"] = cost['State'].str.replace('HI', 'Hawaii', regex=False)
cost["State"] = cost['State'].str.replace('IA', 'Iowa', regex=False)
cost["State"] = cost['State'].str.replace('ID', 'Idaho', regex=False)
cost["State"] = cost['State'].str.replace('IL', 'Illinois', regex=False)
cost["State"] = cost['State'].str.replace('IN', 'Indiana', regex=False)
cost["State"] = cost['State'].str.replace('KS', 'Kansas', regex=False)
cost["State"] = cost['State'].str.replace('KY', 'Kentucky', regex=False)
cost["State"] = cost['State'].str.replace('LA', 'Louisiana', regex=False)
cost["State"] = cost['State'].str.replace('MA', 'Massachusetts', regex=False)
cost["State"] = cost['State'].str.replace('MD', 'Maryland', regex=False)
cost["State"] = cost['State'].str.replace('ME', 'Maine', regex=False)
cost["State"] = cost['State'].str.replace('MI', 'Michigan', regex=False)
cost["State"] = cost['State'].str.replace('MN', 'Minnesota', regex=False)
cost["State"] = cost['State'].str.replace('MO', 'Missouri', regex=False)
cost["State"] = cost['State'].str.replace('MP', 'Northern Mariana Islands', regex=False)
cost["State"] = cost['State'].str.replace('MS', 'Mississippi', regex=False)
cost["State"] = cost['State'].str.replace('MT', 'Montana', regex=False)
cost["State"] = cost['State'].str.replace('NA', 'National', regex=False)
cost["State"] = cost['State'].str.replace('NC', 'North Carolina', regex=False)
cost["State"] = cost['State'].str.replace('ND', 'North Dakota', regex=False)
cost["State"] = cost['State'].str.replace('NE', 'Nebraska', regex=False)
cost["State"] = cost['State'].str.replace('NH', 'New Hampshire', regex=False)
cost["State"] = cost['State'].str.replace('NJ', 'New Jersey', regex=False)
cost["State"] = cost['State'].str.replace('NM', 'New Mexico', regex=False)
cost["State"] = cost['State'].str.replace('NV', 'Nevada', regex=False)
cost["State"] = cost['State'].str.replace('NY', 'New York', regex=False)
cost["State"] = cost['State'].str.replace('OH', 'Ohio', regex=False)
cost["State"] = cost['State'].str.replace('OK', 'Oklahoma', regex=False)
cost["State"] = cost['State'].str.replace('OR', 'Oregon', regex=False)
cost["State"] = cost['State'].str.replace('PA', 'Pennsylvania', regex=False)
cost["State"] = cost['State'].str.replace('PR', 'Puerto Rico', regex=False)
cost["State"] = cost['State'].str.replace('RI', 'Rhode Island', regex=False)
cost["State"] = cost['State'].str.replace('SC', 'South Carolina', regex=False)
cost["State"] = cost['State'].str.replace('SD', 'South Dakota', regex=False)
cost["State"] = cost['State'].str.replace('TN', 'Tennessee', regex=False)
cost["State"] = cost['State'].str.replace('TX', 'Texas', regex=False)
cost["State"] = cost['State'].str.replace('UT', 'Utah', regex=False)
cost["State"] = cost['State'].str.replace('VA', 'Virginia', regex=False)
cost["State"] = cost['State'].str.replace('VI', 'Virgin Islands', regex=False)
cost["State"] = cost['State'].str.replace('VT', 'Vermont', regex=False)
cost["State"] = cost['State'].str.replace('WA', 'Washington', regex=False)
cost["State"] = cost['State'].str.replace('WI', 'Wisconsin', regex=False)
cost["State"] = cost['State'].str.replace('WV', 'West Virginia', regex=False)
cost["State"] = cost['State'].str.replace('WY', 'Wyoming', regex=False)

In [643]:
# Make new Location column with City and State data, so the data can be joined with the other data on this column

cost['Location'] = cost['City'] + ',' + ' ' + cost['State']
cost

Unnamed: 0,City,State,Cost of Living Index,Location
0,Aberdeen,Washington,95.5,"Aberdeen, Washington"
1,Abilene,Texas,89.6,"Abilene, Texas"
2,Adrian,Michigan,89.9,"Adrian, Michigan"
3,Akron,Ohio,90.8,"Akron, Ohio"
4,Alamogordo,New Mexico,86.5,"Alamogordo, New Mexico"
...,...,...,...,...
504,Vero Beach,Florida,100.0,"Vero Beach, Florida"
505,Weirton,West Virginia,82.6,"Weirton, West Virginia"
506,Wheeling,West Virginia,84.5,"Wheeling, West Virginia"
507,New London,Connecticut,107.3,"New London, Connecticut"


In [644]:
# Drop City and State columns since they are redundant with Location and they already exist in first merged data

cost = cost.drop(columns=['City', 'State'])

In [645]:
# Bin the cost data according to quartiles

# Describe the data to begin binning data with pandas cut or qcut

cost.describe()

Unnamed: 0,Cost of Living Index
count,509.0
mean,97.500196
std,13.91586
min,78.8
25%,89.0
50%,93.1
75%,101.9
max,183.0


In [646]:
# Quartile cut the Walk Score data, to see how it is evenly distributed

pd.qcut(cost['Cost of Living Index'], q=5)

0                  (91.4, 96.18]
1                   (88.5, 91.4]
2                   (88.5, 91.4]
3                   (88.5, 91.4]
4      (78.79899999999999, 88.5]
                 ...            
504              (96.18, 103.74]
505    (78.79899999999999, 88.5]
506    (78.79899999999999, 88.5]
507              (103.74, 183.0]
508                 (88.5, 91.4]
Name: Cost of Living Index, Length: 509, dtype: category
Categories (5, interval[float64]): [(78.79899999999999, 88.5] < (88.5, 91.4] < (91.4, 96.18] < (96.18, 103.74] < (103.74, 183.0]]

In [647]:
# Bin the Cost of Living Index data into categories and ranges

bins = [78.79899999999999, 88.5, 91.4, 96.18, 103.74, 183.0]
labels = ["Lowest Cost of Living Index", "Lower Cost of Living Index", "Average Cost of Living Index", "Higher Cost of Living Index", "Highest Cost of Living Index"]
cost['Cost of Living Index Categories'] = pd.cut(cost['Cost of Living Index'], bins=bins, labels=labels)

bins = [78.79899999999999, 88.5, 91.4, 96.18, 103.74, 183.0]
labels = ["78.79-88.5", "88.5-91.4", "91.4-96.18", "96.18-103.74", "103.74-183.0"]
cost['Cost of Living Index Ranges'] = pd.cut(cost['Cost of Living Index'], bins=bins, labels=labels)

cost

Unnamed: 0,Cost of Living Index,Location,Cost of Living Index Categories,Cost of Living Index Ranges
0,95.5,"Aberdeen, Washington",Average Cost of Living Index,91.4-96.18
1,89.6,"Abilene, Texas",Lower Cost of Living Index,88.5-91.4
2,89.9,"Adrian, Michigan",Lower Cost of Living Index,88.5-91.4
3,90.8,"Akron, Ohio",Lower Cost of Living Index,88.5-91.4
4,86.5,"Alamogordo, New Mexico",Lowest Cost of Living Index,78.79-88.5
...,...,...,...,...
504,100.0,"Vero Beach, Florida",Higher Cost of Living Index,96.18-103.74
505,82.6,"Weirton, West Virginia",Lowest Cost of Living Index,78.79-88.5
506,84.5,"Wheeling, West Virginia",Lowest Cost of Living Index,78.79-88.5
507,107.3,"New London, Connecticut",Highest Cost of Living Index,103.74-183.0


In [648]:
# Check shape of merged_population_rent_crime data

merged_population_rent_crime.shape

(68, 24)

In [649]:
# Merge the walk data with the merged_population_rent_crime data

merged_population_rent_crime_walk = pd.merge(left=merged_population_rent_crime, right=walk, left_on='Location', right_on='Location')
merged_population_rent_crime_walk

# When is this walk_data from? Consider dropping the populatin data from walk_data since we already have 2019 population data, and 2019 crime population
# Also consider dropping Zip Code from the walk data because it is too high granularity and it does not match the true area of different areas with different rental rates

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,...,Rental Rate Categories,Rental Rate Ranges,Crime Rate Categories,Crime Rate Ranges,Walk Score,Bike Score,Walk Score Categories,Walk Score Ranges,Bike Score Categories,Bike Score Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,...,Average Rent,"$1,294.50-1,549",Average Crime,"7,951.131-9,683.737",41,56,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,...,Low Rent,"$581.99-1,294.50",Lower Crime,"5,831.013-7,951.131",42,67,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,...,Low Rent,"$581.99-1,294.50",Highest Crime,"11,316.718-16,274.42",32,32,Average Walk Score,31.0-38.0,Lowest Bike Scores,6.999-34.0
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,...,Average Rent,"$1,294.50-1,549",Average Crime,"7,951.131-9,683.737",34,44,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,...,Highest Rent,"$1891.50-6,373",Lower Crime,"5,831.013-7,951.131",42,49,Higher Walk Score,38.0-48.0,Higher Bike Scores,46.0-52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,...,Average Rent,"$1,294.50-1,549",Lowest Crime,"1,972.79-5,831.013",32,44,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0
64,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,...,Highest Rent,"$1891.50-6,373",Higher Crime,"9,683.737-11,316.718",74,70,Highest Walk Score,48.0-96.0,Highest Bike Scores,52.0-96.0
65,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,...,Low Rent,"$581.99-1,294.50",Highest Crime,"11,316.718-16,274.42",49,52,Highest Walk Score,48.0-96.0,Higher Bike Scores,46.0-52.0
66,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,...,Average Rent,"$1,294.50-1,549",Lowest Crime,"1,972.79-5,831.013",48,65,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0


In [650]:
# Merge the cost data with the merged_population_rent_crime_walk data

merged_population_rent_crime_walk_cost = pd.merge(left=merged_population_rent_crime_walk, right=cost, left_on='Location', right_on='Location')
merged_population_rent_crime_walk_cost

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,...,Crime Rate Ranges,Walk Score,Bike Score,Walk Score Categories,Walk Score Ranges,Bike Score Categories,Bike Score Ranges,Cost of Living Index,Cost of Living Index Categories,Cost of Living Index Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,...,"7,951.131-9,683.737",41,56,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,105.8,Highest Cost of Living Index,103.74-183.0
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,...,"5,831.013-7,951.131",42,67,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,99.7,Higher Cost of Living Index,96.18-103.74
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,...,"11,316.718-16,274.42",32,32,Average Walk Score,31.0-38.0,Lowest Bike Scores,6.999-34.0,89.4,Lower Cost of Living Index,88.5-91.4
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,...,"7,951.131-9,683.737",34,44,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0,103.6,Higher Cost of Living Index,96.18-103.74
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,...,"5,831.013-7,951.131",42,49,Higher Walk Score,38.0-48.0,Higher Bike Scores,46.0-52.0,116.8,Highest Cost of Living Index,103.74-183.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,...,"1,972.79-5,831.013",32,44,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0,101.1,Higher Cost of Living Index,96.18-103.74
64,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,...,"9,683.737-11,316.718",74,70,Highest Walk Score,48.0-96.0,Highest Bike Scores,52.0-96.0,129.3,Highest Cost of Living Index,103.74-183.0
65,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,...,"11,316.718-16,274.42",49,52,Highest Walk Score,48.0-96.0,Higher Bike Scores,46.0-52.0,99.9,Higher Cost of Living Index,96.18-103.74
66,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,...,"1,972.79-5,831.013",48,65,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,101.7,Higher Cost of Living Index,96.18-103.74


In [651]:
# Count unique Locations to be sure we are not losing any data with our merges, as we should not

merged_population_rent_crime['Location'].nunique()

68

In [652]:
merged_population_rent_crime_walk['Location'].nunique()

68

In [653]:
merged_population_rent_crime_walk_cost['Location'].nunique()

68

In [654]:
# Calculating Livability Score with Walk Score, Bike Score, Cost of Living Index, and Crime Rate (Subtracted from the max crime rate of 16274.419781 to reverse the scores)

merged_population_rent_crime_walk_cost['Livability Score'] = merged_population_rent_crime_walk_cost["Walk Score"] + merged_population_rent_crime_walk_cost["Bike Score"] + merged_population_rent_crime_walk_cost["Cost of Living Index"] + 16274.419781 - merged_population_rent_crime_walk_cost["Crime Rate"]

# https://scikit-learn.org/stable/modules/preprocessing.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler

In [655]:
# Scale the Livability score then multiply it by 100 to make the scores between 0 and 100

In [656]:
# Bin the cost data according to quartiles

# Describe the data to begin binning data with pandas cut or qcut

merged_population_rent_crime_walk_cost.describe()

Unnamed: 0,2019 Population,2019 Rental Rates,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Crime Rate,Walk Score,Bike Score,Cost of Living Index,Livability Score
count,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0
mean,449981.6,1448.794118,3677.397059,56.25,291.323529,979.235294,2350.588235,16483.838235,2615.676471,11702.088235,2166.073529,103.058824,8700.517414,48.676471,53.25,104.394118,7780.222955
std,504645.8,458.855701,4941.783853,87.296095,357.725587,1602.208384,3050.350321,19044.851768,2976.359418,13913.736598,2541.341127,138.326769,3229.769757,14.729847,12.203957,17.184312,3233.114074
min,49271.0,678.0,135.0,0.0,18.0,13.0,77.0,1069.0,163.0,728.0,81.0,1.0,1972.791138,13.0,30.0,88.0,218.0
25%,136093.0,1180.75,867.5,8.75,62.25,194.25,570.25,3640.75,511.75,2634.75,447.75,12.25,6334.47697,37.0,44.0,93.45,5677.419093
50%,242860.0,1369.0,1726.0,29.5,145.0,388.0,1163.5,9948.0,1498.5,7345.5,1026.0,47.5,8387.70033,46.0,52.0,101.0,8100.019451
75%,590990.2,1604.0,4493.0,51.25,367.25,1169.75,2889.25,20903.25,3907.75,13803.75,3047.75,136.75,10806.325688,59.25,61.25,107.7,10136.617811
max,2693976.0,3362.0,25532.0,492.0,1761.0,9147.0,15296.0,101750.0,17038.0,71614.0,13098.0,789.0,16274.419781,87.0,86.0,183.0,14454.628643


In [657]:
# Quartile cut the Livability Score data, to see how it is evenly distributed

pd.qcut(merged_population_rent_crime_walk_cost['Livability Score'], q=5)

0        (7083.74, 8557.383]
1        (7083.74, 8557.383]
2        (217.999, 4619.026]
3        (4619.026, 7083.74]
4      (8557.383, 10829.277]
               ...          
63    (10829.277, 14454.629]
64       (4619.026, 7083.74]
65       (217.999, 4619.026]
66     (8557.383, 10829.277]
67     (8557.383, 10829.277]
Name: Livability Score, Length: 68, dtype: category
Categories (5, interval[float64]): [(217.999, 4619.026] < (4619.026, 7083.74] < (7083.74, 8557.383] < (8557.383, 10829.277] < (10829.277, 14454.629]]

In [658]:
# Bin the Cost of Living Index data into categories and ranges

bins = [217.999, 5161.602, 6756.682, 8531.989, 10735.507, 14454.629]
labels = ["Lowest Livability Scores", "Lower Livability Scores", "Average Livability Scores", "Higher Livability Scores", "Highest Livability Scores"]
merged_population_rent_crime_walk_cost['Livability Score Categories'] = pd.cut(merged_population_rent_crime_walk_cost['Livability Score'], bins=bins, labels=labels)

bins = [217.999, 5161.602, 6756.682, 8531.989, 10735.507, 14454.629]
labels = ["217.99-5,161.60", "5,161.60-6,756.68", "6,756.68-8,531.98", "8,531.98-10,735.50", "10,735.50-14,454.62"]
merged_population_rent_crime_walk_cost['Livability Score Ranges'] = pd.cut(merged_population_rent_crime_walk_cost['Livability Score'], bins=bins, labels=labels)

merged_population_rent_crime_walk_cost

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,...,Walk Score Categories,Walk Score Ranges,Bike Score Categories,Bike Score Ranges,Cost of Living Index,Cost of Living Index Categories,Cost of Living Index Ranges,Livability Score,Livability Score Categories,Livability Score Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,...,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,105.8,Highest Cost of Living Index,103.74-183.0,8401.333638,Average Livability Scores,"6,756.68-8,531.98"
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,...,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,99.7,Higher Cost of Living Index,96.18-103.74,8531.989183,Higher Livability Scores,"8,531.98-10,735.50"
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,...,Average Walk Score,31.0-38.0,Lowest Bike Scores,6.999-34.0,89.4,Lower Cost of Living Index,88.5-91.4,1044.568889,Lowest Livability Scores,"217.99-5,161.60"
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,...,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0,103.6,Higher Cost of Living Index,96.18-103.74,7045.510728,Average Livability Scores,"6,756.68-8,531.98"
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,...,Higher Walk Score,38.0-48.0,Higher Bike Scores,46.0-52.0,116.8,Highest Cost of Living Index,103.74-183.0,9532.980283,Higher Livability Scores,"8,531.98-10,735.50"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,...,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0,101.1,Higher Cost of Living Index,96.18-103.74,12671.745839,Highest Livability Scores,"10,735.50-14,454.62"
64,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,...,Highest Walk Score,48.0-96.0,Highest Bike Scores,52.0-96.0,129.3,Highest Cost of Living Index,103.74-183.0,6237.440151,Lower Livability Scores,"5,161.60-6,756.68"
65,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,...,Highest Walk Score,48.0-96.0,Higher Bike Scores,46.0-52.0,99.9,Higher Cost of Living Index,96.18-103.74,3335.519438,Lowest Livability Scores,"217.99-5,161.60"
66,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,...,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,101.7,Higher Cost of Living Index,96.18-103.74,10783.251019,Highest Livability Scores,"10,735.50-14,454.62"


In [659]:
# Round all columns to 2 decimal places

merged_population_rent_crime_walk_cost = merged_population_rent_crime_walk_cost.round(2)

merged_population_rent_crime_walk_cost

Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,...,Walk Score Categories,Walk Score Ranges,Bike Score Categories,Bike Score Ranges,Cost of Living Index,Cost of Living Index Categories,Cost of Living Index Ranges,Livability Score,Livability Score Categories,Livability Score Ranges
0,"Phoenix, Arizona",1680992,city,1447,Arizona,Phoenix,11803,131,1139,3197,...,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,105.8,Highest Cost of Living Index,103.74-183.0,8401.33,Average Livability Scores,"6,756.68-8,531.98"
1,"Tucson, Arizona",548073,city,1248,Arizona,Tucson,3775,40,527,1105,...,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,99.7,Higher Cost of Living Index,96.18-103.74,8531.99,Higher Livability Scores,"8,531.98-10,735.50"
2,"Little Rock, Arkansas",197312,city,895,Arkansas,Little Rock,3009,38,209,391,...,Average Walk Score,31.0-38.0,Lowest Bike Scores,6.999-34.0,89.4,Lower Cost of Living Index,88.5-91.4,1044.57,Lowest Livability Scores,"217.99-5,161.60"
3,"Bakersfield, California",384145,city,1340,California,Bakersfield,1766,34,116,701,...,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0,103.6,Higher Cost of Living Index,96.18-103.74,7045.51,Average Livability Scores,"6,756.68-8,531.98"
4,"Riverside, California",331360,city,1944,California,Riverside,1686,17,139,476,...,Higher Walk Score,38.0-48.0,Higher Bike Scores,46.0-52.0,116.8,Highest Cost of Living Index,103.74-183.0,9532.98,Higher Livability Scores,"8,531.98-10,735.50"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,"Virginia Beach, Virginia",449974,city,1356,Virginia,Virginia Beach,581,30,79,196,...,Average Walk Score,31.0-38.0,Average Bike Scores,41.0-46.0,101.1,Higher Cost of Living Index,96.18-103.74,12671.75,Highest Livability Scores,"10,735.50-14,454.62"
64,"Seattle, Washington",753675,city,1964,Washington,Seattle,4471,28,358,1339,...,Highest Walk Score,48.0-96.0,Highest Bike Scores,52.0-96.0,129.3,Highest Cost of Living Index,103.74-183.0,6237.44,Lower Livability Scores,"5,161.60-6,756.68"
65,"Spokane, Washington",222081,city,1111,Washington,Spokane,1520,6,230,311,...,Highest Walk Score,48.0-96.0,Higher Bike Scores,46.0-52.0,99.9,Higher Cost of Living Index,96.18-103.74,3335.52,Lowest Livability Scores,"217.99-5,161.60"
66,"Madison, Wisconsin",259680,city,1415,Wisconsin,Madison,940,4,107,217,...,Higher Walk Score,38.0-48.0,Highest Bike Scores,52.0-96.0,101.7,Higher Cost of Living Index,96.18-103.74,10783.25,Highest Livability Scores,"10,735.50-14,454.62"


In [660]:
# Check the datatypes of each column in the df, category and object dtypes would be dropped if using groupby at this point for numerous locations, so it is done earlier

merged_population_rent_crime_walk_cost.dtypes

Location                                      object
2019 Population                                int32
Town or City                                  object
2019 Rental Rates                              int32
State                                         object
City                                          object
Violent crime                                  int32
Murder and nonnegligent manslaughter           int32
Rape                                           int32
Robbery                                        int32
Aggravated assault                             int32
Property crime                                 int32
Burglary                                       int32
Larceny-theft                                  int32
Motor vehicle theft                            int32
Arson                                          int32
Crime Population                              object
Crime Rate                                   float64
Urban Population by City Size Categories    ca

In [661]:
# Export merged_population_rent_crime_walk_cost with binned categories for csv and continue modeling in another notebook

merged_population_rent_crime_walk_cost.to_csv('../data/pop_rent_crime_walk_cost_livability_bins.csv')

In [662]:
# Consider scaling data between 0-100, like Livability Score since we calculated it, perhaps then it is more interpretable by users.

In [663]:
# Calculate the mean values of duplication Location entries (this drops all categorical columns so was instead used above on rent df with the most duplicates)

#merged_population_rent_crime_walk_cost = merged_population_rent_crime_walk_cost.groupby(['Location'], as_index=False).mean()
#merged_population_rent_crime_walk_cost