In [26]:
import pandas as pd
import os

In [27]:
cwd = os.getcwd()

In [28]:
df = pd.read_csv(cwd + "/data_pcv_project.csv")
df

Unnamed: 0,Company ID,Wage,Wage Type,Zip Code of Residence
0,4,112291.00,Salary,56024
1,4,64701.00,Salary,56082
2,4,10.81,Hourly,56001
3,4,53505.00,Salary,56024
4,4,20.48,Hourly,56001
...,...,...,...,...
486,33,10.52,Hourly,97527
487,33,10.52,Hourly,97527
488,33,10.34,Hourly,97530
489,33,10.34,Hourly,97544


The goal of this practice is to identify if there is a satatistically significant difference between the wages of jobs in rural locations vs urban locations.

In [29]:
df_mod = df.copy()

In [30]:
df_mod.dtypes

Company ID                int64
Wage                     object
Wage Type                object
Zip Code of Residence     int64
dtype: object

In [39]:
df_mod["Wage"] = df_mod["Wage"].str.replace(',', '').astype('float64')
df_mod.dtypes

Company ID                 int64
Wage                     float64
Wage Type                 object
Zip Code of Residence      int64
dtype: object

In [40]:
df_mod

Unnamed: 0,Company ID,Wage,Wage Type,Zip Code of Residence
0,4,112291.00,Salary,56024
1,4,64701.00,Salary,56082
2,4,10.81,Hourly,56001
3,4,53505.00,Salary,56024
4,4,20.48,Hourly,56001
...,...,...,...,...
486,33,10.52,Hourly,97527
487,33,10.52,Hourly,97527
488,33,10.34,Hourly,97530
489,33,10.34,Hourly,97544


We are bringing in census data from 2010 because it contains something called a MEMI (Metropolitan/Micropolitan Statistical Status Indicator)
1 = micropolitan
2 = metropolitan

We need this as a way to identify certain zip codes as rural/urban.

In [42]:
census = pd.read_csv(cwd + "/zcta_county_rel_census_2010.csv")
census

Unnamed: 0,ZCTA5,MEMI,CBSA,POPPT,HUPT,AREAPT,AREALANDPT,ZPOP,ZHU,ZAREA,...,MAREA,MAREALAND,ZPOPPCT,ZHUPCT,ZAREAPCT,ZAREALANDPCT,MPOPPCT,MHUPCT,MAREAPCT,MAREALANDPCT
0,601,2,10260,18465,7695,165132671,164333375,18570,7744,167459085,...,173777444,172725651,99.43,99.37,98.61,98.60,94.77,94.71,95.03,95.14
1,601,2,46580,105,49,2326414,2326414,18570,7744,167459085,...,298027589,294039825,0.57,0.63,1.39,1.40,0.32,0.35,0.78,0.79
2,602,1,10380,41520,18073,83734431,79288158,41520,18073,83734431,...,1283409618,928441564,100.00,100.00,100.00,100.00,13.56,13.02,6.52,8.54
3,603,1,10380,54689,25653,82063867,81880442,54689,25653,82063867,...,1283409618,928441564,100.00,100.00,100.00,100.00,17.86,18.47,6.39,8.82
4,606,1,41900,89,38,6679806,6679806,6615,2877,109592548,...,955442678,571498111,1.35,1.32,6.10,6.10,0.06,0.05,0.70,1.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28707,99760,1,21820,28,30,145297716,145297716,785,520,6916452905,...,19279062512,19005874393,3.57,5.77,2.10,2.12,0.03,0.07,0.75,0.76
28708,99775,1,21820,1251,166,300291,300291,1251,166,300291,...,19279062512,19005874393,100.00,100.00,100.00,100.00,1.28,0.40,0.00,0.00
28709,99801,2,27940,29164,11998,6467713465,5680932497,29164,11998,6467713465,...,8427543777,6997968643,100.00,100.00,100.00,100.00,93.25,91.90,76.74,81.18
28710,99824,2,27940,2111,1049,5507792,4472894,2111,1049,5507792,...,8427543777,6997968643,100.00,100.00,100.00,100.00,6.75,8.04,0.07,0.06


In [57]:
census_mod = census.copy()


In [58]:
census_mod = census_mod[["ZCTA5", "MEMI"]]
census_mod = census_mod.drop_duplicates(subset = "ZCTA5")
census_mod

Unnamed: 0,ZCTA5,MEMI
0,601,2
2,602,1
3,603,1
4,606,1
6,610,1
...,...,...
28707,99760,1
28708,99775,1
28709,99801,2
28710,99824,2


In [66]:
df_w_census = pd.merge(left = df_mod,
                right = census_mod,
                how = "left",
                left_on = "Zip Code of Residence",
                right_on = "ZCTA5")
print("Original Data:", len(df_mod))
print("Merged Data:", len(df_w_census))
df_w_census = df_w_census.drop(columns = "ZCTA5")
df_w_census = df_w_census[df_w_census["MEMI"].notna()]
print("Data After Drop:", len(df_w_census))
df_w_census["MEMI"] = df_w_census["MEMI"].astype('int64')


Original Data: 491
Merged Data: 491
Data After Drop: 434


In [67]:
df_w_census.dtypes

Company ID                 int64
Wage                     float64
Wage Type                 object
Zip Code of Residence      int64
MEMI                       int64
dtype: object

After this we have successfully completed data tidying and can now begin the analysis.