In [10]:
import pandas as pd
import boto3
import awswrangler as wr
import os

In [13]:
### Check whether the locally standardized population dataset is
### identical to the one we standardized with Spark + AWS EMR

# Inclusive year range shared by both dataframes
start_year, end_year = 2013, 2021

In [54]:
# Load the locally standardized population table and reshape it so it can be
# compared cell-by-cell with the Spark output.
#
# Built as a single method chain: the previous version renamed columns with
# inplace=True and assigned columns one by one on a frame that came from an
# .iloc slice, which is the pattern pandas flags as chained assignment.
local_year_cols = [str(year) for year in range(start_year, end_year + 1)]

local_df = (
    pd.read_csv(
        "datasets/cleaned_census_api_files/standardized/population_standardized.csv",
        # keep geoid_block as text so leading zeros (e.g. '010010201001') survive
        dtype={'geoid_block': str},
    )
    .drop(columns=['geoid_tract', 'state', 'county', 'tract', 'z_score', 'ratio'])
    # round every numeric column to whole numbers before the integer cast
    .round()
    # keep the first 10 columns: geoid_block plus the nine yearly columns
    # (assumes that is their position after the drop -- TODO confirm against the CSV layout)
    .iloc[:, :10]
    # '<year>_block' -> '<year>' so the labels match the Spark frame
    .rename(columns={f'{year}_block': year for year in local_year_cols})
    # raises if any yearly column is missing or still contains NaN
    .astype({year: int for year in local_year_cols})
    .sort_values('geoid_block')
    .reset_index(drop=True)
)
local_df

Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,010010201001,637,676,649,745,692,636,730,674,693
1,010010201002,1171,1224,1299,1265,1153,1287,1263,1267,1098
2,010010202001,1383,1289,1074,960,1020,810,835,706,844
3,010010202002,972,1053,1082,1236,1152,1218,1124,1051,1166
4,010010203001,2366,2376,2143,2364,2555,2641,2774,2912,2685
...,...,...,...,...,...,...,...,...,...,...
242330,721537506011,2562,2434,2346,2116,2032,1675,1874,1825,1894
242331,721537506012,1315,1125,1118,1034,899,931,966,1504,1820
242332,721537506013,2122,1815,1803,1669,1449,1501,1557,1276,1213
242333,721537506021,2332,2351,1994,2005,2055,1707,1577,1410,1295


In [55]:
# Read the Spark/EMR-standardized table from S3.
# setdefault only fills in the profile/region when the environment does not
# already define them.
os.environ.setdefault("AWS_PROFILE", "default")
os.environ.setdefault("AWS_DEFAULT_REGION", "us-west-1")

src_filepath = "s3://real-estate-wolff/census-data/block-groups/standardized/population_blocks_standardized.csv/"

spark_year_cols = [str(year) for year in range(start_year, end_year + 1)]

spark_df = (
    # geoid_block stays a string so it sorts and compares like the local frame
    wr.s3.read_csv(src_filepath, dtype={'geoid_block': str})
    .sort_values('geoid_block')
    .reset_index(drop=True)
    # round to whole numbers, then cast each yearly column to int
    .round()
    .astype({col: int for col in spark_year_cols})
)

spark_df

Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,010010201001,637,676,649,745,692,636,730,674,693
1,010010201002,1171,1224,1299,1265,1153,1287,1263,1267,1098
2,010010202001,1383,1289,1074,960,1020,810,835,706,844
3,010010202002,972,1053,1082,1236,1152,1218,1124,1051,1166
4,010010203001,2366,2376,2143,2364,2555,2641,2774,2912,2685
...,...,...,...,...,...,...,...,...,...,...
242330,721537506011,2562,2434,2346,2116,2032,1675,1874,1825,1894
242331,721537506012,1315,1125,1118,1034,899,931,966,1504,1820
242332,721537506013,2122,1815,1803,1669,1449,1501,1557,1276,1213
242333,721537506021,2332,2351,1994,2005,2055,1707,1577,1410,1295


In [39]:
# Identical-ness check: every cell where the two frames disagree becomes NaN,
# then only the rows whose geoid_block matched are kept for display.
matching_cells = local_df.where(local_df == spark_df)
matching_cells.dropna(subset=['geoid_block'])

Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,010010201001,637.0,676.0,649.0,745.0,692.0,636.0,730.0,674,693
1,010010201002,1171.0,1224.0,1299.0,1265.0,1153.0,1287.0,1263.0,1267,1098
2,010010202001,1383.0,1289.0,1074.0,960.0,1020.0,810.0,835.0,706,844
3,010010202002,972.0,1053.0,1082.0,1236.0,1152.0,1218.0,1124.0,1051,1166
4,010010203001,2366.0,2376.0,2143.0,2364.0,2555.0,2641.0,2774.0,2912,2685
...,...,...,...,...,...,...,...,...,...,...
242330,721537506011,2562.0,2434.0,2346.0,2116.0,2032.0,1675.0,1874.0,1825,1894
242331,721537506012,1315.0,1125.0,,1034.0,,,,1504,1820
242332,721537506013,,,1803.0,,1449.0,1501.0,1557.0,1276,1213
242333,721537506021,2332.0,2351.0,1994.0,2005.0,2055.0,1707.0,1577.0,1410,1295


In [56]:
# For each year, show the local rows whose value differs from the Spark frame
# at the same index position; years with no differences print nothing.
for year in map(str, range(start_year, end_year + 1)):
    differing_rows = local_df.loc[local_df[year] != spark_df[year]]
    if len(differing_rows) > 0:
        print(year)
        display(differing_rows)
        

2013


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
15463,60350403042,1091,966,818,861,835,789,671,682,972
26692,60650490012,2551,2838,3053,3146,3267,3081,3096,4099,4201
38413,80310026021,543,578,645,612,617,644,654,480,498
83018,181630002042,1579,1626,1577,1495,1500,1206,1099,648,645
83459,181790401002,729,711,633,855,964,968,1083,1145,1145
92265,212390501031,1847,1831,1456,1940,2141,2113,2261,1962,1564
125663,291650305021,737,835,986,1116,1054,1002,989,1136,967
134418,340076033014,991,976,966,925,1005,1052,1183,748,973
154524,360910613012,737,728,737,765,748,802,909,1143,1158
154525,360910613013,1563,1543,1560,1621,1586,1698,1927,1590,1563


2014


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
29559,60730076014,674,447,491,480,469,416,368,320,357
30172,60730133214,2629,2735,2797,3126,3352,3336,3648,3653,3522
43574,90117051011,1954,2305,2161,2180,2067,2041,1891,2141,1956
101545,245102303002,1030,1079,1063,1165,1199,1310,1299,1270,1342
107769,260350005002,455,523,528,524,544,561,559,568,508
141013,350490001032,558,515,450,488,353,380,322,232,279
142118,360050239006,950,1083,1022,891,1051,975,808,722,541
157276,361219706001,1613,1643,1611,1512,1457,1547,1479,1730,1755
170700,390930715001,1263,1069,1083,1222,1309,1231,1351,1305,1144
187252,420893003131,436,493,488,395,421,426,567,488,510


2015


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
43383,90093513004,1029,989,969,862,1141,1239,1201,1362,1262
50311,120710102101,1089,1165,1289,1206,1283,1564,1559,928,871
52869,120860100151,2680,2937,3093,3117,3317,2785,2440,2107,1811
52870,120860100152,1489,1063,921,990,793,1414,1492,1188,1347
66822,150030105091,709,788,727,693,716,730,584,486,573
67099,160010005013,1060,1110,1009,1103,1018,1193,1299,1205,1466
84687,191114906004,605,611,609,620,589,586,574,626,673
92957,220190018032,1252,1200,1333,1365,1135,1167,1100,882,824
104044,250173353022,577,706,845,942,943,1134,1085,831,836
111876,261251425001,1862,1970,2055,2243,2380,2229,2159,2214,2308


2016


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
42397,90035109001,1325,1243,1332,1193,1200,1404,1289,1474,1584
47006,120150201013,940,977,984,927,988,1076,1043,1064,1467
58290,121270830091,2487,2874,2764,2591,2614,2847,2811,2484,2338
65281,132551604012,892,717,883,1103,966,1003,1202,888,1066
92619,220150109002,1388,1539,1754,1999,2319,2165,1945,1113,1069
176216,401091068043,1069,1063,992,901,1034,924,739,481,547
176433,401091081065,294,326,311,333,321,498,519,845,698
186045,420710121061,742,788,865,1003,1067,1162,859,722,691
193322,450450024061,1560,1609,1517,1037,1369,1535,1608,1880,2127
210536,482015223012,1851,1798,1673,1633,1600,1545,1510,1532,1441


2017


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
11438,51450706002,1156,1068,1349,1392,1367,1424,1334,1195,1201
12964,60133032081,1981,1955,2068,2080,2135,2232,2218,1304,1304
34565,60855130003,478,444,508,605,599,550,551,649,577
47520,120210110033,1242,1114,1179,1224,1157,1162,1079,982,786
54095,120950167372,2311,2361,2601,3032,3017,3206,3488,4395,5478
115187,261635915022,12,12,13,13,12,13,14,7,0
117199,270530265143,1018,1011,1019,1114,1109,967,986,729,496
134418,340076033014,991,976,966,925,1005,1052,1183,748,973
150725,360650250021,980,817,751,723,847,852,914,1356,1266
155948,361031907065,1221,1388,1260,1164,1119,1109,1002,1708,2085


2018


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
3530,11170306162,1315,1383,1418,1460,1436,1463,1496,1259,1166
17717,60372652031,1331,1234,1126,1273,1138,1361,1171,1198,714
26581,60650456122,780,931,1165,1080,1049,911,932,594,193
29398,60730033031,1646,1594,1738,1871,2101,2079,2193,1860,1851
34549,60855125141,1166,1171,1098,1014,1164,1203,1303,1400,1393
34550,60855125142,1633,1639,1536,1420,1630,1683,1825,1605,1856
62638,131210114412,1307,1382,1553,1587,1675,1709,1734,1114,1157
67687,160270204033,1144,999,1122,1071,1098,1097,1077,968,1011
80368,180859620004,741,873,951,942,989,943,1059,1179,1230
107769,260350005002,455,523,528,524,544,561,559,568,508


2019


Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
31029,60730201052,1135,1314,1424,1366,1417,1422,1439,1365,1032
31030,60730201053,1207,1397,1514,1452,1506,1511,1529,1512,1694
44825,110010049023,1227,1296,1391,1554,1459,1647,1781,1835,1952
52788,120860093222,1131,1137,1121,934,706,660,635,1542,1294
57204,121113815082,1702,1610,1826,1892,1858,1646,1730,973,1094
58479,121319501042,495,468,491,498,562,553,503,149,207
58790,130210108001,1141,1179,1115,1145,1020,1095,1117,1188,1148
68333,160859701001,109,122,125,103,62,57,65,252,321
78742,180179516004,563,693,607,804,451,665,581,766,598
79863,180632106072,1143,1174,1180,1240,1380,1125,1281,1537,1937


In [60]:
# Look up a single block group by geoid in the locally standardized frame.
local_df.loc[local_df['geoid_block'] == '060730201052']

Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
31029,60730201052,1135,1314,1424,1366,1417,1422,1439,1365,1032


In [61]:
# Look up the same geoid in the Spark-standardized frame for a side-by-side check.
spark_df.loc[spark_df['geoid_block'] == '060730201052']

Unnamed: 0,geoid_block,2013,2014,2015,2016,2017,2018,2019,2020,2021
31029,60730201052,1135,1314,1424,1366,1417,1422,1438,1365,1032


In [40]:
# Show the column labels of the local frame (to check against spark_df's labels).
local_df.columns

Index(['geoid_block', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021'],
      dtype='object')

In [41]:
# Show the column labels of the Spark frame (to check against local_df's labels).
spark_df.columns

Index(['geoid_block', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021'],
      dtype='object')

In [42]:
# Strict equality check of the two frames.
# pandas.util.testing is deprecated (and removed in pandas 2.0); pd.testing is
# the public location of assert_frame_equal, and `pd` is already imported in
# the notebook's import cell, so no mid-notebook import is needed.
# Raises AssertionError naming the first differing column when the frames differ.
pd.testing.assert_frame_equal(local_df, spark_df)

AssertionError: DataFrame.iloc[:, 1] (column name="2013") are different

DataFrame.iloc[:, 1] (column name="2013") values are different (19.81059 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [637, 1171, 1383, 972, 2366, 691, 1088, 1895, 966, 454, 1859, 1991, 2076, 1288, 599, 1378, 1657, 800, 714, 1893, 1169, 904, 670, 1225, 713, 1371, 729, 480, 1764, 1324, 1721, 1707, 641, 1812, 1615, 1295, 1179, 1755, 1020, 1402, 969, 377, 745, 1283, 1146, 738, 1167, 1518, 1743, 1106, 857, 1956, 1639, 635, 1882, 958, 1525, 2331, 1491, 1192, 1411, 801, 1086, 603, 1645, 2458, 2526, 1105, 2750, 873, 1397, 1815, 1173, 2359, 2667, 1139, 3286, 1363, 2668, 1213, 2415, 988, 2329, 1711, 1700, 3182, 3013, 1515, 1943, 2170, 2240, 2238, 2284, 2178, 1216, 1196, 1946, 971, 2016, 2434, ...]
[right]: [637, 1171, 1383, 972, 2366, 691, 1088, 1895, 966, 454, 1859, 1991, 2077, 1288, 600, 1378, 1658, 800, 715, 1893, 1169, 904, 671, 1225, 714, 1371, 730, 480, 1764, 1325, 1721, 1708, 641, 1813, 1615, 1295, 1179, 1755, 1021, 1402, 970, 377, 745, 1284, 1146, 739, 1167, 1518, 1743, 1106, 858, 1956, 1639, 635, 1883, 958, 1525, 2331, 1491, 1192, 1412, 801, 1086, 603, 1645, 2458, 2526, 1105, 2750, 873, 1397, 1816, 1173, 2360, 2668, 1140, 3287, 1363, 2669, 1213, 2416, 988, 2329, 1711, 1700, 3182, 3013, 1515, 1943, 2170, 2240, 2238, 2284, 2178, 1216, 1196, 1946, 972, 2016, 2434, ...]