In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
train = pd.read_csv("train.csv")
train['cfips'] = train['cfips'].apply(str)
train['date'] = train['first_day_of_month'].apply(lambda s: pd.to_datetime(s))
train

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,date
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249,2019-08-01
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.884870,1198,2019-09-01
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269,2019-10-01
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243,2019-11-01
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243,2019-12-01
...,...,...,...,...,...,...,...,...
122260,56045_2022-06-01,56045,Weston County,Wyoming,2022-06-01,1.803249,101,2022-06-01
122261,56045_2022-07-01,56045,Weston County,Wyoming,2022-07-01,1.803249,101,2022-07-01
122262,56045_2022-08-01,56045,Weston County,Wyoming,2022-08-01,1.785395,100,2022-08-01
122263,56045_2022-09-01,56045,Weston County,Wyoming,2022-09-01,1.785395,100,2022-09-01


In [None]:
train[['microbusiness_density', 'active']].corr(method='spearman')

In [None]:
train[['microbusiness_density', 'active']].describe()

In [4]:
cfips_list = list(set(train['cfips']))

## Detect and clean outliers
Using the IQR method. If an obs is under $IQR - 1.5*IQR$ or above $IQR + 1.5*IQR$ it is an outlier. 

If it is an outlier, replace with the average of the previous and next obs. If the prev obs is not available (i.e. it is the first obs) use the next; if the next obs is not available (i.e. it is the last obs) use the prev.

In [119]:


res = []
outlier_indices = {}
for cfip in cfips_list:
    X = train.query(f"cfips == '{cfip}'").sort_values('date', ascending=True)
    
    y = X['microbusiness_density']
    q025, q75 = np.quantile(y, q = [0.25, 0.75])
    lo_iqr, hi_iqr = q025 - 1.5*q025, q75 + 1.5*q75
    
    def outlier(x, lo_iqr, hi_iqr):
        if x < 2*lo_iqr:
            return True
        if x > 2*hi_iqr:
            return True
        return False

    X['outlier'] = y.apply(outlier, args=(lo_iqr, hi_iqr))
    
    idx = (X['outlier'] == True)
    if sum(idx) > 0:
        print(cfip)
        outlier_indices[cfip] = idx
        X.loc[idx, ['microbusiness_density', 'active']] = np.nan

        X['microbusiness_density_ffill'] = X['microbusiness_density'].fillna(method='ffill').fillna(method='bfill')
        X['microbusiness_density_bfill'] = X['microbusiness_density'].fillna(method='bfill').fillna(method='ffill')
        X['microbusiness_density'] = .5*(X['microbusiness_density_bfill'] + X['microbusiness_density_ffill'])
        
        X['active_ffill'] = X['active'].fillna(method='ffill').fillna(method='bfill')
        X['active_bfill'] = X['active'].fillna(method='bfill').fillna(method='ffill')
        X['active'] = .5*(X['active_bfill'] + X['active_ffill'])
        
        X = X.drop(labels=['microbusiness_density_ffill', 
                       'microbusiness_density_bfill',
                       'active_ffill',
                       'active_bfill'
                      ],
               axis=1
              )

    res.append(X)
train_imputed = pd.concat(res, axis=0)

print(train_imputed.shape)
train_imputed.query("outlier == True")

46127
31009
29171
29063
(122265, 9)


Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,date,outlier
94380,46127_2019-08-01,46127,Union County,South Dakota,2019-08-01,5.880263,662.0,2019-08-01,True
94382,46127_2019-10-01,46127,Union County,South Dakota,2019-10-01,5.875822,661.5,2019-10-01,True
64595,31009_2020-07-01,31009,Blaine County,Nebraska,2020-07-01,0.266667,1.0,2020-07-01,True
61083,29171_2020-05-01,29171,Putnam County,Missouri,2020-05-01,1.437399,53.5,2020-05-01,True
59006,29063_2022-10-01,29063,DeKalb County,Missouri,2022-10-01,1.32329,130.0,2022-10-01,True


In [120]:
train_imputed

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,date,outlier
117624,54063_2019-08-01,54063,Monroe County,West Virginia,2019-08-01,0.859202,93.0,2019-08-01,False
117625,54063_2019-09-01,54063,Monroe County,West Virginia,2019-09-01,0.877679,95.0,2019-09-01,False
117626,54063_2019-10-01,54063,Monroe County,West Virginia,2019-10-01,0.914634,99.0,2019-10-01,False
117627,54063_2019-11-01,54063,Monroe County,West Virginia,2019-11-01,0.933112,101.0,2019-11-01,False
117628,54063_2019-12-01,54063,Monroe County,West Virginia,2019-12-01,0.914634,99.0,2019-12-01,False
...,...,...,...,...,...,...,...,...,...
94960,47021_2022-06-01,47021,Cheatham County,Tennessee,2022-06-01,5.811858,1838.0,2022-06-01,False
94961,47021_2022-07-01,47021,Cheatham County,Tennessee,2022-07-01,5.856126,1852.0,2022-07-01,False
94962,47021_2022-08-01,47021,Cheatham County,Tennessee,2022-08-01,5.878261,1859.0,2022-08-01,False
94963,47021_2022-09-01,47021,Cheatham County,Tennessee,2022-09-01,5.897233,1865.0,2022-09-01,False


In [121]:
train_imputed.query("outlier == True")

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,date,outlier
94380,46127_2019-08-01,46127,Union County,South Dakota,2019-08-01,5.880263,662.0,2019-08-01,True
94382,46127_2019-10-01,46127,Union County,South Dakota,2019-10-01,5.875822,661.5,2019-10-01,True
64595,31009_2020-07-01,31009,Blaine County,Nebraska,2020-07-01,0.266667,1.0,2020-07-01,True
61083,29171_2020-05-01,29171,Putnam County,Missouri,2020-05-01,1.437399,53.5,2020-05-01,True
59006,29063_2022-10-01,29063,DeKalb County,Missouri,2022-10-01,1.32329,130.0,2022-10-01,True


In [112]:
train_imputed.drop(labels=['outlier'], axis=1).sort_values(['cfips', 'date'], ascending=True).to_csv("train-imputed.csv", index=False)

In [127]:
for cfip in cfips_list:
    X = train_imputed.query(f"cfips == '{cfip}'").sort_values('date', ascending=True)
    y = X['microbusiness_density']
    q05, q025, q75, q95 = np.quantile(y, q = [0.05, 0.25, 0.75, .95])
    
    lo_iqr, hi_iqr = q025 - 1.5*q025, q75 + 1.5*q75
    lo_iqr, hi_iqr = q05, q95
    med = np.median(y)
    if (min(y) < lo_iqr) or (max(y) > hi_iqr):
        print(cfip, '-- loiqr:', lo_iqr, 'hiiqr:', hi_iqr, 'min:', min(y), 'median:', med, 'max:', max(y))

54063 -- loiqr: 0.9046615 hiiqr: 1.1971047 min: 0.85920179 median: 1.0686164 max: 1.2688709
29227 -- loiqr: 1.23334966 hiiqr: 1.4715099999999999 min: 1.1385199 median: 1.3605442 max: 1.546073
39071 -- loiqr: 1.6106846700000002 hiiqr: 1.85214619 min: 1.5959399 median: 1.7470045 max: 1.8741592
6069 -- loiqr: 4.26980847 hiiqr: 5.00951786 min: 4.2635746 median: 4.7049084 max: 5.0346804
35059 -- loiqr: 1.2500000599999999 hiiqr: 1.48005911 min: 1.2224939 median: 1.3648771 max: 1.506647
29019 -- loiqr: 6.6027743899999995 hiiqr: 7.1067889300000004 min: 6.4339795 median: 6.8513999 max: 7.1487088
29149 -- loiqr: 1.5884847800000002 hiiqr: 2.05184654 min: 1.5317286 median: 1.645448 max: 2.1521153
1051 -- loiqr: 2.6402397900000003 hiiqr: 3.35869308 min: 2.5783718 median: 2.7656362 max: 3.4046247
34023 -- loiqr: 7.43299673 hiiqr: 7.90512219 min: 7.4144273 median: 7.752809 max: 7.935492
40001 -- loiqr: 0.7312277 hiiqr: 0.994137593 min: 0.7312277 median: 0.9617756 max: 1.0049322
49013 -- loiqr: 4.0827

6045 -- loiqr: 5.97603694 hiiqr: 6.50235752 min: 5.8944573 median: 6.2234612 max: 6.5690765
42087 -- loiqr: 2.17374897 hiiqr: 2.3083928499999997 min: 2.1607773 median: 2.2359591 max: 2.3099186
1067 -- loiqr: 2.42408598 hiiqr: 2.7531799 min: 2.4027123 median: 2.6446037 max: 2.8235643
13263 -- loiqr: 0.64638782 hiiqr: 1.0128033 min: 0.62405449 median: 0.84081787 max: 1.0701319
20015 -- loiqr: 3.97014572 hiiqr: 4.09804176 min: 3.9412861 median: 3.9985476 max: 4.115499
50011 -- loiqr: 3.69895887 hiiqr: 5.218952209999999 min: 3.6814778 median: 4.0543017 max: 5.2329597
12089 -- loiqr: 6.25519589 hiiqr: 6.76725517 min: 6.1392565 median: 6.3932428 max: 6.8235364
27023 -- loiqr: 1.71345466 hiiqr: 2.17286384 min: 1.6917905 median: 1.9751201 max: 2.1827354
48335 -- loiqr: 0.884000643 hiiqr: 1.0707241 min: 0.84857351 median: 0.95098758 max: 1.1000144
17143 -- loiqr: 4.1005016 hiiqr: 4.53508261 min: 4.0654163 median: 4.3069963 max: 4.5723891
55115 -- loiqr: 1.88528686 hiiqr: 1.97073692 min: 1.87344

32019 -- loiqr: 3.19375285 hiiqr: 3.66754866 min: 3.1820564 median: 3.3484535 max: 3.7118113
55083 -- loiqr: 2.87412422 hiiqr: 3.9181596200000004 min: 2.7638526 median: 3.5859358 max: 3.969723
56005 -- loiqr: 2.4135726 hiiqr: 2.76389862 min: 2.3833656 median: 2.6624444 max: 2.7973258
38057 -- loiqr: 1.63174319 hiiqr: 2.3519663100000003 min: 1.6049933 median: 1.7755134 max: 2.3965807
54049 -- loiqr: 1.88153 hiiqr: 2.12532727 min: 1.8638215 median: 2.056983 max: 2.1457417
8059 -- loiqr: 10.417515700000001 hiiqr: 11.405503 min: 10.379384 median: 10.705688 max: 11.479008
21157 -- loiqr: 2.80791672 hiiqr: 2.9709737 min: 2.7916734 median: 2.8890772 max: 2.9830346
17141 -- loiqr: 2.7385893 hiiqr: 3.3755624600000003 min: 2.7360744 median: 3.2357786 max: 3.3925686
51115 -- loiqr: 5.17269163 hiiqr: 5.96009645 min: 5.1605968 median: 5.4436293 max: 5.9855385
4021 -- loiqr: 4.3149457600000005 hiiqr: 4.66213764 min: 4.3044987 median: 4.5041318 max: 4.6939907
29151 -- loiqr: 1.6792578599999999 hiiqr:

27173 -- loiqr: 1.8818124699999998 hiiqr: 2.09132059 min: 1.8593689 median: 1.9462465 max: 2.1166134
54037 -- loiqr: 5.319610160000001 hiiqr: 6.0615156500000005 min: 5.2671218 median: 5.6545401 max: 6.088737
33003 -- loiqr: 7.71397847 hiiqr: 8.11818274 min: 7.6795268 median: 7.9478383 max: 8.1849365
6043 -- loiqr: 7.1111112400000005 hiiqr: 7.871647100000001 min: 7.0919065 median: 7.3022451 max: 7.9698215
40133 -- loiqr: 1.0093759900000001 hiiqr: 1.13891073 min: 0.99311227 median: 1.1105772 max: 1.1549826
45019 -- loiqr: 12.1089655 hiiqr: 13.8054285 min: 12.007542 median: 12.603252 max: 14.086524
28155 -- loiqr: 0.618015381 hiiqr: 0.830324915 min: 0.60589743 median: 0.73815596 max: 0.92394215
19135 -- loiqr: 1.0390481 hiiqr: 1.20500102 min: 1.0390481 median: 1.1369421 max: 1.2217885
28139 -- loiqr: 1.30382269 hiiqr: 1.57619478 min: 1.2681925 median: 1.4186538 max: 1.5930268
48237 -- loiqr: 1.8944189 hiiqr: 2.2888825 min: 1.8491973 median: 2.1072247 max: 2.3173158
39009 -- loiqr: 4.22722

45075 -- loiqr: 1.71470889 hiiqr: 2.1336950800000003 min: 1.6968768 median: 2.0156152 max: 2.1658361
9015 -- loiqr: 3.52176386 hiiqr: 4.04863523 min: 3.4920397 median: 3.5848753 max: 4.0765867
17117 -- loiqr: 2.09269696 hiiqr: 2.19723236 min: 2.0901721 median: 2.1492455 max: 2.3184226
12119 -- loiqr: 2.98367329 hiiqr: 3.3524501100000004 min: 2.9006572 median: 3.2501907 max: 3.3622091
50023 -- loiqr: 6.59072032 hiiqr: 6.75996643 min: 6.5599799 median: 6.6514344 max: 6.8234549
37185 -- loiqr: 1.15408152 hiiqr: 1.5479723900000002 min: 1.1258767 median: 1.4519503 max: 1.5764095
5013 -- loiqr: 0.28409091 hiiqr: 0.573505558 min: 0.28409091 median: 0.37878788 max: 0.64485312
41061 -- loiqr: 50.9558206 hiiqr: 61.1653023 min: 50.874134 median: 53.367092 max: 61.442341
29039 -- loiqr: 2.42836945 hiiqr: 2.84982761 min: 2.1175578 median: 2.7372944 max: 2.8961136
42077 -- loiqr: 5.40514977 hiiqr: 6.13468521 min: 5.3814487 median: 5.6268535 max: 6.170928
48047 -- loiqr: 0.37357814899999997 hiiqr: 0.

31079 -- loiqr: 2.31390182 hiiqr: 2.5469832 min: 2.3019681 median: 2.4618282 max: 2.5874472
1049 -- loiqr: 2.14333637 hiiqr: 2.3179935300000003 min: 2.1192918 median: 2.2286246 max: 2.3271375
20041 -- loiqr: 1.66895338 hiiqr: 1.91407347 min: 1.6383287 median: 1.7835993 max: 1.9554248
30075 -- loiqr: 1.92995533 hiiqr: 3.9045553 min: 1.7883756 median: 3.4175334 max: 4.0491686
17005 -- loiqr: 2.15962102 hiiqr: 2.7341412700000003 min: 1.9809353 median: 2.316056 max: 2.8201675
29186 -- loiqr: 1.9806936 hiiqr: 2.12098223 min: 1.9617707 median: 2.0622985 max: 2.1477847
6067 -- loiqr: 6.90208499 hiiqr: 7.58282947 min: 6.8845 median: 7.1063976 max: 7.6843801
38103 -- loiqr: 1.58407526 hiiqr: 1.8023834 min: 1.536 median: 1.664 max: 1.8209409
51049 -- loiqr: 0.8971890770000001 hiiqr: 1.2949092 min: 0.87280196 median: 1.1263279 max: 1.3076044
26107 -- loiqr: 2.28959473 hiiqr: 2.6300782899999997 min: 2.2816389 median: 2.3895171 max: 2.7130799
2105 -- loiqr: 4.951561 hiiqr: 5.5679288 min: 4.951561 m

19193 -- loiqr: 2.69333737 hiiqr: 2.94111402 min: 2.6896734 median: 2.7226028 max: 2.9462783
55085 -- loiqr: 4.55847681 hiiqr: 5.41561555 min: 4.4508157 median: 4.9662094 max: 5.4551654
37153 -- loiqr: 1.1796200000000001 hiiqr: 1.2640004 min: 1.1689845 median: 1.2110176 max: 1.2697591
51107 -- loiqr: 13.8704607 hiiqr: 15.456912899999999 min: 13.761666 median: 14.195013 max: 15.552212
2170 -- loiqr: 5.3382339 hiiqr: 5.76766592 min: 5.2810726 median: 5.5407228 max: 5.7797441
47175 -- loiqr: 0.99180681 hiiqr: 1.2765957 min: 0.97872341 median: 1.1276596 max: 1.2978723
21043 -- loiqr: 0.70002836 hiiqr: 0.902914931 min: 0.67273074 median: 0.78643167 max: 0.96362323
35037 -- loiqr: 1.54873377 hiiqr: 1.8018018 min: 1.5195702 median: 1.7086052 max: 1.8018018
21167 -- loiqr: 2.72755435 hiiqr: 3.2837773500000003 min: 2.7235966 median: 3.0039618 max: 3.2996275
48117 -- loiqr: 0.917883718 hiiqr: 1.1319135 min: 0.91028309 median: 1.080031 max: 1.1888112
21051 -- loiqr: 0.432992271 hiiqr: 0.640346043

29049 -- loiqr: 2.98475929 hiiqr: 3.61282435 min: 2.8688524 median: 3.5331633 max: 3.6494474
5015 -- loiqr: 5.20456593 hiiqr: 6.21817699 min: 5.1390486 median: 5.8957953 max: 6.2733827
12035 -- loiqr: 8.56724298 hiiqr: 9.43740699 min: 8.4386272 median: 8.9327135 max: 9.514822
17025 -- loiqr: 1.7471836 hiiqr: 1.85242497 min: 1.7376361 median: 1.8013632 max: 1.9213803
19197 -- loiqr: 1.2734928 hiiqr: 1.56484573 min: 1.2632228 median: 1.4046685 max: 1.5846539
28149 -- loiqr: 2.06404998 hiiqr: 2.38949775 min: 2.0411582 median: 2.2355537 max: 2.4025128
30031 -- loiqr: 12.0621434 hiiqr: 13.1130081 min: 12.041204 median: 12.387896 max: 13.614101
35053 -- loiqr: 2.80219363 hiiqr: 3.59103762 min: 2.7805669 median: 2.9811757 max: 3.6323316
53075 -- loiqr: 3.2191777 hiiqr: 3.6204075600000003 min: 3.161567 median: 3.2779109 max: 3.7001021
38075 -- loiqr: 1.2195122 hiiqr: 1.6175329 min: 1.2195122 median: 1.416122 max: 1.6736401
19033 -- loiqr: 2.7438221 hiiqr: 3.89578666 min: 2.7320962 median: 3.80

40087 -- loiqr: 3.99210468 hiiqr: 4.5261492699999994 min: 3.967922 median: 4.3503599 max: 4.5635056
18127 -- loiqr: 4.525779549999999 hiiqr: 4.96534081 min: 4.4718642 median: 4.7993426 max: 4.9952474
54087 -- loiqr: 1.0202811280000001 hiiqr: 1.30450423 min: 0.98736882 median: 1.1791383 max: 1.3215859
8113 -- loiqr: 21.5081763 hiiqr: 23.1642845 min: 21.214415 median: 22.483479 max: 23.54052
28099 -- loiqr: 1.26511412 hiiqr: 1.5355539 min: 1.2142688 median: 1.3849633 max: 1.5497283
16075 -- loiqr: 2.83757671 hiiqr: 3.15774118 min: 2.7509511 median: 3.0609212 max: 3.1806998
37191 -- loiqr: 1.83469738 hiiqr: 2.04855532 min: 1.8242438 median: 1.9757726 max: 2.0633845
8109 -- loiqr: 6.4042722 hiiqr: 7.012261420000001 min: 6.3686709 median: 6.7513747 max: 7.1193018
1005 -- loiqr: 0.9950059520000001 hiiqr: 1.2078368 min: 0.9829942 median: 1.1020246 max: 1.2320744
28045 -- loiqr: 2.30342235 hiiqr: 2.52955277 min: 2.2877932 median: 2.3887346 max: 2.5902925
40149 -- loiqr: 1.3736924899999998 hiiq

40007 -- loiqr: 1.28323603 hiiqr: 2.1271334 min: 1.2594458 median: 1.3468869 max: 2.1449704
26135 -- loiqr: 1.8431314 hiiqr: 2.1670902 min: 1.8146371 median: 1.9578538 max: 2.2119265
41013 -- loiqr: 4.62281998 hiiqr: 5.5169850899999995 min: 4.5650349 median: 5.0075874 max: 5.5530138
5041 -- loiqr: 0.699176106 hiiqr: 0.90569454 min: 0.66800267 median: 0.82644629 max: 0.91701573
51019 -- loiqr: 3.99256794 hiiqr: 4.27972464 min: 3.9568462 median: 4.1231842 max: 4.3271894
13205 -- loiqr: 0.855587079 hiiqr: 1.1449494 min: 0.82836121 median: 1.0858778 max: 1.1565733
18131 -- loiqr: 1.5483072 hiiqr: 1.7990179100000003 min: 1.5483072 median: 1.6102395 max: 2.0103564
48277 -- loiqr: 2.75694878 hiiqr: 3.0502746 min: 2.7158792 median: 2.8767269 max: 3.0550275
17073 -- loiqr: 1.99581496 hiiqr: 2.14746789 min: 1.9678788 median: 2.033587 max: 2.1569142
48027 -- loiqr: 3.5875740900000004 hiiqr: 4.23796659 min: 3.5487683 median: 3.7551391 max: 4.2745075
8123 -- loiqr: 6.0972738 hiiqr: 6.84631223000000

17001 -- loiqr: 3.94700065 hiiqr: 4.46353665 min: 3.923769 median: 4.2862167 max: 4.5619874
46093 -- loiqr: 4.40235305 hiiqr: 5.41068132 min: 4.2755122 median: 4.9730463 max: 5.4419889
18037 -- loiqr: 3.0735756199999997 hiiqr: 3.5062456 min: 3.055547 median: 3.181293 max: 3.509376
35043 -- loiqr: 5.00560205 hiiqr: 5.49211973 min: 4.9230537 median: 5.272809 max: 5.5249429
31001 -- loiqr: 2.3633494 hiiqr: 2.4920847000000004 min: 2.3550425 median: 2.4282744 max: 2.5037494
46077 -- loiqr: 3.3227897200000003 hiiqr: 4.391944280000001 min: 3.3028455 median: 3.5407445 max: 4.4668217
53023 -- loiqr: 1.5087361000000001 hiiqr: 1.86711227 min: 1.2820513 median: 1.7699115 max: 1.9469026
21113 -- loiqr: 4.67285796 hiiqr: 5.2931787 min: 4.660964 median: 4.8751049 max: 5.7348776
21173 -- loiqr: 1.95437559 hiiqr: 2.35115848 min: 1.9382423 median: 2.1026621 max: 2.3870578
19007 -- loiqr: 1.75497707 hiiqr: 1.9286739000000002 min: 1.7457887 median: 1.8493646 max: 1.9549336
45013 -- loiqr: 7.27462688000000

45025 -- loiqr: 1.15503377 hiiqr: 1.25150757 min: 1.1465254 median: 1.2081966 max: 1.2642295
48163 -- loiqr: 0.75351971 hiiqr: 0.8821438770000001 min: 0.75351971 median: 0.83283758 max: 0.91365921
48477 -- loiqr: 4.774297809999999 hiiqr: 5.63227529 min: 4.7529774 median: 4.9957538 max: 5.7515726
12047 -- loiqr: 1.32655651 hiiqr: 1.459478 min: 1.2937397 median: 1.3850416 max: 1.5024039
48317 -- loiqr: 2.19501377 hiiqr: 2.6143792 min: 2.0984457 median: 2.357513 max: 2.6683939
6035 -- loiqr: 1.93231236 hiiqr: 2.57624599 min: 1.6600492 median: 2.3613763 max: 2.5900214
22063 -- loiqr: 3.0957024399999997 hiiqr: 3.3777008 min: 3.0763953 median: 3.263586 max: 3.3921497
18007 -- loiqr: 2.99038471 hiiqr: 3.3455994600000003 min: 2.9547553 median: 3.0847147 max: 3.3731985
21047 -- loiqr: 1.90550245 hiiqr: 2.13718248 min: 1.8853434 median: 1.9762846 max: 2.1646504
31053 -- loiqr: 2.92956628 hiiqr: 3.06713461 min: 2.9263189 median: 2.977258 max: 3.0868397
48063 -- loiqr: 2.66744016 hiiqr: 2.95942218

6105 -- loiqr: 3.56722449 hiiqr: 3.74780863 min: 3.5497518 median: 3.6482694 max: 3.7668593
19165 -- loiqr: 2.3979816 hiiqr: 3.2994932699999997 min: 2.3475208 median: 2.8301888 max: 3.3701658
13017 -- loiqr: 1.6090601800000002 hiiqr: 1.78123864 min: 1.5518458 median: 1.7097354 max: 1.8120223
17151 -- loiqr: 0.64552343 hiiqr: 0.98873937 min: 0.64552343 median: 0.82394946 max: 1.0162044
42041 -- loiqr: 5.234566190000001 hiiqr: 5.48338513 min: 5.2004566 median: 5.3519316 max: 5.4906187
47125 -- loiqr: 4.30885056 hiiqr: 4.659620370000001 min: 4.2583747 median: 4.4513388 max: 4.7354417
48133 -- loiqr: 2.94972456 hiiqr: 3.44729538 min: 2.8994491 median: 3.1247849 max: 3.467128
51109 -- loiqr: 4.522043050000001 hiiqr: 4.7212343 min: 4.3428254 median: 4.6077166 max: 4.7783556
55027 -- loiqr: 2.3909013999999997 hiiqr: 2.57342079 min: 2.3479373 median: 2.4874983 max: 2.5925024
1021 -- loiqr: 1.40087466 hiiqr: 1.77270977 min: 1.3853979 median: 1.5649312 max: 1.8132015
16077 -- loiqr: 3.171365 hii

37113 -- loiqr: 6.54200267 hiiqr: 9.59995752 min: 6.4128184 median: 6.9550762 max: 9.6352863
28035 -- loiqr: 2.3801022 hiiqr: 4.24279157 min: 2.3565714 median: 3.0225871 max: 4.2767143
55123 -- loiqr: 3.48867926 hiiqr: 3.81619035 min: 3.4774895 median: 3.709564 max: 3.8531702
47105 -- loiqr: 5.08184424 hiiqr: 9.58591004 min: 4.9614692 median: 8.9496527 max: 9.6089811
37005 -- loiqr: 2.94619222 hiiqr: 3.9655735 min: 2.8819406 median: 3.7953796 max: 3.9764681
22059 -- loiqr: 0.866783509 hiiqr: 0.980664871 min: 0.85889572 median: 0.93646073 max: 0.99337751
22049 -- loiqr: 0.85695982 hiiqr: 1.19022746 min: 0.85695982 median: 0.98242813 max: 1.2004511
29183 -- loiqr: 6.76193936 hiiqr: 8.49524172 min: 6.7088156 median: 7.9608016 max: 8.5126715
29101 -- loiqr: 2.3583205 hiiqr: 2.7051556100000003 min: 2.3498695 median: 2.4092095 max: 2.7179852
46019 -- loiqr: 3.49068797 hiiqr: 4.73684201 min: 3.4014485 median: 4.1155982 max: 4.822401
21147 -- loiqr: 0.492600112 hiiqr: 0.65136933 min: 0.4720460

21015 -- loiqr: 5.84913704 hiiqr: 6.85763675 min: 5.7537436 median: 6.21207 max: 6.9305677
1031 -- loiqr: 2.0348704300000002 hiiqr: 2.19651002 min: 1.9998478 median: 2.0880721 max: 2.2163165
29147 -- loiqr: 2.0119226 hiiqr: 2.0961790500000004 min: 1.9799873 median: 2.0421152 max: 2.1527448
22023 -- loiqr: 0.511644966 hiiqr: 0.70867652 min: 0.49514028 median: 0.6149832 max: 0.72782993
48149 -- loiqr: 4.528617000000001 hiiqr: 5.2942352 min: 4.4026113 median: 4.913065 max: 5.3142514
51003 -- loiqr: 6.15958698 hiiqr: 6.90848889 min: 6.1518173 median: 6.3192878 max: 7.163506
5133 -- loiqr: 0.833058135 hiiqr: 0.97919214 min: 0.81893373 median: 0.87311304 max: 0.97919214
56013 -- loiqr: 3.0583096 hiiqr: 3.2745156200000003 min: 3.0515881 median: 3.1673753 max: 3.311347
29143 -- loiqr: 0.92767066 hiiqr: 1.3346404 min: 0.90592837 median: 1.1146615 max: 1.349721
19115 -- loiqr: 1.29420848 hiiqr: 1.556226 min: 1.2862109 median: 1.3757225 max: 1.5666705
37083 -- loiqr: 1.6515458 hiiqr: 1.99151797 m

16019 -- loiqr: 7.62430249 hiiqr: 9.008325769999999 min: 7.529644 median: 8.081419 max: 9.0851564
25023 -- loiqr: 6.386341450000001 hiiqr: 6.8173303 min: 6.335001 median: 6.5618687 max: 6.834125
21217 -- loiqr: 1.86208292 hiiqr: 2.09400838 min: 1.8382167 median: 2.0576961 max: 2.1131732
56045 -- loiqr: 1.5307939 hiiqr: 1.8032495 min: 1.5192136 median: 1.6621984 max: 1.8032495
13083 -- loiqr: 2.07446404 hiiqr: 2.662614 min: 2.0370793 median: 2.2296884 max: 2.6899695
56019 -- loiqr: 18.6780841 hiiqr: 28.3945443 min: 18.591715 median: 20.078562 max: 29.740669
27073 -- loiqr: 1.13518014 hiiqr: 1.35020561 min: 1.0991024 median: 1.2742382 max: 1.3690922
6113 -- loiqr: 5.26744318 hiiqr: 5.98313899 min: 5.2502246 median: 5.4954658 max: 6.0715184
18059 -- loiqr: 4.1529523500000005 hiiqr: 4.39146046 min: 4.0395255 median: 4.2799964 max: 4.4136262
6063 -- loiqr: 4.76582425 hiiqr: 5.207214120000001 min: 4.7090354 median: 5.0261846 max: 5.2305532
46005 -- loiqr: 1.74527951 hiiqr: 2.28776233 min: 1.

22029 -- loiqr: 1.21814504 hiiqr: 1.6066388600000001 min: 1.1927264 median: 1.4201341 max: 1.619406
26155 -- loiqr: 2.60638987 hiiqr: 2.85616987 min: 2.592062 median: 2.7922235 max: 2.8728685
31051 -- loiqr: 1.2728958300000002 hiiqr: 1.51285951 min: 1.228843 median: 1.396323 max: 1.5610439
51153 -- loiqr: 6.79575819 hiiqr: 7.260671599999999 min: 6.6841097 median: 6.9983521 max: 7.3120022
1115 -- loiqr: 2.81509387 hiiqr: 2.9758328499999998 min: 2.8095014 median: 2.8903809 max: 2.997093
18113 -- loiqr: 1.98238577 hiiqr: 2.11184142 min: 1.9637164 median: 2.0492375 max: 2.1414666
24033 -- loiqr: 6.29512711 hiiqr: 7.516388780000001 min: 6.2548389 median: 6.9599919 max: 7.5547981
27011 -- loiqr: 1.748215 hiiqr: 2.464169 min: 1.7241379 median: 1.974013 max: 2.4893136
29165 -- loiqr: 7.284347589999999 hiiqr: 7.673627010000001 min: 7.2216315 median: 7.5200143 max: 7.6921115
27083 -- loiqr: 3.73763154 hiiqr: 5.63019127 min: 3.6923237 median: 4.1424456 max: 5.7060308
18061 -- loiqr: 2.1942065 hii

31033 -- loiqr: 2.75344341 hiiqr: 3.00400606 min: 2.7134986 median: 2.9123571 max: 3.0169537
45053 -- loiqr: 2.6394246000000003 hiiqr: 2.91611738 min: 2.600287 median: 2.696867 max: 2.9506707
51085 -- loiqr: 5.92759389 hiiqr: 8.49368336 min: 5.9101567 median: 6.113234 max: 8.5303259
47045 -- loiqr: 1.35539626 hiiqr: 1.45687897 min: 1.3174536 median: 1.400364 max: 1.4702772
39165 -- loiqr: 6.44068073 hiiqr: 6.85161224 min: 6.3416367 median: 6.6318488 max: 6.8919482
48161 -- loiqr: 2.07544671 hiiqr: 2.31158068 min: 1.9963225 median: 2.2027304 max: 2.3120251
48147 -- loiqr: 3.5685060600000003 hiiqr: 3.89262794 min: 3.4130571 median: 3.7065189 max: 3.9417961
48205 -- loiqr: 0.22466861 hiiqr: 0.76653528 min: 0.22466861 median: 0.2506837 max: 0.77485055
48263 -- loiqr: 1.013745674 hiiqr: 1.8610432100000003 min: 0.85910654 median: 1.609658 max: 2.0120723
48285 -- loiqr: 2.75988489 hiiqr: 3.59133116 min: 2.7101336 median: 2.9287066 max: 3.6756473
35055 -- loiqr: 8.42891723 hiiqr: 9.03413507999

37037 -- loiqr: 3.43141693 hiiqr: 3.57587354 min: 3.4177005 median: 3.4992154 max: 3.585794
50021 -- loiqr: 5.07926328 hiiqr: 5.4402399 min: 5.0707254 median: 5.1968732 max: 5.470644
12019 -- loiqr: 6.85790218 hiiqr: 7.87441177 min: 6.7581515 median: 7.0068026 max: 7.9790769
26003 -- loiqr: 2.47388456 hiiqr: 3.59907342 min: 2.2991443 median: 3.4978623 max: 3.6792331
6025 -- loiqr: 1.97152636 hiiqr: 2.18002848 min: 1.9332503 median: 2.0458214 max: 2.2467713
13221 -- loiqr: 1.78862704 hiiqr: 1.97121335 min: 1.7395798 median: 1.8773682 max: 1.9853258
21023 -- loiqr: 1.3616213 hiiqr: 1.84310321 min: 1.3616213 median: 1.5323855 max: 1.8816775
47181 -- loiqr: 1.1356768000000002 hiiqr: 1.8646780200000002 min: 1.1287839 median: 1.7151653 max: 1.9061477
27037 -- loiqr: 8.59783551 hiiqr: 9.37630904 min: 8.5717344 median: 8.8114986 max: 9.5358334
51037 -- loiqr: 1.26610822 hiiqr: 1.6349188399999999 min: 1.2349657 median: 1.5328084 max: 1.7217848
13297 -- loiqr: 8.07579302 hiiqr: 9.51019841 min: 8

54075 -- loiqr: 2.35035592 hiiqr: 2.9515292 min: 2.3117287 median: 2.4819174 max: 3.0294287
49051 -- loiqr: 12.2451497 hiiqr: 13.540207800000001 min: 12.13254 median: 12.716049 max: 13.726153
8067 -- loiqr: 10.3278248 hiiqr: 10.9905129 min: 10.178416 median: 10.771489 max: 11.033839
18031 -- loiqr: 2.9812605 hiiqr: 4.46258894 min: 2.9712396 median: 4.0755467 max: 4.5031672
26085 -- loiqr: 1.1511663 hiiqr: 1.65351943 min: 1.1511663 median: 1.2819208 max: 1.7091424
20055 -- loiqr: 1.6478069899999999 hiiqr: 1.9586579000000002 min: 1.6286262 median: 1.7688308 max: 1.9692284
29157 -- loiqr: 1.7058742 hiiqr: 2.04701417 min: 1.6714816 median: 1.9056076 max: 2.0789456
21075 -- loiqr: 1.524645 hiiqr: 1.8363562 min: 1.524645 median: 1.6317992 max: 1.8549052
54085 -- loiqr: 1.1558294900000001 hiiqr: 1.48415722 min: 1.1445101 median: 1.238274 max: 1.5209125
29035 -- loiqr: 1.06817316 hiiqr: 1.3347458 min: 1.0258602 median: 1.1540928 max: 1.3559322
46033 -- loiqr: 3.54481246 hiiqr: 4.39516684000000

37155 -- loiqr: 0.996384635 hiiqr: 1.1750346699999998 min: 0.98453873 median: 1.0252947 max: 1.1982296
44007 -- loiqr: 4.97480365 hiiqr: 5.32477236 min: 4.9690747 median: 5.1543489 max: 5.3537889
39035 -- loiqr: 6.38521977 hiiqr: 7.0825033 min: 6.3427143 median: 6.5948429 max: 7.1046224
35039 -- loiqr: 2.45818615 hiiqr: 2.6816481100000003 min: 2.4342258 median: 2.5963624 max: 2.6864676
18153 -- loiqr: 1.52245607 hiiqr: 2.38785327 min: 1.3431672 median: 2.0897136 max: 2.4774504
46081 -- loiqr: 6.49093443 hiiqr: 8.76750361 min: 6.4581523 median: 7.1425166 max: 8.7922564
24019 -- loiqr: 2.66413998 hiiqr: 3.0225511 min: 2.6530855 median: 2.7399423 max: 3.0658429
48191 -- loiqr: 1.0961214 hiiqr: 1.6075949999999999 min: 1.0539629 median: 1.4090521 max: 1.6455696
39069 -- loiqr: 2.1381896 hiiqr: 2.34257661 min: 2.1333847 median: 2.2418089 max: 2.6857827
39151 -- loiqr: 4.11226012 hiiqr: 4.359549840000001 min: 4.0971999 median: 4.177423 max: 4.3949914
42073 -- loiqr: 2.99309598 hiiqr: 3.452570

22061 -- loiqr: 2.52616335 hiiqr: 2.8446354200000004 min: 2.4972312 median: 2.6173165 max: 2.8573704
12067 -- loiqr: 0.91583079 hiiqr: 1.16849019 min: 0.8898614 median: 0.98851579 max: 1.2545587
8083 -- loiqr: 4.5967925 hiiqr: 5.0855364199999995 min: 4.4812737 median: 4.8131762 max: 5.1130776
37075 -- loiqr: 2.26390125 hiiqr: 2.5014792900000002 min: 2.1210783 median: 2.396354 max: 2.5295858
21103 -- loiqr: 2.53072403 hiiqr: 3.0799801 min: 2.4997957 median: 2.6825633 max: 3.1296573
32033 -- loiqr: 1.54642492 hiiqr: 1.73564406 min: 1.5344603 median: 1.6124837 max: 1.7802254
6013 -- loiqr: 8.729751239999999 hiiqr: 9.25418084 min: 8.6820469 median: 9.0297737 max: 9.2794628
40051 -- loiqr: 2.0259837800000002 hiiqr: 2.2780622800000003 min: 1.9951073 median: 2.1843419 max: 2.2966576
38003 -- loiqr: 2.0900468599999997 hiiqr: 2.36236684 min: 2.0617368 median: 2.2690625 max: 2.3854289
28137 -- loiqr: 1.4749668 hiiqr: 1.67172002 min: 1.4658055 median: 1.5946479 max: 1.6807877
13231 -- loiqr: 2.86

5119 -- loiqr: 5.93299163 hiiqr: 6.19339742 min: 5.881084 median: 6.0312605 max: 6.2127175
48207 -- loiqr: 0.57557023 hiiqr: 0.83780879 min: 0.49710023 median: 0.68463844 max: 0.83780879
41057 -- loiqr: 4.14069201 hiiqr: 4.538717279999999 min: 4.1018915 median: 4.3490605 max: 4.5904121
26075 -- loiqr: 3.0386727099999997 hiiqr: 3.23577494 min: 3.0276442 median: 3.1290753 max: 3.2778041
1055 -- loiqr: 2.24701831 hiiqr: 2.7280282 min: 2.2291651 median: 2.3666215 max: 2.7364635
17053 -- loiqr: 2.05695898 hiiqr: 2.23182714 min: 2.0384278 median: 2.1305311 max: 2.3040586
47013 -- loiqr: 1.2953777 hiiqr: 1.56843563 min: 1.2858993 median: 1.4315577 max: 1.5775857
21203 -- loiqr: 0.738987687 hiiqr: 0.826086983 min: 0.71761203 median: 0.78565979 max: 0.84668189
26149 -- loiqr: 2.5328387 hiiqr: 2.8288335 min: 2.5245473 median: 2.6630256 max: 2.8969324
6019 -- loiqr: 3.69901323 hiiqr: 3.97008813 min: 3.6515334 median: 3.766459 max: 3.9791877
29071 -- loiqr: 3.7095653000000004 hiiqr: 3.99486605 min

18107 -- loiqr: 2.0164177800000003 hiiqr: 2.22957381 min: 2.0133774 median: 2.086699 max: 2.366924
42089 -- loiqr: 4.898294389999999 hiiqr: 5.0874516299999994 min: 4.8199162 median: 4.9488854 max: 5.2292309
24017 -- loiqr: 5.6973449 hiiqr: 6.37158608 min: 5.5990415 median: 6.1647282 max: 6.4074459
17187 -- loiqr: 1.29156347 hiiqr: 1.59040263 min: 1.2845407 median: 1.4488424 max: 1.6521611
18141 -- loiqr: 4.31076571 hiiqr: 4.74410065 min: 4.2820077 median: 4.4591022 max: 4.7759824
36011 -- loiqr: 2.64863486 hiiqr: 2.98163584 min: 2.6293304 median: 2.6935647 max: 3.0012372
48241 -- loiqr: 1.9572474199999998 hiiqr: 2.17988689 min: 1.8701338 median: 2.1217301 max: 2.1866646
20161 -- loiqr: 5.37417046 hiiqr: 5.6067881 min: 5.2346425 median: 5.4962955 max: 5.6307073
28115 -- loiqr: 0.960403285 hiiqr: 1.10311948 min: 0.9565255 median: 1.0055964 max: 1.1107801
5117 -- loiqr: 0.6466584409999999 hiiqr: 0.939539392 min: 0.61881191 median: 0.74866313 max: 0.95411175
12083 -- loiqr: 5.0280781600000

51045 -- loiqr: 1.55861863 hiiqr: 2.05140405 min: 1.5133601 median: 1.8291216 max: 2.1656353
13089 -- loiqr: 12.3980462 hiiqr: 13.3782025 min: 12.116005 median: 12.780334 max: 13.395468
45091 -- loiqr: 6.7932491399999995 hiiqr: 7.49636145 min: 6.7438297 median: 6.9595232 max: 7.5103879
2230 -- loiqr: 6.3392859 hiiqr: 8.4474888 min: 6.25 median: 7.5949368 max: 8.4474888
18023 -- loiqr: 1.8820178 hiiqr: 2.10721352 min: 1.8641589 median: 1.9611137 max: 2.1307955
27165 -- loiqr: 1.59865169 hiiqr: 2.06942437 min: 1.504755 median: 1.942802 max: 2.0924044
13309 -- loiqr: 0.66828674 hiiqr: 0.794399645 min: 0.66717207 median: 0.69866341 max: 0.85222948
20141 -- loiqr: 1.3953179 hiiqr: 2.3252445500000003 min: 1.2461059 median: 1.4783527 max: 2.3904383
29145 -- loiqr: 1.56352012 hiiqr: 1.76089993 min: 1.5551324 median: 1.6276751 max: 1.7696921
45035 -- loiqr: 4.92629851 hiiqr: 5.407920600000001 min: 4.8143387 median: 5.1662593 max: 5.6495533
51760 -- loiqr: 7.14763272 hiiqr: 7.87676826 min: 7.075

55067 -- loiqr: 2.04282295 hiiqr: 23.0558081 min: 2.0247104 median: 2.1928973 max: 25.997801
46101 -- loiqr: 1.92750605 hiiqr: 2.5010335 min: 1.7879162 median: 2.4390244 max: 2.5256674
55109 -- loiqr: 5.5726925000000005 hiiqr: 5.70427786 min: 5.5089407 median: 5.6326632 max: 5.7195916
30051 -- loiqr: 1.0149573 hiiqr: 1.32563256 min: 1.0149573 median: 1.210121 max: 1.430143
17093 -- loiqr: 5.57379006 hiiqr: 5.82852858 min: 5.4993477 median: 5.6473269 max: 5.8660054
19109 -- loiqr: 1.4811992200000001 hiiqr: 1.7560975 min: 1.4266419 median: 1.5832263 max: 1.7971759
40027 -- loiqr: 5.24562859 hiiqr: 5.898034920000001 min: 5.2153511 median: 5.3486257 max: 6.0030127
8103 -- loiqr: 2.72702084 hiiqr: 3.4566881 min: 2.7179384 median: 3.2557182 max: 3.4939013
13247 -- loiqr: 8.2061128 hiiqr: 10.357411200000001 min: 8.048399 median: 9.302947 max: 10.466195
34021 -- loiqr: 8.499056340000001 hiiqr: 10.832245599999998 min: 8.3182926 median: 8.6865387 max: 10.838427
21175 -- loiqr: 0.462385061 hiiqr:

27071 -- loiqr: 1.8481705800000001 hiiqr: 2.16485681 min: 1.8229674 median: 1.9399493 max: 2.2013819
1085 -- loiqr: 1.0258016 hiiqr: 1.4472336 min: 1.0118897 median: 1.3319672 max: 1.4509652
37107 -- loiqr: 1.87922574 hiiqr: 2.2071577 min: 1.8551117 median: 2.0284252 max: 2.2229869
37023 -- loiqr: 2.02281851 hiiqr: 2.20344846 min: 2.0177016 median: 2.161154 max: 2.2197967
39105 -- loiqr: 1.25386829 hiiqr: 1.38579465 min: 1.2488948 median: 1.3096817 max: 1.4006838
48415 -- loiqr: 1.02186972 hiiqr: 1.66575037 min: 0.99337751 median: 1.4607712 max: 1.7199403
1111 -- loiqr: 1.44197419 hiiqr: 4.9313540300000005 min: 1.4158185 median: 2.2195933 max: 5.3465791
20125 -- loiqr: 1.9200153400000002 hiiqr: 2.65119682 min: 1.8966203 median: 2.0805612 max: 2.657151
17125 -- loiqr: 1.00167943 hiiqr: 1.2237272 min: 0.95211941 median: 1.0711343 max: 1.2424101
41011 -- loiqr: 2.9950145 hiiqr: 3.40879156 min: 2.9807897 median: 3.3372512 max: 3.4299784
48337 -- loiqr: 3.79077538 hiiqr: 4.35685186 min: 3.5

In [124]:
train_imputed.query("cfips == '12045'")

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,date,outlier
13221,12045_2019-08-01,12045,Gulf County,Florida,2019-08-01,4.408387,595.0,2019-08-01,False
13222,12045_2019-09-01,12045,Gulf County,Florida,2019-09-01,4.489887,606.0,2019-09-01,False
13223,12045_2019-10-01,12045,Gulf County,Florida,2019-10-01,4.497296,607.0,2019-10-01,False
13224,12045_2019-11-01,12045,Gulf County,Florida,2019-11-01,4.519523,610.0,2019-11-01,False
13225,12045_2019-12-01,12045,Gulf County,Florida,2019-12-01,4.519523,610.0,2019-12-01,False
13226,12045_2020-01-01,12045,Gulf County,Florida,2020-01-01,4.552227,611.0,2020-01-01,False
13227,12045_2020-02-01,12045,Gulf County,Florida,2020-02-01,4.500074,604.0,2020-02-01,False
13228,12045_2020-03-01,12045,Gulf County,Florida,2020-03-01,4.418119,593.0,2020-03-01,False
13229,12045_2020-04-01,12045,Gulf County,Florida,2020-04-01,4.522426,607.0,2020-04-01,False
13230,12045_2020-05-01,12045,Gulf County,Florida,2020-05-01,18.69319,2509.0,2020-05-01,False


## EDA

In [None]:
cfips = "17031"
X = train.query(f"cfips == '{cfips}'")
print(X.shape)
plt.figure(figsize=(16,8))
plt.plot(X['date'], X['microbusiness_density'])
plt.show()

plt.figure(figsize=(16,8))
plt.plot(X['date'], X['active'])
plt.show()

X

In [None]:
X[['microbusiness_density', 'active']].corr(method='spearman')

In [None]:
sample_submissions = pd.read_csv("sample_submission.csv")
sample_submissions['cfips'] = sample_submissions['row_id'].apply(lambda s: s.split("_")[0]).apply(str)
sample_submissions['date'] = sample_submissions['row_id'].apply(lambda s: s.split("_")[1]).apply(pd.to_datetime)
sample_submissions = sample_submissions.sort_values("row_id", ascending=True)
sample_submissions

In [None]:
predictions_date = sorted(list(set(sample_submissions['date'])))
predictions_date

In [None]:
last_train_date = pd.to_datetime("2022-10-01")
last_train_date

## Naive baseline #1
* take the last observed value and forecast it forward for the entire forecast period

In [None]:
cfips_list = list(set(train['cfips']))
print(len(cfips_list))
cfips_list[:10]

In [None]:
from datetime import date
from dateutil.relativedelta import relativedelta

In [None]:
predictions_df = pd.DataFrame({'date':predictions_date})
predictions_df

In [None]:
h = 8
res = []
last_dates = []
for cfip in cfips_list:
    X = train.query(f"cfips == '{cfip}'").sort_values('date', ascending=True)
    last = X.iloc[-1,]
    last_y = last['microbusiness_density']
    #print(cfip, last_y)
    _predictions_df = predictions_df.copy()
    _predictions_df['cfips'] = cfip
    _predictions_df['microbusiness_density'] = [last_y for _ in range(h)]
    res.append(_predictions_df)
    
submissions_df = pd.concat(res, axis=0)
submissions_df

In [None]:
submissions_df['microbusiness_density'] = submissions_df['microbusiness_density'] + .01

In [None]:
np.quantile(submissions_df['microbusiness_density'], q=[.01, .05, .5, .95, .99])

In [None]:
plt.hist(submissions_df['microbusiness_density'])
plt.show()

plt.hist(np.log(submissions_df['microbusiness_density']))
plt.show()

In [None]:
x = train[['cfips', 'microbusiness_density']].groupby('cfips').mean().reset_index()
plt.hist(x['microbusiness_density'])
plt.show()

plt.hist(np.log(x['microbusiness_density']))
plt.show()

In [None]:
plt.hist(np.log(submissions_df['microbusiness_density']), color='red')
plt.hist(np.log(x['microbusiness_density']))


In [None]:
d = pd.to_datetime("2022-11-01")
'-'.join(str(x).zfill(2) for x in [d.year, d.month, d.day])

In [None]:
submissions_df['row_id'] = submissions_df['cfips'] + '_' + submissions_df['date'].apply(lambda d: '-'.join(str(x).zfill(2) for x in [d.year, d.month, d.day]))
submissions_df

In [None]:
submissions_df[['row_id', 'microbusiness_density']].to_csv("submission-naive-01.csv", index=False)

## Naive baseline #2
* AR(1)