<h1> <u> Dataset Sources </u> </h1>

<ul> 
  <li> <a href="https://catalog.data.gov/dataset/farmers-markets-geographic-data"> Farmer Market Data </a> (Last Updated: Feb 4, 2018) </li>
  <li> <a href="https://catalog.data.gov/dataset/zip-code-data"> Zip Code Data </a> (Last Updated: Nov 7, 2017) </li>
</ul>

In [2]:
# Computational and Visualisation Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *

# Pyspark Packages
from pyspark.sql import functions as F
from pyspark.sql.functions import col, desc
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
us_tax = spark.read.csv('/databricks-datasets/data.gov/irs_zip_code_data/data-001/2013_soi_zipcode_agi.csv', header=True)
farmer_market = spark.read.csv('/databricks-datasets/data.gov/farmers_markets_geographic_data/data-001/market_data.csv', header=True)

In [4]:
display (us_tax.sample(False, 0.3), 100)

STATEFIPS,STATE,zipcode,agi_stub,N1,MARS1,MARS2,MARS4,PREP,N2,NUMDEP,A00100,N02650,A02650,N00200,A00200,N00300,A00300,N00600,A00600,N00650,A00650,N00700,A00700,N00900,A00900,N01000,A01000,N01400,A01400,N01700,A01700,SCHF,N02300,A02300,N02500,A02500,N26270,A26270,N02900,A02900,N03220,A03220,N03300,A03300,N03270,A03270,N03150,A03150,N03210,A03210,N03230,A03230,N03240,A03240,N04470,A04470,A00101,N18425,A18425,N18450,A18450,N18500,A18500,N18300,A18300,N19300,A19300,N19700,A19700,N04800,A04800,N05800,A05800,N09600,A09600,N07100,A07100,N07300,A07300,N07180,A07180,N07230,A07230,N07240,A07240,N07220,A07220,N07260,A07260,N09400,A09400,N10600,A10600,N59660,A59660,N59720,A59720,N11070,A11070,N10960,A10960,N06500,A06500,N10300,A10300,N85330,A85330,N85300,A85300,N11901,A11901,N11902,A11902
1,AL,0,1,870380.0,488030.0,122290.0,247000.0,500770.0,1452580.0,571240.0,11255896.0,870380.0,11444868.0,700700.0,8889326.0,103290.0,77952.0,46870.0,75071.0,40890.0,47416.0,15650.0,6538.0,146240.0,824487.0,37970.0,23583.0,38400.0,221790.0,111060.0,1066291.0,8800.0,49720.0,187559.0,35370.0,62791.0,8980.0,10323.0,155190.0,186574.0,3300.0,747.0,140.0,487.0,8950.0,33584.0,2540.0,6212.0,20760.0,17533.0,5900.0,16956.0,30.0,2.0,57090.0,794815.0,884758.0,23440.0,22991.0,25940.0,20686.0,29140.0,25144.0,54970.0,84317.0,27140.0,151005.0,42500.0,111909.0,338560.0,1874627.0,333040.0,196535.0,0.0,0.0,108740.0,37220.0,2400.0,56.0,8850.0,2899.0,33010.0,16969.0,32080.0,5457.0,33530.0,10160.0,2650.0,677.0,117420.0,152943.0,807980.0,2277816.0,399710.0,1174659.0,371280.0,1057938.0,258200.0,327115.0,80350.0,76412.0,255750.0,159189.0,371450.0,318777.0,0.0,0.0,0.0,0.0,59580.0,44367.0,767170.0,2005593.0
1,AL,35004,3,910.0,290.0,490.0,110.0,450.0,2020.0,620.0,55761.0,910.0,56170.0,840.0,47285.0,250.0,185.0,120.0,380.0,100.0,283.0,310.0,178.0,120.0,584.0,80.0,139.0,70.0,820.0,190.0,3420.0,0.0,40.0,152.0,150.0,2058.0,30.0,492.0,260.0,409.0,40.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,138.0,0.0,0.0,0.0,0.0,430.0,6464.0,26501.0,390.0,1022.0,40.0,60.0,370.0,231.0,430.0,1396.0,360.0,2265.0,350.0,1137.0,910.0,36244.0,900.0,5051.0,0.0,0.0,410.0,629.0,40.0,2.0,90.0,47.0,80.0,98.0,60.0,8.0,300.0,448.0,30.0,9.0,70.0,124.0,890.0,6261.0,0.0,0.0,0.0,0.0,40.0,35.0,60.0,52.0,870.0,4422.0,880.0,4583.0,0.0,0.0,0.0,0.0,180.0,327.0,720.0,1974.0
1,AL,35005,3,470.0,140.0,230.0,100.0,290.0,1040.0,340.0,28531.0,470.0,28664.0,440.0,23743.0,110.0,95.0,50.0,118.0,40.0,106.0,150.0,92.0,50.0,233.0,20.0,22.0,40.0,543.0,110.0,2237.0,0.0,30.0,87.0,100.0,1267.0,50.0,918.0,100.0,132.0,30.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,39.0,0.0,0.0,0.0,0.0,250.0,4129.0,14931.0,210.0,542.0,40.0,76.0,210.0,151.0,250.0,848.0,200.0,1121.0,220.0,1073.0,470.0,17954.0,470.0,2472.0,0.0,0.0,190.0,286.0,0.0,0.0,30.0,19.0,40.0,60.0,30.0,3.0,130.0,185.0,30.0,19.0,40.0,71.0,470.0,3081.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,35.0,450.0,2186.0,460.0,2283.0,0.0,0.0,0.0,0.0,110.0,210.0,360.0,1002.0
1,AL,35005,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AL,35006,1,440.0,250.0,110.0,70.0,270.0,700.0,200.0,5410.0,440.0,5515.0,330.0,3600.0,60.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,543.0,0.0,0.0,20.0,148.0,90.0,920.0,0.0,30.0,155.0,30.0,52.0,0.0,0.0,70.0,97.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0,924.0,160.0,97.0,0.0,0.0,40.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,83.0,410.0,935.0,180.0,422.0,160.0,363.0,90.0,107.0,0.0,0.0,140.0,85.0,190.0,169.0,0.0,0.0,0.0,0.0,30.0,22.0,390.0,784.0
1,AL,35007,1,4300.0,2720.0,620.0,870.0,1820.0,6400.0,2360.0,51595.0,4300.0,52779.0,3370.0,40128.0,460.0,260.0,260.0,374.0,220.0,229.0,120.0,57.0,810.0,5331.0,200.0,18.0,200.0,1445.0,430.0,3740.0,0.0,140.0,705.0,170.0,295.0,70.0,253.0,880.0,1184.0,0.0,0.0,0.0,0.0,80.0,274.0,0.0,0.0,150.0,130.0,50.0,151.0,0.0,0.0,400.0,5547.0,5883.0,170.0,168.0,180.0,130.0,220.0,227.0,380.0,618.0,250.0,1415.0,300.0,673.0,1730.0,9714.0,1710.0,1026.0,0.0,0.0,460.0,167.0,0.0,0.0,40.0,13.0,140.0,80.0,140.0,20.0,140.0,51.0,0.0,0.0,660.0,910.0,3890.0,8540.0,1390.0,3694.0,1240.0,3185.0,1030.0,1303.0,270.0,257.0,1400.0,858.0,2010.0,1809.0,0.0,0.0,0.0,0.0,450.0,334.0,3590.0,7113.0
1,AL,35007,2,2680.0,1150.0,830.0,590.0,1190.0,5600.0,2100.0,97580.0,2680.0,98951.0,2360.0,80819.0,520.0,306.0,250.0,362.0,220.0,244.0,430.0,206.0,410.0,3335.0,190.0,218.0,240.0,2420.0,470.0,7516.0,0.0,80.0,296.0,420.0,2870.0,80.0,143.0,670.0,1371.0,90.0,21.0,0.0,0.0,70.0,317.0,60.0,173.0,290.0,306.0,20.0,52.0,0.0,0.0,820.0,11699.0,31484.0,630.0,1037.0,160.0,201.0,610.0,533.0,810.0,1925.0,620.0,3650.0,650.0,2136.0,2540.0,48205.0,2530.0,5901.0,0.0,0.0,1150.0,1082.0,30.0,1.0,150.0,87.0,250.0,256.0,330.0,58.0,690.0,652.0,70.0,26.0,280.0,718.0,2610.0,10926.0,610.0,1214.0,500.0,1015.0,570.0,881.0,190.0,177.0,2000.0,4820.0,2150.0,5615.0,0.0,0.0,0.0,0.0,380.0,673.0,2260.0,5955.0
1,AL,35010,4,530.0,50.0,460.0,30.0,330.0,1420.0,450.0,45396.0,530.0,45809.0,440.0,32427.0,230.0,288.0,130.0,649.0,120.0,435.0,160.0,124.0,100.0,1371.0,110.0,795.0,110.0,1559.0,190.0,5128.0,20.0,30.0,149.0,150.0,3041.0,30.0,620.0,160.0,413.0,40.0,12.0,0.0,0.0,20.0,102.0,0.0,0.0,60.0,49.0,0.0,0.0,0.0,0.0,250.0,4923.0,21633.0,200.0,656.0,50.0,65.0,230.0,214.0,250.0,990.0,200.0,1507.0,230.0,1355.0,520.0,31724.0,520.0,4407.0,0.0,0.0,260.0,464.0,50.0,17.0,60.0,41.0,70.0,91.0,0.0,0.0,170.0,289.0,20.0,8.0,70.0,258.0,520.0,5023.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,42.0,520.0,3943.0,520.0,4264.0,0.0,0.0,0.0,0.0,140.0,424.0,360.0,1133.0
1,AL,35010,6,150.0,0.0,140.0,0.0,130.0,400.0,110.0,81091.0,150.0,82076.0,130.0,31697.0,110.0,2230.0,100.0,4260.0,100.0,3404.0,80.0,242.0,50.0,4607.0,110.0,6124.0,40.0,2706.0,50.0,3133.0,0.0,0.0,0.0,50.0,1134.0,70.0,10805.0,70.0,986.0,0.0,0.0,0.0,0.0,30.0,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,7983.0,71472.0,130.0,2043.0,0.0,0.0,140.0,653.0,150.0,2794.0,120.0,2057.0,140.0,2243.0,150.0,66351.0,150.0,18510.0,50.0,484.0,70.0,162.0,40.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,333.0,150.0,22616.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,18791.0,150.0,19574.0,50.0,114.0,70.0,566.0,80.0,2574.0,40.0,854.0
1,AL,35014,2,480.0,140.0,120.0,210.0,280.0,1000.0,400.0,17141.0,480.0,17279.0,440.0,14834.0,70.0,38.0,30.0,44.0,0.0,0.0,100.0,64.0,40.0,67.0,30.0,27.0,30.0,300.0,90.0,1387.0,0.0,40.0,101.0,80.0,446.0,0.0,0.0,80.0,137.0,30.0,7.0,0.0,0.0,0.0,0.0,40.0,132.0,40.0,29.0,0.0,0.0,0.0,0.0,150.0,2440.0,5574.0,130.0,191.0,50.0,50.0,100.0,48.0,150.0,285.0,80.0,433.0,130.0,463.0,470.0,7871.0,470.0,940.0,0.0,0.0,240.0,228.0,0.0,0.0,50.0,28.0,60.0,72.0,90.0,18.0,150.0,133.0,0.0,0.0,40.0,79.0,480.0,1962.0,160.0,291.0,140.0,256.0,110.0,130.0,30.0,33.0,350.0,712.0,370.0,785.0,0.0,0.0,0.0,0.0,60.0,76.0,420.0,1247.0


In [5]:
display (farmer_market.sample(False, 0.3), 100)

FMID,MarketName,Website,Facebook,Twitter,Youtube,OtherMedia,street,city,County,State,zip,Season1Date,Season1Time,Season2Date,Season2Time,Season3Date,Season3Time,Season4Date,Season4Time,x,y,Location,Credit,WIC,WICcash,SFMNP,SNAP,Organic,Bakedgoods,Cheese,Crafts,Flowers,Eggs,Seafood,Herbs,Vegetables,Honey,Jams,Maple,Meat,Nursery,Nuts,Plants,Poultry,Prepared,Soap,Trees,Wine,Coffee,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime
1010376,18th Street Farmers Market,,https://www.facebook.com/groups/631614906915563/,,,,825 18th Street,Charleston,Coles,Illinois,61920,06/07/2014 to 10/04/2014,Sat: 8:00 AM-12:00 PM;,,,,,,,-88.163254,39.490737,Local government building grounds,N,Y,N,Y,Y,Y,Y,N,Y,Y,N,N,Y,Y,Y,Y,N,N,N,N,Y,N,Y,Y,Y,N,N,N,Y,N,N,N,N,N,N,8/3/2014 12:22:27 PM
1000709,20th Annual Highlands Business Partnership Farmers Market,http://www.highlandsnj.com,https://www.facebook.com/pages/Highlands-Business-Partnership-Highlands-NJ/165486543524770,,,,71 Waterwitch Avenue,Highlands,Monmouth,New Jersey,07732,06/28/2014 to 11/01/2014,Sat: 8:30 AM-2:00 PM;,,,,,,,-73.994358,40.404837,Local government building grounds,Y,Y,Y,Y,Y,-,Y,Y,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,Y,Y,N,N,Y,N,N,N,N,Y,N,N,N,N,N,N,4/9/2014 10:45:36 AM
1010873,21st Street Farmers Market,,,,,,SW 21st and Oakley,Topeka,Shawnee,Kansas,66604,06/05/2015 to 10/30/2015,Fri: 7:30 AM-12:00 PM;,,,,,,,-95.714514,39.028184,,Y,N,N,Y,N,Y,Y,N,Y,Y,Y,N,Y,Y,Y,Y,N,N,N,Y,Y,N,Y,Y,N,N,N,N,Y,Y,N,N,N,N,N,3/2/2015 10:46:33 AM
1010966,26th and Allegheny,http://thefoodtrust.org/farmers-markets/market/26th-Allegheny,,,,,26th Street and W Allegheny Avenue,Philadelphia,Philadelphia,Pennsylvania,19129,06/01/2015 to 11/15/2015,Wed: 1:00 PM-5:00 PM;,,,,,,,-75.1722883,40.0040946,,Y,Y,N,Y,Y,N,N,N,N,N,N,N,N,Y,N,N,N,N,N,N,N,N,N,N,N,N,N,N,Y,N,N,N,N,N,N,3/24/2015 3:17:20 PM
1010965,4th and Lehigh Farmers' Market,http://thefoodtrust.org/farmers-markets/market/fairhill-square,,,,,N 4th Street and W. Lehigh Avenue,Philadelphia,Philadelphia,Pennsylvania,19133,06/01/2015 to 11/22/2015,Tue: 1:00 PM-5:00 PM;,,,,,,,-75.138654,39.991748,,Y,Y,N,Y,Y,N,N,N,N,N,N,N,N,Y,N,N,N,N,N,N,N,N,N,N,N,N,N,N,Y,N,N,N,N,N,N,3/24/2015 3:11:40 PM
1000060,57th Street Greenmarket,http://www.grownyc.org,https://www.facebook.com/ManhattanGreenmarkets,,,,W 57 St. & 9 Ave,New York,New York,New York,10019,04/20/2013 to 12/21/2013,Wed: 8:00 AM-5:00 PM;Sat: 8:00 AM-5:00 PM;,,,,,,,-73.985839,40.768159,Other,Y,Y,Y,Y,Y,N,Y,Y,N,N,Y,N,N,Y,N,N,N,Y,N,N,N,Y,N,N,N,N,N,N,Y,N,N,N,N,N,N,7/24/2013 11:06:00 AM
1005306,58th and Chester Farmers' Market,http://thefoodtrust.org/farmers-markets/market/58th-chester,,,,,58th Street and Chester Avenue,Philadelphia,Philadelphia,Pennsylvania,19143,06/01/2015 to 11/01/2015,Wed: 2:00 PM-6:00 PM;,,,,,,,-75.228303,39.935721,,Y,Y,N,Y,Y,N,N,N,N,N,N,N,N,Y,N,N,N,N,N,N,N,N,N,N,N,N,N,N,Y,N,N,N,N,N,N,3/24/2015 2:35:04 PM
1002678,61st Street Farmers Market,http://www.experimentalstation.org/farmers-market,https://www.facebook.com/61stStreetFarmersMarket,,,,6100 S Blackstone,Chicago,Cook,Illinois,60637,01/25/2014 to 04/26/2014,Sat: 9:00 AM-2:00 PM;,05/17/2014 to 10/25/2014,Sat: 9:00 AM-2:00 PM;,11/01/2014 to 12/13/2014,Sat: 9:00 AM-2:00 PM;,,,-87.590793,41.784342,Closed-off public street,Y,Y,Y,Y,Y,Y,Y,Y,N,Y,Y,N,Y,Y,Y,Y,Y,Y,N,Y,Y,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,N,4/23/2014 2:04:20 PM
1000061,79th Street Greenmarket,http://www.grownyc.org,https://www.facebook.com/ManhattanGreenmarkets,,,,Columbus - W 78 & 81 Sts.,New York,New York,New York,10024,01/01/2013 to 12/31/2013,Sun: 8:00 AM-5:00 PM;,,,,,,,-73.9757,40.7818,Other,Y,Y,Y,Y,Y,N,Y,Y,N,Y,Y,Y,Y,Y,Y,Y,Y,Y,N,N,Y,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,N,7/24/2013 10:54:51 AM
1008391,84 west farmers market (Dothan),,,,,,,Dothan,Houston,Alabama,,06/06/2013 to 08/08/2013,Thu: 3:00 PM-6:00 PM;,,,,,,,-85.449944,31.232252,Private business parking lot,Y,N,N,Y,N,N,Y,N,Y,N,N,N,N,Y,Y,Y,N,N,N,N,N,N,Y,Y,N,N,N,N,Y,N,N,N,N,N,N,3/15/2014 9:29:24 AM


In [6]:
display (us_tax.describe())

summary,STATEFIPS,STATE,zipcode,agi_stub,N1,MARS1,MARS2,MARS4,PREP,N2,NUMDEP,A00100,N02650,A02650,N00200,A00200,N00300,A00300,N00600,A00600,N00650,A00650,N00700,A00700,N00900,A00900,N01000,A01000,N01400,A01400,N01700,A01700,SCHF,N02300,A02300,N02500,A02500,N26270,A26270,N02900,A02900,N03220,A03220,N03300,A03300,N03270,A03270,N03150,A03150,N03210,A03210,N03230,A03230,N03240,A03240,N04470,A04470,A00101,N18425,A18425,N18450,A18450,N18500,A18500,N18300,A18300,N19300,A19300,N19700,A19700,N04800,A04800,N05800,A05800,N09600,A09600,N07100,A07100,N07300,A07300,N07180,A07180,N07230,A07230,N07240,A07240,N07220,A07220,N07260,A07260,N09400,A09400,N10600,A10600,N59660,A59660,N59720,A59720,N11070,A11070,N10960,A10960,N06500,A06500,N10300,A10300,N85330,A85330,N85300,A85300,N11901,A11901,N11902,A11902
count,166740.0,166740,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0,166740.0
mean,29.673299748110832,,48853.6032745592,3.5,1724.3668585822238,797.117608252369,634.7534484826676,260.0802446923354,948.5900803646396,3409.575806645076,1157.908120427012,109623.8864099796,1724.365299268322,111240.51847187238,1444.2636439966416,76348.68934868657,522.4330094758307,992.0911598896486,319.82745591939545,2252.5905961377,293.9652752788773,1625.159517812163,253.96671464555595,317.91415377234017,273.9436847786974,3718.659985606333,275.14891447762983,4792.704677941706,156.1160489384671,2487.5708408300347,323.53118627803764,7410.385498380712,20.062852344968213,112.90602135060573,635.469617368358,217.56561113110232,2874.3542281396185,94.58474271320618,6376.52308984047,438.60207508696175,1584.5677941705649,43.44488425092959,10.974319299508217,9.727779776898164,231.83918675782652,44.427491903562434,301.3600815641118,28.95034184958618,136.19686937747392,133.29980808444284,134.90089960417416,19.66942545280077,43.69257526688257,5.721122706009356,78.61055535564351,532.6807004917836,14033.02644836272,70168.89840470193,388.3840110351445,3553.5931510135542,117.56729039222742,189.4551517332374,453.23755547559074,2078.732757586662,528.713626004558,6000.9269281516135,394.7554276118508,3491.1308504258127,437.86038143216985,2149.861856783016,1313.7518291951542,75191.81502938707,1302.9902243013075,15124.835552356964,45.67434328895286,310.10578145615926,539.978109631762,703.7870337051697,84.255547559074,125.17783375314862,73.48482667626244,39.7027587861341,119.11137099676142,123.94679141177882,84.69773299748111,14.667002518891689,268.0231498140818,323.36886170085165,35.38850905601535,15.941465755067773,219.8847307184839,643.5925872616049,1637.714705529567,18176.12246611491,338.19527407940507,802.207820558954,294.73647595058173,695.8103394506417,241.82409739714527,323.410759265923,119.35228499460236,105.71308624205348,1164.1902962696413,14457.736631881971,1270.9345687897326,15389.845903802328,32.74637159649754,69.89030826436368,35.88557034904642,170.04715125344848,313.5573947463116,1583.3137579465035,1327.395825836632,3881.419395465995
stddev,15.109923412643347,,27140.030935908577,1.707830248905018,36918.28626998589,21282.966256269487,11364.895814177336,7082.007826205272,20656.559708495715,67536.41774568192,24217.127341786672,2328809.88553524,36918.24240918538,2363637.2436273536,29987.272095635224,1518760.3201167283,9559.30034056247,33845.8750503141,5729.308238782326,71878.82791021722,5280.5088103196285,52634.02701514949,5571.719012114495,10025.84896507628,6437.859172781818,85571.67541259594,5047.037867682155,238137.03270070467,2726.1662240913706,48593.30123322559,5743.060099787967,138890.55544887346,358.2314292930416,2716.172749357564,15971.210773545065,3941.9144361761714,55965.799329531015,1848.3841470562893,262209.4492253723,8653.86220400303,34722.94952990776,869.0816963042972,225.39255486676328,297.96239704610593,9619.68098965611,826.2460740316869,6425.441945268583,570.1652905447472,2812.925983483741,2508.938730729165,2565.003005984243,427.4405634862693,978.993863303149,129.95209716505144,3145.5280449416423,10305.313230505171,353996.60573251423,1934787.1835299127,8432.316517908024,151781.55389276138,3500.467362507173,6947.942311077028,8708.580240643496,51673.20634824699,10251.087202066135,198752.6947590216,7715.657882315186,82607.93754770595,8699.603264812433,54627.34644272458,25118.86844143483,1774911.6916273749,24900.758390991232,444929.61046463664,2098.2472378779626,20103.975771794197,10334.93037568484,14684.290358401267,1736.7911709922491,7424.288040349734,1423.1458514481158,815.0769871478175,2444.252813688137,2463.63243291279,2282.769694500292,407.2072407976646,5680.596335290294,6779.557911873222,646.1344567880351,428.25326546713296,5322.450230591623,13671.660367143117,34300.337674010094,475246.4324764587,11803.007463403648,29638.81682593508,10336.238523055232,25534.370614024898,7967.568461728026,10566.621077243424,2831.74089181356,2433.780801408975,21540.604511325862,435502.7867066566,24138.335920347206,457716.0031855273,1623.3267848246978,3905.736504798624,1776.1329144891429,9991.304876111551,5889.894335400429,47162.16356827604,29913.677129721487,78128.40708811424
min,1.0,AK,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,56.0,WY,99999.0,6.0,9990.0,9960.0,9920.0,99410.0,99720.0,9990.0,9990.0,99998.0,9990.0,99999.0,9980.0,99997.0,99430.0,9997.0,99420.0,9994.0,990.0,9999.0,990.0,999.0,9960.0,9998.0,9950.0,9999.0,990.0,9999.0,99740.0,9999.0,9880.0,990.0,9998.0,990.0,9999.0,990.0,9999.0,99310.0,9998.0,9980.0,990.0,990.0,9995.0,9950.0,999.0,9960.0,999.0,99860.0,997.0,9990.0,995.0,980.0,999.0,99910.0,9999.0,9999.0,990.0,9998.0,9980.0,9996.0,99220.0,9999.0,9990.0,9999.0,990.0,9999.0,9970.0,9998.0,99520.0,99999.0,9930.0,9999.0,990.0,999.0,990.0,999.0,99180.0,999.0,9930.0,997.0,990.0,998.0,9980.0,997.0,990.0,999.0,990.0,998.0,9970.0,999.0,9990.0,9999.0,9920.0,9991.0,9990.0,9998.0,9990.0,999.0,990.0,999.0,9960.0,99994.0,993320.0,9999.0,990.0,998.0,990.0,9982.0,9990.0,9997.0,9990.0,9998.0


In [7]:
display (farmer_market.describe())

summary,FMID,MarketName,Website,Facebook,Twitter,Youtube,OtherMedia,street,city,County,State,zip,Season1Date,Season1Time,Season2Date,Season2Time,Season3Date,Season3Time,Season4Date,Season4Time,x,y,Location,Credit,WIC,WICcash,SFMNP,SNAP,Organic,Bakedgoods,Cheese,Crafts,Flowers,Eggs,Seafood,Herbs,Vegetables,Honey,Jams,Maple,Meat,Nursery,Nuts,Plants,Poultry,Prepared,Soap,Trees,Wine,Coffee,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime
count,8518.0,8518,5003,3344,845,134,434,8228,8474,7930,8517,7528.0,5026,5130,384,375,67,65,7,7,8487.0,8487.0,3698,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518,8518
mean,1009019.2689598496,,,,,,,98383.0,,,,47501.76667998402,,,,,,,,,-91.0307326892509,39.17351085931251,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009.936057510405
stddev,57015.38311039544,,,,,,,,,,,30650.047052285277,,,,,,,,,17.52819710985125,5.284537517011239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.4734842504100931
min,1000001.0,"""Badseed """"Funky"""" Farmers' Market""",http:// jasper.agrilife.org/jasper-county-farmers-market/,www.facebook.com/pages/Haddon-Heights-Farmers-Market/219172298144851,no twitter,Buena Vista,http://instagram.com/forsythfarmersmarket,1102 McConnell Road,Dayton,ALAMEDA,Alabama,0.0,01/01/2011 to 12/31/2011,Fri: 10:00 AM-1:00 PM;,01/01/2014 to 04/01/2014,Fri: 11:30 AM-1:00 PM;,01/01/2012 to 05/31/2012,Fri: 9:00 AM-1:00 PM;,01/03/2014 to 05/30/2014,Fri: 9:00 AM-1:00 PM;,-88.174004,17.7099,Closed-off public street,N,N,N,N,N,-,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,1/1/2015 10:12:55 PM
max,2000036.0,wilbraham winter farmers market,https://www.msuextension.org/Richland/,yes,yes,youtube.com/c/4mrmarketOrg,www.yelp.com/biz/battens-farmers-market-davie,Broadway & South Third Streets,woodstock,midland,Wyoming,,Start Date 1/1/13 to End Date 12/31/13,sun:TBD - TBD;,October to March,sun:10:00 AM - 2:00 PM;,September to October,Wed: 8:00 AM-12:00 PM;Sat: 8:00 AM-12:00 PM;,11/07/2015 to 11/28/2015,Thu: 3:00 PM-7:00 PM;,-99.9987,64.86275,Private business parking lot,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,May 30 2010 7:17PM


In [8]:
us_tax_filtered = us_tax\
    .withColumn("zipcode", (col('zipcode')/10).cast('int'))\
    .withColumn("single_returns", col('mars1').cast('int'))\
    .withColumn("joint_returns", col('mars2').cast('int'))\
    .withColumn("numdep", col('numdep').cast('int'))\
    .withColumn("total_income_amount", col('A02650').cast('double'))\
    .withColumn("taxable_interest_amount", col('A00300').cast('double'))\
    .withColumn("net_capital_gains", col('a01000').cast('double'))\
    .withColumn("biz_net_income", col('a00900').cast('double'))

us_tax_filtered = us_tax_filtered [['state', 'zipcode', 'single_returns', 'joint_returns', 'numdep', 'total_income_amount', 'taxable_interest_amount', 'net_capital_gains', 'biz_net_income']]
us_tax_filtered.createOrReplaceTempView('us_tax_filtered')

# Average Taxable Interest across the US States
display (us_tax_filtered)

state,zipcode,single_returns,joint_returns,numdep,total_income_amount,taxable_interest_amount,net_capital_gains,biz_net_income
AL,0,488030,122290,571240,11444868.0,77952.0,23583.0,824487.0
AL,0,195840,155230,383240,17810952.0,81216.0,54639.0,252768.0
AL,0,72710,146880,189340,16070153.0,80627.0,84137.0,259836.0
AL,0,24860,126480,134370,14288572.0,71086.0,105947.0,214668.0
AL,0,16930,168170,177800,26053920.0,149150.0,404166.0,567439.0
AL,0,3530,42190,48270,20752068.0,271416.0,1569967.0,822565.0
AL,3500,950,260,710,19851.0,183.0,4.0,1657.0
AL,3500,590,410,860,49338.0,172.0,54.0,788.0
AL,3500,290,490,620,56170.0,185.0,139.0,584.0
AL,3500,90,490,530,52977.0,89.0,173.0,339.0


In [9]:
display(us_tax_filtered.filter('zipcode NOT IN (0000, 9999)').groupby('zipcode').agg(F.sum('net_capital_gains').alias('capital_gains'))\
        .sort(desc('capital_gains')).limit(40))

zipcode,capital_gains
1002,12443093.0
1001,8606053.0
9430,5976580.0
9402,4667696.0
1000,4110129.0
683,3964177.0
9411,3929969.0
6061,3851723.0
1012,2426170.0
7702,2186449.0


In [10]:
display(us_tax_filtered.filter('zipcode NOT IN (0000, 9999)').groupby('zipcode').agg(F.sum('biz_net_income').alias('business_net_income'), \
     F.sum('net_capital_gains').alias('capital_gains'), (F.sum('net_capital_gains') + F.sum('biz_net_income')).alias('capital_and_business_income'))\
      .sort(desc('capital_and_business_income')).limit(40))

zipcode,business_net_income,capital_gains,capital_and_business_income
1002,1866926.0,12443093.0,14310019.0
1001,1491999.0,8606053.0,10098052.0
9430,349156.0,5976580.0,6325736.0
9402,681523.0,4667696.0,5349219.0
9411,1027768.0,3929969.0,4957737.0
1000,685593.0,4110129.0,4795722.0
6061,793338.0,3851723.0,4645061.0
683,277152.0,3964177.0,4241329.0
1012,331281.0,2426170.0,2757451.0
7702,541319.0,2186449.0,2727768.0


In [11]:
display(farmer_market.groupBy('state').agg(F.count(F.lit(1)).alias('Total Markets')))

state,Total Markets
Utah,41
Hawaii,97
Minnesota,189
Ohio,321
Oregon,171
Arkansas,100
Texas,205
North Dakota,65
,1
Pennsylvania,303


In [12]:
us_tax_cumm = us_tax_filtered.groupBy('zipcode').sum()
farmer_market_cleaned = farmer_market.withColumn("zipcode", (col("zip")/10)).groupby('zipcode').count()\
                        .select(col('count').cast('double').alias('count'), col('zipcode').alias('zip'))
us_tax_expanded = farmer_market_cleaned.join(us_tax_cumm, farmer_market_cleaned.zip == us_tax_cumm.zipcode, how='outer').na.fill(0)
us_tax_expanded = us_tax_expanded.select(col('count'), col('zip'), col('zipcode'), col('sum(zipcode)').alias('sum_zipcode'), col('sum(single_returns)').alias('sum_single_returns'), col('sum(joint_returns)').alias('sum_joint_returns'), col('sum(numdep)').alias('sum_numdep'), col('sum(total_income_amount)').alias('sum_total_income_amount'), col('sum(taxable_interest_amount)').alias('sum_taxable_interest_amount'), col('sum(net_capital_gains)').alias('sum_net_capital_gains'), col('sum(biz_net_income)').alias('sum_biz_net_income'))

# Bringing featured columns as a single columns
feature_columns = ['sum_zipcode', 'sum_single_returns', 'sum_joint_returns', 'sum_numdep', 'sum_total_income_amount', 'sum_taxable_interest_amount', 'sum_net_capital_gains', 'sum_biz_net_income']
us_tax_assembler_model = VectorAssembler(inputCols=feature_columns, outputCol='features')
us_tax_expanded_prepared = us_tax_assembler_model.transform(us_tax_expanded)

# Splitting the data into training and testing set
train, test = us_tax_expanded_prepared.randomSplit([.65, .25])

In [13]:
lrReg = LinearRegression (maxIter=50, regParam=0.3, labelCol='count', elasticNetParam=0.4)
lrModel = lrReg.fit (train)
lrModelSummary = lrModel.summary
print ("Computed Coefficients = ", lrModel.coefficients)
print ("Computed Intercepts = ", lrModel.intercept)
print ("Objective History = ", lrModelSummary.objectiveHistory)
print ("Mean Absolute Error, MAE = ", lrModelSummary.meanAbsoluteError)
print ("RMSE = {0}".format(lrModelSummary.rootMeanSquaredError))
print ("R^2 = {0}".format(lrModelSummary.r2))

In [14]:
print (lrModel.explainParams())

In [15]:
display (lrModel, test, "fittedVsResiduals")

fitted values,residuals
0.7533342057442908,-0.7533342057442908
0.7877943753218855,0.2122056246781145
0.7877943753218855,0.2122056246781145
0.5838570483014895,-0.5838570483014895
0.7877943753218855,0.2122056246781145
0.576463437793736,-0.576463437793736
0.595012483251002,-0.595012483251002
0.7877943753218855,0.2122056246781145
0.7877943753218855,0.2122056246781145
0.7877943753218855,0.2122056246781145


In [16]:
display (lrModelSummary.residuals)

residuals
-0.7601519320165157
-0.7292213754953862
-0.686215826938981
-0.5205097823933504
-0.7108488106909435
-0.5202868594634684
-0.4122435461139855
-0.6789708317178156
-0.2316295306324953
-0.4400810469830021


In [17]:
lrPredictions = lrModel.transform(test)
lrEvaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = lrEvaluator.evaluate(lrPredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [18]:
display(lrPredictions, 1000)

count,zip,zipcode,sum_zipcode,sum_single_returns,sum_joint_returns,sum_numdep,sum_total_income_amount,sum_taxable_interest_amount,sum_net_capital_gains,sum_biz_net_income,features,prediction
0.0,0.0,305,10980,25040,22560,29630,3711733.0,16160.0,68018.0,128687.0,"List(1, 8, List(), List(10980.0, 25040.0, 22560.0, 29630.0, 3711733.0, 16160.0, 68018.0, 128687.0))",0.7707965019183819
0.0,0.0,2862,120204,9230,10660,15950,1083118.0,6959.0,20019.0,35787.0,"List(1, 8, List(), List(120204.0, 9230.0, 10660.0, 15950.0, 1083118.0, 6959.0, 20019.0, 35787.0))",0.6017094596028741
0.0,0.0,5983,179490,1660,2280,2550,244294.0,2011.0,7647.0,6364.0,"List(1, 8, List(), List(179490.0, 1660.0, 2280.0, 2550.0, 244294.0, 2011.0, 7647.0, 6364.0))",0.5099302316793666
0.0,0.0,6765,202950,1160,1050,1280,123276.0,1366.0,4671.0,4683.0,"List(1, 8, List(), List(202950.0, 1160.0, 1050.0, 1280.0, 123276.0, 1366.0, 4671.0, 4683.0))",0.4736123710194217
0.0,0.0,7313,263268,19740,14960,28140,2222300.0,13873.0,33345.0,58180.0,"List(1, 8, List(), List(263268.0, 19740.0, 14960.0, 28140.0, 2222300.0, 13873.0, 33345.0, 58180.0))",0.380235528765093
1.0,211.6,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855
1.0,875.2,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855
1.0,2350.4,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855
1.0,2815.0,2815,50670,8510,7970,15820,916019.0,5718.0,11020.0,22524.0,"List(1, 8, List(), List(50670.0, 8510.0, 7970.0, 15820.0, 916019.0, 5718.0, 11020.0, 22524.0))",0.7093533693696518
1.0,4968.6,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855


In [19]:
display(lrPredictions)

count,zip,zipcode,sum_zipcode,sum_single_returns,sum_joint_returns,sum_numdep,sum_total_income_amount,sum_taxable_interest_amount,sum_net_capital_gains,sum_biz_net_income,features,prediction
0.0,0.0,305,10980,25040,22560,29630,3711733.0,16160.0,68018.0,128687.0,"List(1, 8, List(), List(10980.0, 25040.0, 22560.0, 29630.0, 3711733.0, 16160.0, 68018.0, 128687.0))",0.7707965019183819
0.0,0.0,2862,120204,9230,10660,15950,1083118.0,6959.0,20019.0,35787.0,"List(1, 8, List(), List(120204.0, 9230.0, 10660.0, 15950.0, 1083118.0, 6959.0, 20019.0, 35787.0))",0.6017094596028741
0.0,0.0,5983,179490,1660,2280,2550,244294.0,2011.0,7647.0,6364.0,"List(1, 8, List(), List(179490.0, 1660.0, 2280.0, 2550.0, 244294.0, 2011.0, 7647.0, 6364.0))",0.5099302316793666
0.0,0.0,6765,202950,1160,1050,1280,123276.0,1366.0,4671.0,4683.0,"List(1, 8, List(), List(202950.0, 1160.0, 1050.0, 1280.0, 123276.0, 1366.0, 4671.0, 4683.0))",0.4736123710194217
0.0,0.0,7313,263268,19740,14960,28140,2222300.0,13873.0,33345.0,58180.0,"List(1, 8, List(), List(263268.0, 19740.0, 14960.0, 28140.0, 2222300.0, 13873.0, 33345.0, 58180.0))",0.380235528765093
1.0,211.6,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855
1.0,875.2,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855
1.0,2350.4,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855
1.0,2815.0,2815,50670,8510,7970,15820,916019.0,5718.0,11020.0,22524.0,"List(1, 8, List(), List(50670.0, 8510.0, 7970.0, 15820.0, 916019.0, 5718.0, 11020.0, 22524.0))",0.7093533693696518
1.0,4968.6,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.7877943753218855


Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLMs) are specifications of linear models where the response variable Yi follows some distribution from the exponential family of distributions. Spark’s GeneralizedLinearRegression interface allows for flexible specification of GLMs which can be used for various types of prediction problems including linear regression, Poisson regression, logistic regression, and others. 
Reference: https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#generalized-linear-regression

In [21]:
glrReg = GeneralizedLinearRegression (family='gaussian', link='identity', maxIter=50, regParam=0.3, labelCol='count')
glrModel = glrReg.fit (train)
glrModelSummary = glrModel.summary
print ("GLR Computed Coefficients = ", glrModel.coefficients)
print ("GLR Computed Intercept = ", glrModel.intercept)
print ("T Values = ", glrModelSummary.tValues)
print ("P Values = ", glrModelSummary.pValues)
print ("GLR Coefficient of Standard Errors = ", glrModelSummary.coefficientStandardErrors)
print ("Dispersion = ", glrModelSummary.dispersion)
print ("Deviance = ", glrModelSummary.deviance)
print ("Null Deviance = ", glrModelSummary.nullDeviance)
print ("Residual Degree of Freedom Null = ", glrModelSummary.residualDegreeOfFreedomNull)
print ("AIC = ", glrModelSummary.aic)

In [22]:
display (glrModelSummary.residuals())

devianceResiduals
-0.7894576495598106
-0.7912674387431808
-0.6848771981461963
-0.4586520855091049
-0.7284227522708872
-0.5075470280829191
-0.3222074258722064
-0.6856052077053332
-0.0717182052891364
-0.354373503194155


In [23]:
glrPredictions = glrModel.transform(test)
glrEvaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmse_glr = glrEvaluator.evaluate(glrPredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse_glr)

In [24]:
display (glrPredictions, 1000)

count,zip,zipcode,sum_zipcode,sum_single_returns,sum_joint_returns,sum_numdep,sum_total_income_amount,sum_taxable_interest_amount,sum_net_capital_gains,sum_biz_net_income,features,prediction
0.0,0.0,305,10980,25040,22560,29630,3711733.0,16160.0,68018.0,128687.0,"List(1, 8, List(), List(10980.0, 25040.0, 22560.0, 29630.0, 3711733.0, 16160.0, 68018.0, 128687.0))",0.7703960447242665
0.0,0.0,2862,120204,9230,10660,15950,1083118.0,6959.0,20019.0,35787.0,"List(1, 8, List(), List(120204.0, 9230.0, 10660.0, 15950.0, 1083118.0, 6959.0, 20019.0, 35787.0))",0.5603204200086129
0.0,0.0,5983,179490,1660,2280,2550,244294.0,2011.0,7647.0,6364.0,"List(1, 8, List(), List(179490.0, 1660.0, 2280.0, 2550.0, 244294.0, 2011.0, 7647.0, 6364.0))",0.4528720699584748
0.0,0.0,6765,202950,1160,1050,1280,123276.0,1366.0,4671.0,4683.0,"List(1, 8, List(), List(202950.0, 1160.0, 1050.0, 1280.0, 123276.0, 1366.0, 4671.0, 4683.0))",0.4062684137148278
0.0,0.0,7313,263268,19740,14960,28140,2222300.0,13873.0,33345.0,58180.0,"List(1, 8, List(), List(263268.0, 19740.0, 14960.0, 28140.0, 2222300.0, 13873.0, 33345.0, 58180.0))",0.2546454785619292
1.0,211.6,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.8333502619029401
1.0,875.2,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.8333502619029401
1.0,2350.4,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.8333502619029401
1.0,2815.0,2815,50670,8510,7970,15820,916019.0,5718.0,11020.0,22524.0,"List(1, 8, List(), List(50670.0, 8510.0, 7970.0, 15820.0, 916019.0, 5718.0, 11020.0, 22524.0))",0.7110746813163363
1.0,4968.6,0,0,0,0,0,0.0,0.0,0.0,0.0,"List(0, 8, List(), List())",0.8333502619029401


We have used the publicly available dataset from the US Government department, and we build our ***Linear Regression (LR)*** and ***Generalized Linear Regression (GLR)*** model on top of it, which we have then used to make predictions on our hold-out or test sample. The RMSE scores were improved from LR to GLR. In the later versions, I plan to include Random Forest and Gradient Boosted Tree on the same dataset for more advanced Data Science use-case. We also realized that we need to evaluate hyper-parameters to achieve the best model. 
The published notebook is available at - *https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/3173713035751393/175225107118479/2308983777460038/latest.html*