## Imports and Reading Dataset

In [1]:
!pip install --user uszipcode
from uszipcode import SearchEngine
import pandas as pd
import numpy as np

search = SearchEngine(simple_zipcode=True)
df = pd.read_csv('/datasets/iowaliquor.csv')[['Date', 'Store Name', 'Zip Code', 'Sale (Dollars)']]



  interactivity=interactivity, compiler=compiler, result=result)


## Fill Missing Zip Codes

In [2]:
unqs = df[df["Zip Code"].isna()]["Store Name"].unique() # get unique store names
cities = [store.split("/")[-1].strip().lower() for store in unqs if "/" in store] # extract city name
if "mlk" in cities:
    cities.remove("mlk")
subset = df[df["Store Name"].str.contains("/") & (df["Zip Code"].isna())] 
city_dict = {}
for city in set(cities): # get zip code from rows with cities in store name
    city_dict[city] = search.by_city_and_state(city, "iowa")[0].zipcode
city_dict["mlk"] = '50310'
df.loc[subset.index, 'Zip Code'] = subset["Store Name"].str.split("/").str[-1].str.strip().str.lower().map(city_dict) 

## Aggregate and Get Zip Code Statistics

In [91]:
df["YearMonth"] = df["Date"].str.split("/").str[2] + "-" + df["Date"].str.split("/").str[0]

totalSales = pd.DataFrame(df.groupby(["YearMonth", "Zip Code"])["Sale (Dollars)"].sum())
totalSales.reset_index(inplace=True)
totalSales["Zip Code"] = totalSales["Zip Code"].replace("712-2", 51529)
totalSales["Zip Code"] = totalSales["Zip Code"].astype(int)

populationDict = {}
popDensityDict = {}
housingUnitsDict = {}
incomeDict = {}

for zc in totalSales["Zip Code"].unique():
    stats = search.by_zipcode(zc)
    populationDict[zc] = stats.population
    popDensityDict[zc] = stats.population_density
    housingUnitsDict[zc] = stats.housing_units
    incomeDict[zc] = stats.median_household_income

In [149]:
totalSales["Month"] = totalSales["YearMonth"].str.split("-").str[-1]
totalSales["Population"] = totalSales["Zip Code"].map(populationDict)
totalSales["PopulationDensity"] = totalSales["Zip Code"].map(popDensityDict)
totalSales["HousingUnits"] = totalSales["Zip Code"].map(housingUnitsDict)
totalSales["MedianIncome"] = totalSales["Zip Code"].map(incomeDict)
totalSales["SalesPerCapita"] = totalSales["Sale (Dollars)"] / totalSales["Population"]
totalSales["LogPopDensity"] = np.log(totalSales["PopulationDensity"])
totalSales.dropna(inplace=True)

## Linear Regression

In [154]:
X = totalSales[['LogPopDensity', 'MedianIncome', 'Month']]
X = np.array(pd.get_dummies(X))
y = np.array(totalSales['SalesPerCapita'])

In [155]:
b = np.linalg.inv(X.T@X)@X.T@y

In [156]:
b

array([ 1.63191468e+00, -1.11067809e-04,  3.72030548e+00,  3.72173226e+00,
        3.90663960e+00,  4.06305266e+00,  4.83021274e+00,  4.90684039e+00,
        4.24682397e+00,  4.81158221e+00,  3.90109263e+00,  4.93140181e+00,
        3.68464545e+00,  5.16324349e+00])

In [157]:
np.sqrt(((X@b - y)**2).mean())

9.961374167005804