In [2]:
#import packages
import numpy as np
import pandas as pd

# Clean the Origional Data

In [3]:
#read the data
df = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv")

In [17]:
#Remove values for the United States as a whole, as it causes double counting when ploting the line chart
df = df[df["state"]!="United States"]

In [37]:
#Treat missing values for Colony renovated
#df[df["colony_reno"].isna()==True]
df["colony_reno"].fillna(0, inplace=True)
df["colony_lost"].fillna(0, inplace=True)

In [38]:
df.to_csv("colonyv2.csv")

# Clean the Data for a Choropleth Map

In [39]:
#get a list of states used in the dataset
states = df["state"].unique()

In [52]:
#create additional features
df["net_gain"] = (df["colony_reno"]+df["colony_added"])-df["colony_lost"]

In [53]:
#group by operation to summeries bee columns
dfBees = df.groupby(["state"]).sum()
dfBees.drop(columns=["year"], inplace=True)

In [54]:
# Join with Longitude and Latitude loactions
dfMap = pd.read_json("https://gist.githubusercontent.com/meiqimichelle/7727723/raw/0109432d22f28fd1a669a3fd113e41c4193dbb5d/USstates_avg_latLong")
dfMap = dfMap .set_index('state')

In [55]:
#get a list of the states missing from the origional Bees file
dfMissingStates = dfMap[dfMap.index.isin(dfBees.index)==False]
dfMissingStates

Unnamed: 0_level_0,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alaska,61.385,-152.2683
Delaware,39.3498,-75.5148
New Hampshire,43.4108,-71.5653
Nevada,38.4199,-117.1219
Rhode Island,41.6772,-71.5101


In [56]:
#Combine the two datasets together
dfConcat = pd.concat([dfBees,dfMap], axis=1, join="inner")

In [57]:
#Add the missing states to the dataset
dfConcat = dfConcat.append(dfMissingStates)

In [58]:
#Adjust for missing values
dfConcatNA = dfConcat.fillna("NA")

In [59]:
dfConcatNA

Unnamed: 0_level_0,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,net_gain,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,197500.0,214000.0,32710.0,382.0,28070.0,15430.0,191.0,10790.0,32.799,-86.8073
Arizona,706500.0,734500.0,132300.0,442.0,134590.0,81010.0,258.0,83300.0,33.7712,-111.3877
Arkansas,539500.0,561000.0,84300.0,364.0,79660.0,24930.0,161.0,20290.0,34.9513,-92.3809
California,23510000.0,29830000.0,3456000.0,279.0,3311000.0,2983500.0,274.0,2838500.0,36.17,-119.7462
Colorado,489500.0,632500.0,94080.0,349.0,66900.0,47950.0,158.0,21640.0,39.0646,-105.3272
Connecticut,87900.0,89400.0,6450.0,180.0,9120.0,4000.0,112.0,7070.0,41.5834,-72.7622
Florida,6112000.0,6630000.0,877000.0,336.0,1140000.0,691500.0,274.0,954500.0,27.8333,-81.717
Georgia,3067000.0,3230000.0,427000.0,331.0,528200.0,448950.0,335.0,550150.0,32.9866,-83.6487
Hawaii,387000.0,387000.0,20830.0,131.0,27120.0,50270.0,320.0,56560.0,21.1098,-157.5311
Idaho,2417000.0,3121000.0,318000.0,256.0,276800.0,239230.0,247.0,198030.0,44.2394,-114.5103


In [60]:
#Add ID field 
idCsv = "https://raw.githubusercontent.com/vega/vega/master/docs/data/population_engineers_hurricanes.csv"
idDf = pd.read_csv(idCsv)
idDf=idDf[["id","state"]]
dfFinal = pd.merge(left=idDf, right=dfConcatNA, left_on='state', right_on='state')

In [72]:
dfFinal

Unnamed: 0,id,state,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,net_gain,latitude,longitude
0,1,Alabama,197500.0,214000.0,32710.0,382.0,28070.0,15430.0,191.0,10790.0,32.799,-86.8073
1,2,Alaska,,,,,,,,,61.385,-152.2683
2,4,Arizona,706500.0,734500.0,132300.0,442.0,134590.0,81010.0,258.0,83300.0,33.7712,-111.3877
3,5,Arkansas,539500.0,561000.0,84300.0,364.0,79660.0,24930.0,161.0,20290.0,34.9513,-92.3809
4,6,California,23510000.0,29830000.0,3456000.0,279.0,3311000.0,2983500.0,274.0,2838500.0,36.17,-119.7462
5,8,Colorado,489500.0,632500.0,94080.0,349.0,66900.0,47950.0,158.0,21640.0,39.0646,-105.3272
6,9,Connecticut,87900.0,89400.0,6450.0,180.0,9120.0,4000.0,112.0,7070.0,41.5834,-72.7622
7,10,Delaware,,,,,,,,,39.3498,-75.5148
8,12,Florida,6112000.0,6630000.0,877000.0,336.0,1140000.0,691500.0,274.0,954500.0,27.8333,-81.717
9,13,Georgia,3067000.0,3230000.0,427000.0,331.0,528200.0,448950.0,335.0,550150.0,32.9866,-83.6487


In [73]:
dfFinal[dfFinal["colony_max"]]

KeyError: "None of [Index([  214000.0,       'NA',   734500.0,   561000.0, 29830000.0,   632500.0,\n          89400.0,       'NA',  6630000.0,  3230000.0,   387000.0,  3121000.0,\n         291500.0,   272000.0,   811000.0,   157400.0,   208900.0,  1318000.0,\n         277500.0,   209000.0,   187900.0,  2004000.0,  2485000.0,  1063500.0,\n         237300.0,  2827000.0,  1012000.0,       'NA',       'NA',   356000.0,\n         157200.0,  1136000.0,   556500.0,  8779000.0,   449500.0,   561700.0,\n        2618000.0,   513500.0,       'NA',   393000.0,  3845500.0,   259500.0,\n        6729000.0,   661500.0,   168500.0,   195000.0,  2492000.0,   190200.0,\n        1279000.0,   613000.0],\n      dtype='object')] are in the [columns]"

In [62]:
#Save to CSV
dfFinal.to_csv("beesv3.csv")