In [4]:
#import packages
import numpy as np
import pandas as pd

In [5]:
# Load the TidyTuesday (2022-01-11) bee colony dataset directly from GitHub.
COLONY_CSV_URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv"
df = pd.read_csv(COLONY_CSV_URL)

In [6]:
# Distinct values of the "state" column, in order of first appearance.
states = pd.unique(df["state"])

In [7]:
# Derived features, both measured against the colonies lost in the period:
#   net_gain       = colonies renovated - colonies lost
#   artifical_gain = colonies added     - colonies lost
# (the "artifical_gain" spelling is kept as-is because it is the column key
# that ends up in the exported data).
lost = df["colony_lost"]
df["net_gain"] = df["colony_reno"].sub(lost)
df["artifical_gain"] = df["colony_added"].sub(lost)

In [8]:
# Summarise the bee columns as a per-state mean.
# numeric_only=True is required on pandas >= 2.0: without it, mean() raises a
# TypeError as soon as the frame contains a non-numeric column instead of
# silently skipping it as older versions did.
# "year" is dropped afterwards because an averaged year is meaningless; the
# chained drop (rather than inplace=True) keeps this cell safe to re-run.
dfBees = (
    df.groupby(["state"])
    .mean(numeric_only=True)
    .drop(columns=["year"])
)

In [9]:
# Load the per-state latitude/longitude table and index it by state name so
# it can be aligned with the per-state bee averages.
STATE_COORDS_URL = "https://gist.githubusercontent.com/meiqimichelle/7727723/raw/0109432d22f28fd1a669a3fd113e41c4193dbb5d/USstates_avg_latLong"
dfMap = pd.read_json(STATE_COORDS_URL).set_index("state")

In [12]:
# States that have coordinates in dfMap but no row in the bee averages.
hasBeeData = dfMap.index.isin(dfBees.index)
dfMissingStates = dfMap.loc[~hasBeeData]
dfMissingStates

Unnamed: 0_level_0,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alaska,61.385,-152.2683
Delaware,39.3498,-75.5148
New Hampshire,43.4108,-71.5653
Nevada,38.4199,-117.1219
Rhode Island,41.6772,-71.5101


In [13]:
# Place the bee averages and the coordinates side by side, keeping only the
# states whose index label appears in both frames (inner join on the index).
dfConcat = pd.concat(
    objs=[dfBees, dfMap],
    axis=1,
    join="inner",
)

In [14]:
# Add the states that have coordinates but no bee data as extra rows; their
# bee columns are left as NaN.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0 (where the old call raises AttributeError).
dfConcat = pd.concat([dfConcat, dfMissingStates])

In [16]:
# Replace every missing value with the literal string "NA" (this applies to
# the bee columns of the states that were appended without bee data).
dfConcatNA = dfConcat.fillna(value="NA")

In [19]:
# Show the combined per-state table after missing values were filled.
dfConcatNA

Unnamed: 0_level_0,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,net_gain,artifical_gain,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,7900.0,8560.0,1308.4,15.28,1122.8,617.2,8.304348,-691.2,-185.6,32.799,-86.8073
Arizona,28260.0,29380.0,5292.0,17.68,5383.6,3375.416667,11.727273,-1887.083333,91.6,33.7712,-111.3877
Arkansas,21580.0,22440.0,3372.0,14.56,3186.4,1038.75,8.05,-2373.75,-185.6,34.9513,-92.3809
California,940400.0,1193200.0,138240.0,11.16,132440.0,119340.0,10.96,-18900.0,-5800.0,36.17,-119.7462
Colorado,19580.0,25300.0,3763.2,13.96,2908.695652,2283.333333,9.294118,-1795.714286,-1143.043478,39.0646,-105.3272
Connecticut,3516.0,3576.0,258.0,7.2,396.521739,181.818182,5.894737,-45.909091,136.086957,41.5834,-72.7622
Florida,244480.0,265200.0,35080.0,13.44,45600.0,27660.0,10.96,-7420.0,10520.0,27.8333,-81.717
Georgia,122680.0,129200.0,17080.0,13.24,21128.0,17958.0,13.958333,878.0,4048.0,32.9866,-83.6487
Hawaii,15480.0,15480.0,833.2,5.458333,1084.8,2094.583333,15.238095,1261.25,251.6,21.1098,-157.5311
Idaho,96680.0,124840.0,12720.0,10.24,11072.0,9569.2,10.73913,-3150.8,-1648.0,44.2394,-114.5103


In [20]:
# Attach an "id" to every state by looking it up in the Vega sample dataset.
# Only the "id" and "state" columns of that file are used. The merge uses the
# default inner join on "state", so only states found in both tables remain.
idCsv = "https://raw.githubusercontent.com/vega/vega/master/docs/data/population_engineers_hurricanes.csv"
idDf = pd.read_csv(idCsv)[["id", "state"]]
dfFinal = idDf.merge(dfConcatNA, on="state")

In [21]:
# Show the merged result: state ids joined to the bee metrics and coordinates.
dfFinal

Unnamed: 0,id,state,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,net_gain,artifical_gain,latitude,longitude
0,1,Alabama,7900.0,8560.0,1308.4,15.28,1122.8,617.2,8.304348,-691.2,-185.6,32.799,-86.8073
1,2,Alaska,,,,,,,,,,61.385,-152.2683
2,4,Arizona,28260.0,29380.0,5292.0,17.68,5383.6,3375.416667,11.727273,-1887.083333,91.6,33.7712,-111.3877
3,5,Arkansas,21580.0,22440.0,3372.0,14.56,3186.4,1038.75,8.05,-2373.75,-185.6,34.9513,-92.3809
4,6,California,940400.0,1193200.0,138240.0,11.16,132440.0,119340.0,10.96,-18900.0,-5800.0,36.17,-119.7462
5,8,Colorado,19580.0,25300.0,3763.2,13.96,2908.695652,2283.333333,9.294118,-1795.714286,-1143.043478,39.0646,-105.3272
6,9,Connecticut,3516.0,3576.0,258.0,7.2,396.521739,181.818182,5.894737,-45.909091,136.086957,41.5834,-72.7622
7,10,Delaware,,,,,,,,,,39.3498,-75.5148
8,12,Florida,244480.0,265200.0,35080.0,13.44,45600.0,27660.0,10.96,-7420.0,10520.0,27.8333,-81.717
9,13,Georgia,122680.0,129200.0,17080.0,13.24,21128.0,17958.0,13.958333,878.0,4048.0,32.9866,-83.6487


In [18]:
# Write the final table to disk. With the default index=True the row index
# is written too, as an unnamed first column.
dfFinal.to_csv(path_or_buf="beesv3.csv")