In [48]:
#import packages
import numpy as np
import pandas as pd

# Clean the Original Data

In [49]:
# Load the raw colony data from the TidyTuesday repository
colony_csv_url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv"
df = pd.read_csv(colony_csv_url)

In [50]:
# Drop the rows for the United States as a whole: keeping the national aggregate
# would double count colonies when plotting the line chart
is_single_state = df["state"].ne("United States")
df = df[is_single_state]

In [51]:
# Treat missing colony counts as zero, then derive the gain columns.
# The fill is applied to the frame itself rather than through
# `df[col].fillna(0, inplace=True)`: an in-place fill on a selected column is
# chained assignment, which pandas warns about and which leaves `df` unchanged
# when Copy-on-Write is enabled.
count_columns = ["colony_added", "colony_reno", "colony_lost"]
df = df.fillna({column: 0 for column in count_columns})
# gross_gain = colonies added + colonies renovated
df = df.assign(gross_gain=df["colony_added"] + df["colony_reno"])
# net_gain = gross gain minus colonies lost
df = df.assign(net_gain=df["gross_gain"] - df["colony_lost"])

In [52]:
# Save the cleaned colony data for later use
cleaned_csv_path = "colonyv2.csv"
df.to_csv(cleaned_csv_path)

# Clean the Data for a Choropleth Map

In [53]:
# Collect the distinct state names that appear in the dataset
state_column = df["state"]
states = state_column.unique()

In [54]:
# Additional feature: net gain = (renovated + added) - lost colonies
colonies_gained = df["colony_reno"] + df["colony_added"]
df["net_gain"] = colonies_gained - df["colony_lost"]

In [55]:
# Summarise the bee columns as per-state averages.
# numeric_only=True restricts the mean to numeric columns: pandas >= 2.0 no
# longer drops non-numeric columns silently and raises a TypeError on them.
# NOTE(review): presumably the source CSV has a text reporting-period column
# that triggers this — confirm against the data.
dfBees = df.groupby(["state"]).mean(numeric_only=True)
# The averaged year is not a bee metric, so drop it (rebinding instead of inplace)
dfBees = dfBees.drop(columns=["year"])

In [58]:
# Keep two decimal places in the per-state averages
dfBees = dfBees.round(decimals=2)

In [59]:
# Load the average latitude/longitude per state and index it by state name
# so it can be joined with the bee averages
state_coords_url = "https://gist.githubusercontent.com/meiqimichelle/7727723/raw/0109432d22f28fd1a669a3fd113e41c4193dbb5d/USstates_avg_latLong"
dfMap = pd.read_json(state_coords_url).set_index("state")

In [60]:
# States that have coordinates but are missing from the original bees data
has_bee_data = dfMap.index.isin(dfBees.index)
dfMissingStates = dfMap[~has_bee_data]
dfMissingStates

Unnamed: 0_level_0,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alaska,61.385,-152.2683
Delaware,39.3498,-75.5148
New Hampshire,43.4108,-71.5653
Nevada,38.4199,-117.1219
Rhode Island,41.6772,-71.5101


In [61]:
# Put the bee averages and the coordinates side by side, keeping only the
# states that are present in both frames
frames_to_join = [dfBees, dfMap]
dfConcat = pd.concat(frames_to_join, axis="columns", join="inner")

In [62]:
# Add the states without bee data back to the dataset; they only carry
# latitude/longitude, so their bee columns end up missing.
# DataFrame.append was removed in pandas 2.0, so the rows are stacked with pd.concat.
dfConcat = pd.concat([dfConcat, dfMissingStates])

In [63]:
# Replace missing values with the "NA" marker string
# (columns that receive the marker are reported as object dtype afterwards)
dfConcatNA = dfConcat.fillna(value="NA")

In [64]:
# Preview the first rows instead of rendering the whole frame
dfConcatNA.head(10)

Unnamed: 0_level_0,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,gross_gain,net_gain,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,7900.0,8560.0,1258.08,15.28,1079.62,593.46,8.3,1673.08,415.0,32.799,-86.8073
Arizona,28260.0,29380.0,5088.46,17.68,5176.54,3115.77,11.73,8292.31,3203.85,33.7712,-111.3877
Arkansas,21580.0,22440.0,3242.31,14.56,3063.85,958.85,8.05,4022.69,780.38,34.9513,-92.3809
California,940400.0,1193200.0,132923.08,11.16,127346.15,114750.0,10.96,242096.15,109173.08,36.17,-119.7462
Colorado,19580.0,25300.0,3618.46,13.96,2573.08,1844.23,9.29,4417.31,798.85,39.0646,-105.3272
Connecticut,3516.0,3576.0,248.08,7.2,350.77,153.85,5.89,504.62,256.54,41.5834,-72.7622
Florida,244480.0,265200.0,33730.77,13.44,43846.15,26596.15,10.96,70442.31,36711.54,27.8333,-81.717
Georgia,122680.0,129200.0,16423.08,13.24,20315.38,17267.31,13.96,37582.69,21159.62,32.9866,-83.6487
Hawaii,15480.0,15480.0,801.15,5.46,1043.08,1933.46,15.24,2976.54,2175.38,21.1098,-157.5311
Idaho,96680.0,124840.0,12230.77,10.24,10646.15,9201.15,10.74,19847.31,7616.54,44.2394,-114.5103


In [65]:
# Add the ID field: take the id/state pairs from the Vega example dataset
# and match them to the combined frame on the state name
idCsv = "https://raw.githubusercontent.com/vega/vega/master/docs/data/population_engineers_hurricanes.csv"
idDf = pd.read_csv(idCsv).loc[:, ["id", "state"]]
dfFinal = idDf.merge(dfConcatNA, on="state")

In [66]:
# Inspect the column types of the merged frame; the bee columns show up as
# object because their missing values were filled with the string "NA"
dfFinal.dtypes

id                   int64
state               object
colony_n            object
colony_max          object
colony_lost         object
colony_lost_pct     object
colony_added        object
colony_reno         object
colony_reno_pct     object
gross_gain          object
net_gain            object
latitude           float64
longitude          float64
dtype: object

In [67]:
# Show the states that are not included in the colonies dataset
lacks_colony_data = dfFinal["colony_max"].eq("NA")
dfFinal[lacks_colony_data]

Unnamed: 0,id,state,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,gross_gain,net_gain,latitude,longitude
1,2,Alaska,,,,,,,,,,61.385,-152.2683
7,10,Delaware,,,,,,,,,,39.3498,-75.5148
27,32,Nevada,,,,,,,,,,38.4199,-117.1219
28,33,New Hampshire,,,,,,,,,,43.4108,-71.5653
38,44,Rhode Island,,,,,,,,,,41.6772,-71.5101


In [68]:
# Drop the states without colony data, then save the result to CSV
has_colony_data = dfFinal["colony_max"].ne("NA")
dfFinal = dfFinal[has_colony_data]
dfFinal.to_csv("beesv3.csv")