In [1]:
#import packages
import numpy as np
import pandas as pd

# Clean the Original Data

In [2]:
# Load the bee colony dataset (TidyTuesday, 2022-01-11) directly from GitHub.
df = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/"
    "master/data/2022/2022-01-11/colony.csv"
)

In [3]:
# Remove the rows for the United States as a whole: they aggregate the
# individual states and would cause double counting when plotting the line chart.
# .copy() detaches the result from the unfiltered frame so that the column
# assignments made on `df` in later cells do not raise SettingWithCopyWarning.
df = df[df["state"] != "United States"].copy()

In [4]:
# Treat missing values in the added / renovated / lost colony counts as zero so
# the derived gain columns are never NaN.
# The filled frame is assigned back to `df` instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form is
# deprecated and leaves `df` unchanged when pandas Copy-on-Write is active.
df = df.fillna({"colony_added": 0, "colony_reno": 0, "colony_lost": 0})

# gross_gain = colonies added + colonies renovated
# net_gain   = gross_gain - colonies lost
df = df.assign(
    gross_gain=lambda d: d["colony_added"] + d["colony_reno"],
    net_gain=lambda d: d["gross_gain"] - d["colony_lost"],
)

In [5]:
# Persist the cleaned colony data; the row index is written as the first column.
df.to_csv(path_or_buf="colonyv2.csv")

# Clean the Data for a Choropleth Map

In [6]:
# Collect the distinct state names that occur in the dataset.
states = pd.unique(df["state"])

In [7]:
# Derived feature: net gain = (renovated + added) - lost colonies.
df["net_gain"] = df["colony_reno"].add(df["colony_added"]).sub(df["colony_lost"])

In [8]:
# Summarise the bee columns: mean of every numeric column per state, over all rows.
# numeric_only=True skips non-numeric columns; without it groupby().mean()
# raises a TypeError on such columns from pandas 2.0 onwards.
# `year` is dropped afterwards because an averaged year carries no meaning.
dfBees = df.groupby("state").mean(numeric_only=True).drop(columns=["year"])

In [43]:
# Format columns as the correct data types: cast the state name to pandas' string dtype.
# NOTE(review): `dfFinal` is first assigned by the "Add ID field" merge cell
# further down, so on a fresh kernel (Restart & Run All) this cell raises
# NameError. Its execution count (43) shows it was last run after that merge
# (40); it needs to be moved below the merge to run top to bottom.
dfFinal["state"] = dfFinal["state"].astype("string")
#dfFinal["state"]

# Disabled: optional rounding of the averaged colony_added column.
#dfFinal["colony_added"] = dfFinal["colony_added"].round(2)

In [None]:
# Display the per-state means held in dfBees.
dfBees

In [9]:
# Load the average latitude / longitude of each US state and index the table
# by state name.
dfMap = pd.read_json(
    "https://gist.githubusercontent.com/meiqimichelle/7727723/raw/"
    "0109432d22f28fd1a669a3fd113e41c4193dbb5d/USstates_avg_latLong"
).set_index("state")

In [10]:
# States that appear in the lat/long lookup but have no row in the bee summary.
dfMissingStates = dfMap.loc[~dfMap.index.isin(dfBees.index)]
dfMissingStates

Unnamed: 0_level_0,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alaska,61.385,-152.2683
Delaware,39.3498,-75.5148
New Hampshire,43.4108,-71.5653
Nevada,38.4199,-117.1219
Rhode Island,41.6772,-71.5101


In [11]:
# Put the bee summary and the coordinates side by side, keeping only the
# states whose index label is present in both tables.
dfConcat = pd.concat(objs=[dfBees, dfMap], axis="columns", join="inner")

In [12]:
# Add the states that only exist in the lat/long lookup; their bee columns are
# left as NaN because dfMissingStates has no such columns.
# DataFrame.append was removed in pandas 2.0, so the rows are stacked with
# pd.concat (row-wise by default), which produces the same frame.
dfConcat = pd.concat([dfConcat, dfMissingStates])

In [13]:
# Replace every remaining NaN with the literal string "NA". Later cells compare
# against this marker to find the states without colony data.
dfConcatNA = dfConcat.fillna(value="NA")

In [14]:
# Display the combined bee / coordinate table after the "NA" fill.
dfConcatNA

Unnamed: 0_level_0,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,gross_gain,net_gain,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,7900.0,8560.0,1258.076923,15.28,1079.615385,593.461538,8.304348,1673.076923,415.0,32.799,-86.8073
Arizona,28260.0,29380.0,5088.461538,17.68,5176.538462,3115.769231,11.727273,8292.307692,3203.846154,33.7712,-111.3877
Arkansas,21580.0,22440.0,3242.307692,14.56,3063.846154,958.846154,8.05,4022.692308,780.384615,34.9513,-92.3809
California,940400.0,1193200.0,132923.076923,11.16,127346.153846,114750.0,10.96,242096.153846,109173.076923,36.17,-119.7462
Colorado,19580.0,25300.0,3618.461538,13.96,2573.076923,1844.230769,9.294118,4417.307692,798.846154,39.0646,-105.3272
Connecticut,3516.0,3576.0,248.076923,7.2,350.769231,153.846154,5.894737,504.615385,256.538462,41.5834,-72.7622
Florida,244480.0,265200.0,33730.769231,13.44,43846.153846,26596.153846,10.96,70442.307692,36711.538462,27.8333,-81.717
Georgia,122680.0,129200.0,16423.076923,13.24,20315.384615,17267.307692,13.958333,37582.692308,21159.615385,32.9866,-83.6487
Hawaii,15480.0,15480.0,801.153846,5.458333,1043.076923,1933.461538,15.238095,2976.538462,2175.384615,21.1098,-157.5311
Idaho,96680.0,124840.0,12230.769231,10.24,10646.153846,9201.153846,10.73913,19847.307692,7616.538462,44.2394,-114.5103


In [40]:
# Add the numeric ID field for each state, taken from the vega sample dataset.
idCsv = "https://raw.githubusercontent.com/vega/vega/master/docs/data/population_engineers_hurricanes.csv"
# Only the id -> state mapping is needed; select the columns explicitly so that
# `id` stays the first column of the merged frame.
idDf = pd.read_csv(idCsv)[["id", "state"]]

# Inner merge on the state name. `state` is a column in idDf and the index name
# in dfConcatNA; merge accepts either as the key.
# The state name is cast to the pandas string dtype here, where dfFinal is
# created, so the dtype does not depend on running another cell out of order.
dfFinal = pd.merge(idDf, dfConcatNA, on="state").astype({"state": "string"})

In [47]:
# Check the column dtypes of the merged frame. The bee columns are `object`
# because their missing values were replaced with the string "NA".
dfFinal.dtypes

id                   int64
state               string
colony_n            object
colony_max          object
colony_lost         object
colony_lost_pct     object
colony_added        object
colony_reno         object
colony_reno_pct     object
gross_gain          object
net_gain            object
latitude           float64
longitude          float64
dtype: object

In [30]:
# Show the states that are not included in the colonies dataset, i.e. the rows
# whose bee columns hold the "NA" marker.
dfFinal.loc[dfFinal["colony_max"].eq("NA")]

Unnamed: 0,id,state,colony_n,colony_max,colony_lost,colony_lost_pct,colony_added,colony_reno,colony_reno_pct,gross_gain,net_gain,latitude,longitude
1,2,Alaska,,,,,,,,,,61.385,-152.2683
7,10,Delaware,,,,,,,,,,39.3498,-75.5148
27,32,Nevada,,,,,,,,,,38.4199,-117.1219
28,33,New Hampshire,,,,,,,,,,43.4108,-71.5653
38,44,Rhode Island,,,,,,,,,,41.6772,-71.5101


In [31]:
# Drop the states without colony data (marked "NA"), then save the result to CSV.
dfFinal = dfFinal.loc[dfFinal["colony_max"].ne("NA")]
dfFinal.to_csv(path_or_buf="beesv3.csv")