## Intitialize

In [1]:
# Dependencies
import pandas as pd
import os
from rename_fields import *

In [2]:
# Import File Paths
temp_csv = os.path.join("Data","Land_Temp","land_temps.csv")
export_csv = os.path.join("Data", "Aquaculture", "Exports.csv")
import_csv = os.path.join("Data", "Aquaculture", "Imports.csv")

In [3]:
# Ouput File Paths
TABLE_country = os.path.join("Output", "TABLE_country.csv")
TABLE_fish_type = os.path.join("Output", "TABLE_fish_type.csv")
TABLE_commercial = os.path.join("Output", "TABLE_commercial.csv")
TABLE_temp = os.path.join("Output", "TABLE_temp.csv")

In [4]:
# Load DataFrames
temp_df = pd.read_csv(temp_csv)
exports_df = pd.read_csv(export_csv)
imports_df = pd.read_csv(import_csv)

## Clean Imports/Exports

In [5]:
# Concat Dataframes
commercial_df = pd.concat([imports_df, exports_df])
# Drop Source Col
commercial_df.drop(labels="SOURCE_ID", axis=1, inplace=True)
# Split Attribute Description
attr_split_df = commercial_df.ATTRIBUTE_DESC.str.split(pat=", ", expand=True)
# Add Columns back to comercial df
commercial_df["Direction"] = attr_split_df.loc[:,0]
commercial_df["Measure"] = attr_split_df.loc[:,1]
commercial_df.head()

Unnamed: 0,HS_CODE,COMMODITY_DESC,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
0,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,1,46682,US Import,QTY
1,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,2,37354,US Import,QTY
2,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,3,26080,US Import,QTY
3,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,4,38737,US Import,QTY
4,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,5,25799,US Import,QTY


## Clean Temperature Data
- Split Date into columns
- Aggregate by Month (to match commercial data)

In [6]:
#Split 'dt' column into Year, Month, Day
temp_dates = temp_df.dt.str.split(pat="-", expand=True)
temp_df = pd.merge(left=temp_df, right=temp_dates, how="left", left_index=True, right_index=True)
temp_df.rename(columns={0:"YEAR_ID", 1:"MONTH_ID", 2:"DAY_ID"}, inplace=True)
temp_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,YEAR_ID,MONTH_ID,DAY_ID
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,1,1
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,2,1
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,3,1
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,4,1
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,5,1


In [7]:
# Aggregate by year/month/country
temp_gb = temp_df.groupby(by=["Country", "YEAR_ID", "MONTH_ID"])
agg_temp_df = temp_gb["AverageTemperature"].mean()
agg_temp_df = agg_temp_df.reset_index()
agg_temp_df.head()

Unnamed: 0,Country,YEAR_ID,MONTH_ID,AverageTemperature
0,Afghanistan,1833,1,2.29
1,Afghanistan,1833,2,3.319
2,Afghanistan,1833,3,7.444
3,Afghanistan,1833,4,13.576
4,Afghanistan,1833,5,19.321


## Align Country Names

In [8]:
# Let's make sets of the country names
countries_com = set(commercial_df.GEOGRAPHY_DESC)
countries_temp = set(temp_df.Country)
print(f"There are {len(countries_com)} country names in the commercial dataset")
print(f"There are {len(countries_temp)} country names in the temperature dataset")

There are 225 country names in the commercial dataset
There are 49 country names in the temperature dataset


In [9]:
#Lets see how many of the countries are in both datasets
countries_inter = countries_com.intersection(countries_temp)
print(f"There are {len(countries_inter)} country names that appear in both datasets")

There are 39 country names that appear in both datasets


In [10]:
# Since most of the temp countries fit inside of the commercial, let's see which ones do not
print("The following country names from the temperature data could not be found in the commercial data:")
print_statement = [print(each) for each in countries_temp.difference(countries_com)]

The following country names from the temperature data could not be found in the commercial data:
United States
Congo (Democratic Republic Of The)
Sudan
Syria
Côte D'Ivoire
China
Germany
South Africa
Taiwan
Burma


In [11]:
# We are going to rename the commercial_df to match the temp_df. We could go the other direction, but... 
# The temperature data labels make more sense
country_mapper = { 
    "Myanmar (Burma)" : "Burma",
    "China (Mainland)" : "China",
    "China (Taiwan)" : "Taiwan",
    "Congo (Kinshasa)" : "Congo (Democratic Republic Of The)",
    "German Democratic Republic" : "Germany",
    "Germany, Fed. Republic" : "Germany",
    "Ivory Coast" : "Côte D'Ivoire",
    "Republic of South Africa" : "South Africa",
}
commercial_df.GEOGRAPHY_DESC.replace(country_mapper, inplace=True)

Interestingly, there were no aquacultural exports to Sudan or Syria during this time

In [12]:
# And now the only countries that remain unmatched...
countries_com = set(commercial_df.GEOGRAPHY_DESC)
countries_temp = set(temp_df.Country)
print("These Countries could not be matched:")
print_statement = [print(each) for each in countries_temp.difference(countries_com)]

These Countries could not be matched:
United States
Sudan
Syria


There is just one issue remaining. We mapped two countries (The Germanies) onto one country name. We should also map their 'GEOGRAPHY_CODE' onto one ID as well. Let's see what the IDs are for Germany:

In [13]:
commercial_df[commercial_df.GEOGRAPHY_DESC == "Germany"]["GEOGRAPHY_CODE"].value_counts()

4280    9574
4290       8
Name: GEOGRAPHY_CODE, dtype: int64

It looks like there are only a few trades with '4290' code. This makes sense because the 'German Democratic Republic' was dissolved about a year into our commercial dataset. Let's check our intuitions...

In [14]:
commercial_df[commercial_df.GEOGRAPHY_CODE == 4290]

Unnamed: 0,HS_CODE,COMMODITY_DESC,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
319090,306130020.0,"SHRIMPS AND PRAWNS, SHELL-ON, FROZEN",4290,Germany,"US Import, QTY",KG,1989,11,16000,US Import,QTY
319091,306130020.0,"SHRIMPS AND PRAWNS, SHELL-ON, FROZEN",4290,Germany,"US Import, VLU",U.S.$,1989,11,122171,US Import,VLU
319092,,"Shrimp, frozen",4290,Germany,"US Import, QTY",KG,1989,11,16000,US Import,QTY
319093,,"Shrimp, frozen",4290,Germany,"US Import, VLU",U.S.$,1989,11,122171,US Import,VLU
319094,,"Shrimp, total",4290,Germany,"US Import, QTY",KG,1989,11,16000,US Import,QTY
319095,,"Shrimp, total",4290,Germany,"US Import, VLU",U.S.$,1989,11,122171,US Import,VLU
240232,301100000.0,"FISH, ORNAMENTAL, LIVE",4290,Germany,"US Export, VLU",U.S.$,1989,1,2501,US Export,VLU
240233,,Ornamental fish,4290,Germany,"US Export, VLU",U.S.$,1989,1,2501,US Export,VLU


And there we go! Isn't history fun? Ok let's get back to the Data Cleaning. We are just going to map those few communist Germans into their new united Germany

In [15]:
commercial_df.GEOGRAPHY_CODE.replace({4290 : 4280} , inplace=True)

In [16]:
commercial_df[commercial_df.GEOGRAPHY_DESC == "Germany"]["GEOGRAPHY_CODE"].value_counts()

4280    9582
Name: GEOGRAPHY_CODE, dtype: int64

## Add Fish Type

In [17]:
# Define separate df
niraj_df = commercial_df

# Pull COMMODITY_DESC into list
unique_commodity_list = niraj_df.COMMODITY_DESC.unique().tolist()

# Define fish types
fish_list = ["Salmon", "Clam", "Shrimp", "Mussel", "Oyster", "Trout", "Scallop", "Ornamental", "Tilapia", "Carp"]


In [18]:
# Create empty dictionary
commodity_dict = {}

# Iterate through unique list and assign fish value
for commodity in unique_commodity_list:
    x=0
    while x < len(fish_list):
        if fish_list[x] in commodity:
            commodity_dict[commodity] = fish_list[x]
            x = x + len(fish_list)
        elif fish_list[x].upper() in commodity:
            commodity_dict[commodity] = fish_list[x]
            x = x + len(fish_list)
        elif fish_list[x].lower() in commodity:        
            commodity_dict[commodity] = fish_list[x]
            x = x + len(fish_list)
        else:
            x=x+1 

In [19]:
# Pull COMMODITY_DESC from commercial_df. Turn into list
all_commodity = niraj_df.COMMODITY_DESC.tolist()
fish = []

# This is a list of descriptions with issues
problem_desc = ['PINK (HUMPIE)SALMN WHOLE/PIECES IN OIL/AIRTIT CNTR']

# Iterate through commodities and assign fish value
for commodity in all_commodity:
    for problem in problem_desc:
        if commodity == problem:
            fish.append("Salmon")
        else:
            append = commodity_dict[commodity]
            fish.append(append)

In [20]:
# DO NOT RUN THIS CELL MORE THAN ONCE!

# Insert new column
niraj_df.insert(2, "FISH_TYPE", fish)


In [21]:
niraj_df.head()

Unnamed: 0,HS_CODE,COMMODITY_DESC,FISH_TYPE,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
0,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,1,46682,US Import,QTY
1,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,2,37354,US Import,QTY
2,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,3,26080,US Import,QTY
3,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,4,38737,US Import,QTY
4,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,5,25799,US Import,QTY
...,...,...,...,...,...,...,...,...,...,...,...,...
368422,1.605520e+09,"SCALLOPS, INCLUDING QUEEN SCALLOPS, PREPARED O...",Scallop,2777,Curaco,"US Export, VLU",U.S.$,2014,7,5100,US Export,VLU
368423,1.605520e+09,"SCALLOPS, INCLUDING QUEEN SCALLOPS, PREPARED O...",Scallop,2777,Curaco,"US Export, VLU",U.S.$,2014,8,5100,US Export,VLU
368424,3.011100e+08,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental,2777,Curaco,"US Export, VLU",U.S.$,2012,11,4460,US Export,VLU
368425,3.011100e+08,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental,2777,Curaco,"US Export, VLU",U.S.$,2012,12,4139,US Export,VLU


## Table Creation

In [22]:
# Country Table
country_df = niraj_df.groupby(['GEOGRAPHY_CODE', 'GEOGRAPHY_DESC']).agg({'AMOUNT': 'sum'})
country_df = country_df.drop(columns=["AMOUNT"])
country_df.reset_index(inplace=True)
country_df_final = country_df.rename(mapper=country_renamer, axis=1)
country_df_final.head()

Unnamed: 0,country_code,country_name
0,1,World
1,1010,Greenland
2,1220,Canada
3,1610,St Pierre and Miquelon
4,2010,Mexico


In [23]:
# Fish Table
fish_type_df = niraj_df.groupby(by=["HS_CODE", "COMMODITY_DESC","FISH_TYPE"]).agg({'AMOUNT': 'sum'})
fish_type_df = fish_type_df.drop(columns=["AMOUNT"])
fish_type_df.reset_index(inplace=True)
fish_type_df_final = fish_type_df.rename(mapper=fish_renamer, axis=1)
fish_type_df_final.head()

Unnamed: 0,hs_code,commodity_desc,fish_type
0,301100000.0,"FISH, ORNAMENTAL, LIVE",Ornamental
1,301100010.0,"KOI (COMMON) CARP (CYPRINUS CARPIO), LIVE",Carp
2,301100020.0,"GOLDFISH AND CRUCIAN CARP, LIVE",Carp
3,301100090.0,"FISH, ORNAMENTAL, LIVE, NESOI",Ornamental
4,301110000.0,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental


In [24]:
# Commercial Table
commercial_df = niraj_df
commercial_cols = list(commercial_renamer.keys())
commercial_df_final = commercial_df[commercial_cols]
commercial_df_final = commercial_df_final.rename(mapper=commercial_renamer, axis=1)
commercial_df_final.head()

Unnamed: 0,country_code,hs_code,unit_descr,direction,measure,year_id,month_id,amount
0,1,302110000.0,KG,US Import,QTY,1989,1,46682
1,1,302110000.0,KG,US Import,QTY,1989,2,37354
2,1,302110000.0,KG,US Import,QTY,1989,3,26080
3,1,302110000.0,KG,US Import,QTY,1989,4,38737
4,1,302110000.0,KG,US Import,QTY,1989,5,25799


In [25]:
# Temperature Table
agg_temp_df_final = agg_temp_df.rename(mapper=temp_renamer, axis=1)
agg_temp_df_final.head()

Unnamed: 0,country_name,year_id,month_id,avg_temperature
0,Afghanistan,1833,1,2.29
1,Afghanistan,1833,2,3.319
2,Afghanistan,1833,3,7.444
3,Afghanistan,1833,4,13.576
4,Afghanistan,1833,5,19.321


In [30]:
# EXPORT TABLES TO CSV
country_df_final.to_csv(TABLE_country)
fish_type_df_final.to_csv(TABLE_fish_type)
commercial_df_final.to_csv(TABLE_commercial)
agg_temp_df_final.to_csv(TABLE_temp)

## Aggregation 
Aggregations that will not be loaded into the database but are here for academic purposes

In [27]:
#Grouping by Country, Import/Export and QTY/VLU
country_df = commercial_df.groupby(['GEOGRAPHY_DESC', "FISH_TYPE"]).agg({'AMOUNT': 'sum', 'GEOGRAPHY_CODE':'mean'})
country_df

Unnamed: 0_level_0,Unnamed: 1_level_0,AMOUNT,GEOGRAPHY_CODE
GEOGRAPHY_DESC,FISH_TYPE,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,Ornamental,32918,5310.0
Afghanistan,Oyster,794728,5310.0
Afghanistan,Salmon,165362,5310.0
Afghanistan,Scallop,3074540,5310.0
Afghanistan,Shrimp,283708,5310.0
...,...,...,...
Zaire,Salmon,37262,7660.0
Zambia,Carp,14676,7940.0
Zambia,Ornamental,5221980,7940.0
Zimbabwe,Salmon,6576,7960.0


In [28]:
country_df["GEOGRAPHY_CODE"] = country_df["GEOGRAPHY_CODE"].astype(int)
country_df

Unnamed: 0_level_0,Unnamed: 1_level_0,AMOUNT,GEOGRAPHY_CODE
GEOGRAPHY_DESC,FISH_TYPE,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,Ornamental,32918,5310
Afghanistan,Oyster,794728,5310
Afghanistan,Salmon,165362,5310
Afghanistan,Scallop,3074540,5310
Afghanistan,Shrimp,283708,5310
...,...,...,...
Zaire,Salmon,37262,7660
Zambia,Carp,14676,7940
Zambia,Ornamental,5221980,7940
Zimbabwe,Salmon,6576,7960


In [29]:
species_df = commercial_df.groupby(by=["COMMODITY_DESC", "FISH_TYPE"]).sum()
species_df

Unnamed: 0_level_0,Unnamed: 1_level_0,HS_CODE,GEOGRAPHY_CODE,YEAR_ID,TIMEPERIOD_ID,AMOUNT
COMMODITY_DESC,FISH_TYPE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATLANTIC & DANUBE SALMON FROZEN EX FILLET/LVR/ROE,Salmon,1.208028e+12,10832650,7966966,26188,846428360
"ATLANTIC AND DANUBE SALMON, FROZEN, NESOI",Salmon,6.246333e+10,540262,409776,1294,31821312
"ATLANTIC SALMON (SALMO SALAR) AND DANUBE SALMON (HUCHO HUCHO), FRESH OR CHILLED,",Salmon,9.547626e+10,670308,636430,1896,7230762
"ATLANTIC SALMON (SALMO SALAR) AND DANUBE SALMON (HUCHO HUCHO), FROZEN, EXCEPT FI",Salmon,3.231366e+11,3313056,2146878,6730,179120768
"ATLANTIC SALMON (SALMO SALAR) FILLETS, NOT FARMED, FRESH OR CHILLED",Salmon,3.324157e+11,3823152,2199046,6882,9834372218
...,...,...,...,...,...,...
"Tilapia, Fillets, frozen",Tilapia,0.000000e+00,18738434,9631362,30892,16053408448
"Tilapia, Whole, frozen",Tilapia,0.000000e+00,14995316,7456520,24198,4204230064
"Tilapia, total",Tilapia,0.000000e+00,31122340,17311031,55644,25938310798
"Trout, fresh and frozen",Trout,0.000000e+00,25565109,18344154,58971,1910883300
