## Intitialize

In [1]:
# Dependencies
import pandas as pd
import os

In [2]:
# File Paths
temp_csv = os.path.join("Data","Land_Temp","land_temps.csv")
export_csv = os.path.join("Data", "Aquaculture", "Exports.csv")
import_csv = os.path.join("Data", "Aquaculture", "Imports.csv")

In [3]:
# Load DataFrames
temp_df = pd.read_csv(temp_csv)
exports_df = pd.read_csv(export_csv)
imports_df = pd.read_csv(import_csv)

## Clean Imports/Exports

In [4]:
# Concat Dataframes
commercial_df = pd.concat([imports_df, exports_df])
# Drop Source Col
commercial_df.drop(labels="SOURCE_ID", axis=1, inplace=True)
# Split Attribute Description
attr_split_df = commercial_df.ATTRIBUTE_DESC.str.split(pat=", ", expand=True)
# Add Columns back to comercial df
commercial_df["Direction"] = attr_split_df.loc[:,0]
commercial_df["Measure"] = attr_split_df.loc[:,1]
commercial_df.head()

Unnamed: 0,HS_CODE,COMMODITY_DESC,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
0,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,1,46682,US Import,QTY
1,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,2,37354,US Import,QTY
2,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,3,26080,US Import,QTY
3,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,4,38737,US Import,QTY
4,302110000.0,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",1,World,"US Import, QTY",KG,1989,5,25799,US Import,QTY


## Clean Temperature Data
- Split Date into columns
- Aggregate by Month (to match commercial data)

In [5]:
#Split 'dt' column into Year, Month, Day
temp_dates = temp_df.dt.str.split(pat="-", expand=True)
temp_df = pd.merge(left=temp_df, right=temp_dates, how="left", left_index=True, right_index=True)
temp_df.rename(columns={0:"YEAR_ID", 1:"MONTH_ID", 2:"DAY_ID"}, inplace=True)
temp_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,YEAR_ID,MONTH_ID,DAY_ID
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,1,1
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,2,1
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,3,1
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,4,1
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W,1849,5,1


In [6]:
# Aggregate by year/month/country
temp_gb = temp_df.groupby(by=["Country", "YEAR_ID", "MONTH_ID"])
agg_temp_df = temp_gb["AverageTemperature"].mean()
agg_temp_df = agg_temp_df.reset_index()
agg_temp_df.head()

Unnamed: 0,Country,YEAR_ID,MONTH_ID,AverageTemperature
0,Afghanistan,1833,1,2.29
1,Afghanistan,1833,2,3.319
2,Afghanistan,1833,3,7.444
3,Afghanistan,1833,4,13.576
4,Afghanistan,1833,5,19.321


## Align Country Names

In [7]:
# Let's make sets of the country names
countries_com = set(commercial_df.GEOGRAPHY_DESC)
countries_temp = set(temp_df.Country)
print(f"There are {len(countries_com)} country names in the commercial dataset")
print(f"There are {len(countries_temp)} country names in the temperature dataset")

There are 225 country names in the commercial dataset
There are 49 country names in the temperature dataset


In [8]:
#Lets see how many of the countries are in both datasets
countries_inter = countries_com.intersection(countries_temp)
print(f"There are {len(countries_inter)} country names that appear in both datasets")

There are 39 country names that appear in both datasets


In [9]:
# Since most of the temp countries fit inside of the commercial, let's see which ones do not
print("The following country names from the temperature data could not be found in the commercial data:")
print_statement = [print(each) for each in countries_temp.difference(countries_com)]

The following country names from the temperature data could not be found in the commercial data:
Syria
Côte D'Ivoire
Germany
South Africa
Burma
China
Congo (Democratic Republic Of The)
Taiwan
United States
Sudan


In [10]:
# We are going to rename the commercial_df to match the temp_df. We could go the other direction, but... 
# The temperature data labels make more sense
country_mapper = { 
    "Myanmar (Burma)" : "Burma",
    "China (Mainland)" : "China",
    "China (Taiwan)" : "Taiwan",
    "Congo (Kinshasa)" : "Congo (Democratic Republic Of The)",
    "German Democratic Republic" : "Germany",
    "Germany, Fed. Republic" : "Germany",
    "Ivory Coast" : "Côte D'Ivoire",
    "Republic of South Africa" : "South Africa",
}
commercial_df.GEOGRAPHY_DESC.replace(country_mapper, inplace=True)

Interestingly, there were no aquacultural exports to Sudan or Syria during this time

In [11]:
# And now the only countries that remain unmatched...
countries_com = set(commercial_df.GEOGRAPHY_DESC)
countries_temp = set(temp_df.Country)
print("These Countries could not be matched:")
print_statement = [print(each) for each in countries_temp.difference(countries_com)]

These Countries could not be matched:
United States
Syria
Sudan


There is just one issue remaining. We mapped two countries (The Germanies) onto one country name. We should also map their 'GEOGRAPHY_CODE' onto one ID as well. Let's see what the IDs are for Germany:

In [12]:
commercial_df[commercial_df.GEOGRAPHY_DESC == "Germany"]["GEOGRAPHY_CODE"].value_counts()

4280    9574
4290       8
Name: GEOGRAPHY_CODE, dtype: int64

It looks like there are only a few trades with '4290' code. This makes sense because the 'German Democratic Republic' was dissolved about a year into our commercial dataset. Let's check our intuitions...

In [13]:
commercial_df[commercial_df.GEOGRAPHY_CODE == 4290]

Unnamed: 0,HS_CODE,COMMODITY_DESC,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
319090,306130020.0,"SHRIMPS AND PRAWNS, SHELL-ON, FROZEN",4290,Germany,"US Import, QTY",KG,1989,11,16000,US Import,QTY
319091,306130020.0,"SHRIMPS AND PRAWNS, SHELL-ON, FROZEN",4290,Germany,"US Import, VLU",U.S.$,1989,11,122171,US Import,VLU
319092,,"Shrimp, frozen",4290,Germany,"US Import, QTY",KG,1989,11,16000,US Import,QTY
319093,,"Shrimp, frozen",4290,Germany,"US Import, VLU",U.S.$,1989,11,122171,US Import,VLU
319094,,"Shrimp, total",4290,Germany,"US Import, QTY",KG,1989,11,16000,US Import,QTY
319095,,"Shrimp, total",4290,Germany,"US Import, VLU",U.S.$,1989,11,122171,US Import,VLU
240232,301100000.0,"FISH, ORNAMENTAL, LIVE",4290,Germany,"US Export, VLU",U.S.$,1989,1,2501,US Export,VLU
240233,,Ornamental fish,4290,Germany,"US Export, VLU",U.S.$,1989,1,2501,US Export,VLU


And there we go! Isn't history fun? Ok let's get back to the Data Cleaning. We are just going to map those few communist Germans into their new united Germany

In [14]:
commercial_df.GEOGRAPHY_CODE.replace({4290 : 4280} , inplace=True)

In [15]:
commercial_df[commercial_df.GEOGRAPHY_DESC == "Germany"]["GEOGRAPHY_CODE"].value_counts()

4280    9582
Name: GEOGRAPHY_CODE, dtype: int64

## Add Fish Type

In [16]:
# Define separate df
niraj_df = commercial_df

# Pull COMMODITY_DESC into list
unique_commodity_list = niraj_df.COMMODITY_DESC.unique().tolist()

# Define fish types
fish_list = ["Salmon", "Clam", "Shrimp", "Mussel", "Oyster", "Trout", "Scallop", "Ornamental", "Tilapia", "Carp"]


In [17]:
# Create empty dictionary
commodity_dict = {}

# Iterate through unique list and assign fish value
for commodity in unique_commodity_list:
    x=0
    while x < len(fish_list):
        if fish_list[x] in commodity:
            commodity_dict[commodity] = fish_list[x]
            x = x + len(fish_list)
        elif fish_list[x].upper() in commodity:
            commodity_dict[commodity] = fish_list[x]
            x = x + len(fish_list)
        elif fish_list[x].lower() in commodity:        
            commodity_dict[commodity] = fish_list[x]
            x = x + len(fish_list)
        else:
            x=x+1 

In [18]:
# Pull COMMODITY_DESC from commercial_df. Turn into list
all_commodity = niraj_df.COMMODITY_DESC.tolist()
fish = []

# This is a list of descriptions with issues
problem_desc = ['PINK (HUMPIE)SALMN WHOLE/PIECES IN OIL/AIRTIT CNTR']

# Iterate through commodities and assign fish value
for commodity in all_commodity:
    for problem in problem_desc:
        if commodity == problem:
            fish.append("Salmon")
        else:
            append = commodity_dict[commodity]
            fish.append(append)

In [19]:
# DO NOT RUN THIS CELL MORE THAN ONCE!

# Insert new column
niraj_df.insert(2, "FISH_TYPE", fish)


In [20]:
niraj_df

Unnamed: 0,HS_CODE,COMMODITY_DESC,FISH_TYPE,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
0,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,1,46682,US Import,QTY
1,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,2,37354,US Import,QTY
2,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,3,26080,US Import,QTY
3,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,4,38737,US Import,QTY
4,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,5,25799,US Import,QTY
...,...,...,...,...,...,...,...,...,...,...,...,...
368422,1.605520e+09,"SCALLOPS, INCLUDING QUEEN SCALLOPS, PREPARED O...",Scallop,2777,Curaco,"US Export, VLU",U.S.$,2014,7,5100,US Export,VLU
368423,1.605520e+09,"SCALLOPS, INCLUDING QUEEN SCALLOPS, PREPARED O...",Scallop,2777,Curaco,"US Export, VLU",U.S.$,2014,8,5100,US Export,VLU
368424,3.011100e+08,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental,2777,Curaco,"US Export, VLU",U.S.$,2012,11,4460,US Export,VLU
368425,3.011100e+08,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental,2777,Curaco,"US Export, VLU",U.S.$,2012,12,4139,US Export,VLU


## Aggregation

In [30]:
#Grouping by Country, Import/Export and QTY/VLU
country_df = niraj_df.groupby(['GEOGRAPHY_CODE', 'GEOGRAPHY_DESC']).agg({'AMOUNT': 'sum'})
country_df = country_df.drop(columns=["AMOUNT"])
country_df = country_df.rename(columns={"GEOGRAPHY_CODE":"country_code", "GEOGRAPHY_DESC": "country_name"})
country_df

GEOGRAPHY_CODE,GEOGRAPHY_DESC
1,World
1010,Greenland
1220,Canada
1610,St Pierre and Miquelon
2010,Mexico
...,...
7940,Zambia
7950,Swaziland
7960,Zimbabwe
7970,Malawi


In [27]:
country_df["GEOGRAPHY_CODE"] = country_df["GEOGRAPHY_CODE"].astype(int)

In [28]:
#Grouping by country, year, and ATTRIBUTE_DESC
imports_year_df = imports_df.groupby(["GEOGRAPHY_DESC", "YEAR_ID", "ATTRIBUTE_DESC"]).agg({'AMOUNT': 'sum', 'GEOGRAPHY_CODE':'mean'})

In [31]:
commercial_df

Unnamed: 0,HS_CODE,COMMODITY_DESC,FISH_TYPE,GEOGRAPHY_CODE,GEOGRAPHY_DESC,ATTRIBUTE_DESC,UNIT_DESC,YEAR_ID,TIMEPERIOD_ID,AMOUNT,Direction,Measure
0,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,1,46682,US Import,QTY
1,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,2,37354,US Import,QTY
2,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,3,26080,US Import,QTY
3,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,4,38737,US Import,QTY
4,3.021100e+08,"TROUT (SALMO TRUTTA, S. CLARKI ETC) FRESH, CHI...",Trout,1,World,"US Import, QTY",KG,1989,5,25799,US Import,QTY
...,...,...,...,...,...,...,...,...,...,...,...,...
368422,1.605520e+09,"SCALLOPS, INCLUDING QUEEN SCALLOPS, PREPARED O...",Scallop,2777,Curaco,"US Export, VLU",U.S.$,2014,7,5100,US Export,VLU
368423,1.605520e+09,"SCALLOPS, INCLUDING QUEEN SCALLOPS, PREPARED O...",Scallop,2777,Curaco,"US Export, VLU",U.S.$,2014,8,5100,US Export,VLU
368424,3.011100e+08,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental,2777,Curaco,"US Export, VLU",U.S.$,2012,11,4460,US Export,VLU
368425,3.011100e+08,"ORNAMENTAL FRESHWATER FISH, LIVE",Ornamental,2777,Curaco,"US Export, VLU",U.S.$,2012,12,4139,US Export,VLU


In [33]:
fish_type_df = commercial_df.groupby(by=["HS_CODE", FISH_TYPE","ATTRIBUTE_DESC"]).agg({'AMOUNT': 'sum'})
fish_type_df

Unnamed: 0_level_0,Unnamed: 1_level_0,AMOUNT
FISH_TYPE,ATTRIBUTE_DESC,Unnamed: 2_level_1
Carp,"US Import, VLU",142815510
Clam,"US Export, QTY",459988088
Clam,"US Export, VLU",3662031580
Clam,"US Import, QTY",1540970468
Clam,"US Import, VLU",5154108476
Mussel,"US Export, QTY",72270536
Mussel,"US Export, VLU",220520608
Mussel,"US Import, QTY",2008522272
Mussel,"US Import, VLU",5698530844
Ornamental,"US Export, VLU",1179998528
