#### This file cleans raw data and generate processed data files for further analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator 

In [2]:
# Data import 
trade = pd.read_csv('/Users/Sabrina/Documents/OneDrive/Github/crop-trends/data/raw/Trade_DetailedTradeMatrix_E_All_Data_NOFLAG.csv',
                   encoding='latin1')


## 1. Inspect general information of the dataset

In [4]:
print(trade.shape) # total rows and columns 

(5248354, 41)


In [5]:
trade.head() # time series between years 1986 and 2017 on export and import agricultural commodities.

Unnamed: 0,Reporter Country Code,Reporter Countries,Partner Country Code,Partner Countries,Item Code,Item,Element Code,Element,Unit,Y1986,...,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017
0,2,Afghanistan,4,Algeria,230,"Cashew nuts, shelled",5910,Export Quantity,tonnes,,...,,,,,,,,,3.0,
1,2,Afghanistan,4,Algeria,230,"Cashew nuts, shelled",5922,Export Value,1000 US$,,...,,,,,,,,,23.0,
2,2,Afghanistan,4,Algeria,1293,Crude materials,5922,Export Value,1000 US$,,...,,,,,,,,1.0,1.0,5.0
3,2,Afghanistan,4,Algeria,561,Raisins,5910,Export Quantity,tonnes,,...,,,,,,,12.0,,,
4,2,Afghanistan,4,Algeria,561,Raisins,5922,Export Value,1000 US$,,...,,,,,,,27.0,,,


In [11]:
# Identify total number of reporter countries, partner countries and unique items
print('Number of unique items traded:', len(trade.Item.unique()))
print('Number of Reporter Countries:', len(trade['Reporter Countries'].unique()))
print('Number of Partner Countries:', len(trade['Partner Countries'].unique()))
print('Entries with Unspecified Area as partner countries:',
      len(trade.loc[trade['Partner Countries']=='Unspecified Area',:]))

Number of unique items traded: 424
Number of Reporter Countries: 184
Number of Partner Countries: 255
Entries with Unspecified Area as partner countries: 86692


 ### Observations
 - There are many missing data (no trading for a particular year with a country, or data missing)
 - 'Partner Countries' include entries of 'Unspecified Area', which should be noted when calculating diversity of trading partners. 
 
 We are not interested in data of a particular item between two specific countries. Instead, we are looking for the number of items(i.e., trading diversity) and their total amount a country export/import, and the number of partner countries each reporter country have for an item.   
 Therefore, we shall aggregate the dataset based on items and reporter countries, respectively. 

## 2. Aggregate data according to items and partner countries

#### 2.1 Sum up trading volumn for reporter countries by items

In [12]:
items_by_country = trade.groupby(['Reporter Countries','Item','Element','Unit']).sum()

In [20]:
items_by_country = items_by_country.drop(['Reporter Country Code','Partner Country Code','Element Code'],axis=1)

In [21]:
items_by_country.shape

(223049, 33)

In [23]:
items_by_country = items_by_country.reset_index()

In [26]:
items_by_country.head()

Unnamed: 0,Reporter Countries,Item,Element,Unit,Item Code,Y1986,Y1987,Y1988,Y1989,Y1990,...,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017
0,Afghanistan,Almonds shelled,Export Quantity,tonnes,5313,0.0,0.0,0.0,0.0,0.0,...,0.0,4763.0,1308.0,2261.0,0.0,0.0,2714.0,2086.0,1778.0,2756.0
1,Afghanistan,Almonds shelled,Export Value,1000 US$,5313,0.0,0.0,0.0,0.0,0.0,...,0.0,35476.0,15894.0,20270.0,0.0,0.0,16454.0,12793.0,10934.0,19677.0
2,Afghanistan,Almonds shelled,Import Quantity,tonnes,1617,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,168.0,181.0,846.0,103.0
3,Afghanistan,Almonds shelled,Import Value,1000 US$,1617,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1117.0,1377.0,4988.0,759.0
4,Afghanistan,"Almonds, with shell",Export Quantity,tonnes,3315,0.0,0.0,0.0,0.0,0.0,...,0.0,11066.0,779.0,1016.0,0.0,0.0,1856.0,1660.0,1545.0,875.0


#### 2.2 Sum up trading volumn for each reporter countries by partner countries

In [31]:
partners_by_country = trade.groupby(['Reporter Countries','Partner Countries','Element','Unit']).sum().reset_index()

In [32]:
# Sum of tonnes, heads, etc across different items do not make much sense, so will be removed
# Only value amount (i.e., US$ will be kept)
partners_by_country = partners_by_country.loc[partners_by_country['Unit']=='1000 US$',:]

In [36]:
partners_by_country = partners_by_country.drop(['Reporter Country Code','Partner Country Code','Item Code','Element Code'],axis=1)

In [37]:
partners_by_country.shape

(57838, 36)

In [38]:
partners_by_country.head()

Unnamed: 0,Reporter Countries,Partner Countries,Element,Unit,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,...,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017
1,Afghanistan,Algeria,Export Value,1000 US$,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,27.0,1.0,24.0,5.0
3,Afghanistan,Angola,Export Value,1000 US$,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,Afghanistan,Argentina,Import Value,1000 US$,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,415.0,0.0,0.0,1163.0,159.0,1844.0,1829.0
7,Afghanistan,Armenia,Export Value,1000 US$,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,Afghanistan,Armenia,Import Value,1000 US$,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,48.0,0.0,0.0,282.0,246.0,0.0,67.0


In [39]:
# Export aggregated data
items_by_country.to_csv('../data/processed/items_by_country.csv')
partners_by_country.to_csv('../data/processed/partners_by_country.csv')