This notebook conducts clustering analyses on countries based on crop production diversity and trends

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read in processed data
items_by_country = pd.read_csv('../data/processed/items_by_country.csv',index_col=0)
print(items_by_country.shape)

# Put name for years into a column
# The years list is used by the function below when reshaping data frame 
year = items_by_country.columns[5:-1].tolist() # Select year 1986-2017

(223049, 38)


In [3]:
items_by_country.head()

Unnamed: 0,Reporter Countries,Item,Element,Unit,Item Code,Y1986,Y1987,Y1988,Y1989,Y1990,...,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,NoneZero
0,Afghanistan,Almonds shelled,Export Quantity,tonnes,5313,0.0,0.0,0.0,0.0,0.0,...,4763.0,1308.0,2261.0,0.0,0.0,2714.0,2086.0,1778.0,2756.0,7
1,Afghanistan,Almonds shelled,Export Value,1000 US$,5313,0.0,0.0,0.0,0.0,0.0,...,35476.0,15894.0,20270.0,0.0,0.0,16454.0,12793.0,10934.0,19677.0,7
2,Afghanistan,Almonds shelled,Import Quantity,tonnes,1617,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,168.0,181.0,846.0,103.0,4
3,Afghanistan,Almonds shelled,Import Value,1000 US$,1617,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1117.0,1377.0,4988.0,759.0,4
4,Afghanistan,"Almonds, with shell",Export Quantity,tonnes,3315,0.0,0.0,0.0,0.0,0.0,...,11066.0,779.0,1016.0,0.0,0.0,1856.0,1660.0,1545.0,875.0,7


In [131]:
# Select rows with >50% data (16 years)
items_by_country = items_by_country.loc[items_by_country['NoneZero']>=16,:]

In [132]:
items_by_country.shape

(97768, 38)

1. Reshape data so that items are columns 

In [133]:
items_by_country['Item'].unique()

array(['Almonds shelled', 'Almonds, with shell',
       'Animals, live, non-food', 'Anise, badian, fennel, coriander',
       'Apples', 'Apricots', 'Apricots, dry', 'Bacon and ham', 'Bananas',
       'Barley', 'Beans, dry', 'Beans, green', 'Beer of barley',
       'Beeswax', 'Beverages, distilled alcoholic',
       'Beverages, non alcoholic', 'Bran, maize', 'Bran, wheat', 'Bread',
       'Broad beans, horse beans, dry', 'Butter, cow milk',
       'Buttermilk, curdled, acidified milk',
       'Cabbages and other brassicas', 'Cake, soybeans',
       'Cake, sunflower', 'Carrots and turnips', 'Cattle',
       'Cauliflowers and broccoli', 'Cereal preparations nes',
       'Cereals, breakfast', 'Cheese, processed',
       'Cheese, whole cow milk', 'Cherries', 'Chestnut', 'Chick peas',
       'Chickens', 'Chillies and peppers, dry',
       'Chillies and peppers, green', 'Chocolate products nes',
       'Cider etc', 'Cigarettes', 'Cigars, cheroots',
       'Cinnamon (cannella)', 'Cocoa, powder

In [134]:
# Reshape data from wide to long by years 
items_by_country_long = items_by_country.melt(['Reporter Countries','Item','Element'],year,'year','value')

In [135]:
# Convert df to time series
items_by_country_long['year'] = items_by_country_long['year'].map(lambda x: x.lstrip('Y')) # strip Y from year names for easy converting to ts
items_by_country_long.year = pd.to_datetime(items_by_country_long.year)

In [136]:
items_by_country_long.head()

Unnamed: 0,Reporter Countries,Item,Element,year,value
0,Albania,Almonds shelled,Import Quantity,1986-01-01,0.0
1,Albania,Almonds shelled,Import Value,1986-01-01,0.0
2,Albania,"Almonds, with shell",Import Quantity,1986-01-01,0.0
3,Albania,"Almonds, with shell",Import Value,1986-01-01,0.0
4,Albania,"Animals, live, non-food",Export Value,1986-01-01,0.0


In [137]:
# Seperate export, import, quantity and value
items_exportQ = items_by_country_long.loc[items_by_country_long['Element']=='Export Quantity',:]
items_exportV = items_by_country_long.loc[items_by_country_long['Element']=='Export Value',:]

items_importQ = items_by_country_long.loc[items_by_country_long['Element']=='Import Quantity',:]
items_importV = items_by_country_long.loc[items_by_country_long['Element']=='Import Value',:]


In [138]:
items_exportQ.head()

Unnamed: 0,Reporter Countries,Item,Element,year,value
19,Albania,"Beans, dry",Export Quantity,1986-01-01,0.0
23,Albania,"Beans, green",Export Quantity,1986-01-01,0.0
27,Albania,Beer of barley,Export Quantity,1986-01-01,0.0
33,Albania,"Beverages, distilled alcoholic",Export Quantity,1986-01-01,0.0
37,Albania,"Beverages, non alcoholic",Export Quantity,1986-01-01,0.0


In [139]:
items_exportQ.shape

(614496, 5)

In [140]:
# Reshape data from long to wide, by items
# Firstly, work with Export Quantity

items_exportQ_wide = pd.pivot_table(items_exportQ, index=['Reporter Countries','year'],columns='Item',values='value')

In [141]:
items_exportQ_wide = items_exportQ_wide.reset_index()
items_exportQ_wide.year = pd.to_datetime(items_exportQ_wide.year)

In [142]:
items_exportQ_wide.shape

(4160, 384)

In [162]:
country_high = []
country_med = []
country_low = []

for country in items_exportQ_wide['Reporter Countries'].unique():
    
    df = items_exportQ_wide.loc[items_exportQ_wide['Reporter Countries']==country,:].dropna(axis=1)
    
    if df.shape[1] >= 200:
        country_high.append(country)
        #print(country, df.shape[1])
    elif (df.shape[1] >= 100) & (df.shape[1] < 200):
        country_med.append(country)
        #print(country, df.shape[1])
    elif df.shape[1] < 100:
        country_low.append(country)
        #print(country, df.shape[1])


In [177]:
high_div = items_exportQ_wide.loc[items_exportQ_wide['Reporter Countries'].isin(country_high),:].dropna(axis=1)
high_div.year = pd.to_datetime(high_div.year)
med_div = items_exportQ_wide.loc[items_exportQ_wide['Reporter Countries'].isin(country_med),:].dropna(axis=1)
med_div.year = pd.to_datetime(med_div.year)
low_div = items_exportQ_wide.loc[items_exportQ_wide['Reporter Countries'].isin(country_low),:].dropna(axis=1)
low_div.year = pd.to_datetime(low_div.year)

In [178]:
print(high_div.shape, med_div.shape, low_div.shape)

(1472, 66) (1152, 11) (1536, 2)


In [176]:
high_div.year = pd.to_datetime(high_div.year)
high_div.head()


Item,Reporter Countries,year,Apples,"Beans, dry",Beer of barley,"Beverages, distilled alcoholic","Beverages, non alcoholic",Cabbages and other brassicas,Cereal preparations nes,"Cereals, breakfast",...,"Tomatoes, paste",Vegetables in vinegar,"Vegetables, dehydrated","Vegetables, fresh nes","Vegetables, frozen","Vegetables, preserved nes","Vegetables, temporarily preserved",Watermelons,"Waters,ice etc",Wine
64,Argentina,1986-01-01,77959.0,208316.0,1237.0,54.0,0.0,0.0,0.0,0.0,...,0.0,0.0,356.0,0.0,17.0,0.0,0.0,0.0,6.0,3660.0
65,Argentina,1987-01-01,185768.0,139073.0,0.0,207.0,1.0,0.0,357.0,0.0,...,1046.0,0.0,393.0,6.0,0.0,139.0,0.0,0.0,0.0,16336.0
66,Argentina,1988-01-01,183033.0,155440.0,0.0,112.0,111.0,0.0,2239.0,2.0,...,1028.0,160.0,634.0,232.0,0.0,398.0,0.0,0.0,13.0,14870.0
67,Argentina,1989-01-01,200094.0,98602.0,192.0,2877.0,1083.0,0.0,6623.0,0.0,...,18708.0,897.0,767.0,366.0,201.0,771.0,294.0,0.0,17.0,24926.0
68,Argentina,1990-01-01,240303.0,158755.0,102.0,9717.0,282.0,0.0,26191.0,83.0,...,19932.0,825.0,1032.0,435.0,311.0,3871.0,0.0,0.0,155.0,51621.0


In [179]:
extracted_features = extract_features(high_div, column_id="Reporter Countries", column_sort="year")


Feature Extraction: 100%|██████████| 10/10 [01:52<00:00,  9.87s/it]


In [None]:
# add country coordinates and other info

In [182]:
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extract_features)

 'Apples__agg_linear_trend__f_agg_"max"__chunk_len_50__attr_"rvalue"'
 'Apples__agg_linear_trend__f_agg_"max"__chunk_len_50__attr_"slope"' ...
 'Wine__fft_coefficient__coeff_99__attr_"angle"'
 'Wine__fft_coefficient__coeff_99__attr_"imag"'
 'Wine__fft_coefficient__coeff_99__attr_"real"'] did not have any finite values. Filling with zeros.


NameError: name 'select_features' is not defined

In [183]:
from tsfresh import extract_relevant_features
features_filtered_direct = extract_relevant_features(high_div, column_id="Reporter Countries", column_sort="year")

TypeError: extract_relevant_features() missing 1 required positional argument: 'y'

In [128]:
df = items_exportQ_wide.loc[items_exportQ_wide['Reporter Countries']=='Maldives',:].dropna(axis=1)

In [130]:
df

Item,Reporter Countries,year,Beer of barley,"Beverages, distilled alcoholic","Chillies and peppers, dry",Cigarettes,Coconuts,"Coffee, extracts",Crude materials,Dates,"Flour, wheat",Food prep nes,"Fruit, prepared nes",Hides nes,"Milk, whole dried",Nuts nes,"Oil, coconut (copra)","Oil, essential nes","Waters,ice etc",Wine
3296,Maldives,1986-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3297,Maldives,1987-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3298,Maldives,1988-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3299,Maldives,1989-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3300,Maldives,1990-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3301,Maldives,1991-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3302,Maldives,1992-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3303,Maldives,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3304,Maldives,1994-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3305,Maldives,1995-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
from tsfresh import extract_features