In [1]:
#importing relevant libraries and dataset

import pandas as pd
import numpy as np

dataset = "/Users/alexandreribeiro/Downloads/Data.csv"
data = pd.read_csv(dataset)

df = data.copy()

df.head()

Unnamed: 0,m49_code,country,region,cpc_code,commodity,year,loss_percentage,loss_percentage_original,loss_quantity,activity,food_supply_stage,treatment,cause_of_loss,sample_size,method_data_collection,reference,url,notes
0,104,Myanmar,,23161.02,"Rice, milled",2015,1.78,1.78%,26.12kgs,Storage,Storage,"30 days storage, with trapping",Rodents,,Controlled Experiment,"Dr Steven Belmain (2015), context post-harvest...",,Reference has been generated automatically
1,104,Myanmar,,23161.02,"Rice, milled",2015,11.77,11.77%,88.18kgs,Storage,Storage,"60 days storage, no trapping",Rodents,,Controlled Experiment,"Dr Steven Belmain (2015), context post-harvest...",,Reference has been generated automatically
2,104,Myanmar,,23161.02,"Rice, milled",2015,5.88,5.88%,44.09kgs,Storage,Storage,"30 days storage, no trapping",Rodents,,Controlled Experiment,"Dr Steven Belmain (2015), context post-harvest...",,Reference has been generated automatically
3,104,Myanmar,,23161.02,"Rice, milled",2015,3.57,3.57%,52.24kgs,Storage,Storage,"60 days storage, with trapping",Rodents,,Controlled Experiment,"Dr Steven Belmain (2015), context post-harvest...",,Reference has been generated automatically
4,104,Myanmar,,23161.02,"Rice, milled",2015,17.65,17.65%,132.27kgs,Storage,Storage,"90 days storage, no trapping",Rodents,,Controlled Experiment,"Dr Steven Belmain (2015), context post-harvest...",,Reference has been generated automatically


In [2]:
#Analysing data types

df.dtypes

m49_code                      int64
country                      object
region                       object
cpc_code                     object
commodity                    object
year                          int64
loss_percentage             float64
loss_percentage_original     object
loss_quantity                object
activity                     object
food_supply_stage            object
treatment                    object
cause_of_loss                object
sample_size                  object
method_data_collection       object
reference                    object
url                          object
notes                        object
dtype: object

In [3]:
#Standardize column names

df.columns = df.columns.str.lower().str.replace(' ', '_')

df.columns


Index(['m49_code', 'country', 'region', 'cpc_code', 'commodity', 'year',
       'loss_percentage', 'loss_percentage_original', 'loss_quantity',
       'activity', 'food_supply_stage', 'treatment', 'cause_of_loss',
       'sample_size', 'method_data_collection', 'reference', 'url', 'notes'],
      dtype='object')

In [4]:
#Missing values

df.isnull().sum()

m49_code                        0
country                         0
region                      24202
cpc_code                        0
commodity                       0
year                            0
loss_percentage                 0
loss_percentage_original        0
loss_quantity               24877
activity                     2808
food_supply_stage            3391
treatment                   24096
cause_of_loss               24414
sample_size                 24224
method_data_collection        355
reference                   20303
url                          3293
notes                       23139
dtype: int64

In [5]:
#Drop columns that are not relevant for our analysis

df = df.drop(columns=['notes', 'url', 'reference', 'method_data_collection', 'sample_size', 'cause_of_loss', 'treatment', 'region', 'cpc_code'])

df

Unnamed: 0,m49_code,country,commodity,year,loss_percentage,loss_percentage_original,loss_quantity,activity,food_supply_stage
0,104,Myanmar,"Rice, milled",2015,1.78000,1.78%,26.12kgs,Storage,Storage
1,104,Myanmar,"Rice, milled",2015,11.77000,11.77%,88.18kgs,Storage,Storage
2,104,Myanmar,"Rice, milled",2015,5.88000,5.88%,44.09kgs,Storage,Storage
3,104,Myanmar,"Rice, milled",2015,3.57000,3.57%,52.24kgs,Storage,Storage
4,104,Myanmar,"Rice, milled",2015,17.65000,17.65%,132.27kgs,Storage,Storage
...,...,...,...,...,...,...,...,...,...
25411,894,Zambia,Millet,2000,2.50000,2.5,,Transportation,Farm
25412,894,Zambia,Millet,2000,2.50000,2.5,,Winnowing,Farm
25413,894,Zambia,Millet,2000,2.38075,2.38075,,Storage,Storage
25414,894,Zambia,Millet,2000,3.44008,3.44008,,"Drying, Harvesting",Harvest


In [6]:
#How many rows have both activity and food_supply_stage missing

df[(df['activity'].isnull()) & (df['food_supply_stage'].isnull())].shape[0]

#Drop rows that have both activity and food_supply_stage missing

df = df.dropna(subset=['activity', 'food_supply_stage'], how='all') #dropping 46 rows

#Replace missing values in food_supply_stage with the same rows values on activity

df['food_supply_stage'] = df['food_supply_stage'].fillna(df['activity'])

#Drop activity column

df = df.drop(columns=['activity'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['food_supply_stage'] = df['food_supply_stage'].fillna(df['activity'])


In [7]:
df.isnull().sum()

m49_code                        0
country                         0
commodity                       0
year                            0
loss_percentage                 0
loss_percentage_original        0
loss_quantity               24834
food_supply_stage               0
dtype: int64

In [8]:
#Value count on food_supply_stage

df['food_supply_stage'].value_counts()


food_supply_stage
Farm                   9775
Harvest                3945
Shelling, Threshing    3342
Storage                2970
Transport              1963
Whole supply chain     1776
Retail                  494
Wholesale               293
Processing              283
Households              175
Post-harvest            107
Trader                   79
Export                   50
Market                   35
Food Services            20
Packing                  18
Distribution             17
Pre-harvest              11
Grading                  11
Grading, Sorting          3
Collector                 2
Stacking                  1
Name: count, dtype: int64

In [9]:
#Standardize food_supply_stage values

#Merge 'Wholesale' and 'Retail' into 'Retail and wholesale'

df['food_supply_stage'] = df['food_supply_stage'].replace(['Wholesale', 'Retail', 'Market'], 'Retail, wholesale and market')

#Merge 'Processing' and 'Manufacturing' into 'Processing and manufacturing'

df['food_supply_stage'] = df['food_supply_stage'].replace(['Processing', 'Manufacturing'], 'Processing and manufacturing')

#Change 'Shelling, Threshing' to just 'Shelling'

df['food_supply_stage'] = df['food_supply_stage'].replace('Shelling, Threshing', 'Shelling')

df['food_supply_stage'].value_counts()



food_supply_stage
Farm                            9775
Harvest                         3945
Shelling                        3342
Storage                         2970
Transport                       1963
Whole supply chain              1776
Retail, wholesale and market     822
Processing and manufacturing     283
Households                       175
Post-harvest                     107
Trader                            79
Export                            50
Food Services                     20
Packing                           18
Distribution                      17
Pre-harvest                       11
Grading                           11
Grading, Sorting                   3
Collector                          2
Stacking                           1
Name: count, dtype: int64