# Task 
Predict the Country given the methane emissions from 1990-2018, sector, and gas.

In [72]:
# Imports

import pandas as pd
import numpy as np

In [2]:
# Read the historical methane-emissions table from disk and preview its first five rows
data = pd.read_csv("methane_hist_emissions.csv")
data.head(n=5)

Unnamed: 0,Country,Sector,Gas,Unit,2018,2017,2016,2015,2014,2013,...,1999,1998,1997,1996,1995,1994,1993,1992,1991,1990
0,China,Total including LUCF,CH4,MTCO2e,1238.95,1239.28,1242.43,1237.79,1206.51,1178.21,...,826.57,814.8,792.55,824.63,798.9,774.1,758.01,752.1,743.51,730.78
1,China,Total excluding LUCF,CH4,MTCO2e,1238.63,1239.13,1242.15,1237.52,1206.21,1178.02,...,825.69,813.83,791.73,823.86,798.45,773.65,757.56,751.66,743.07,730.34
2,Russia,Total including LUCF,CH4,MTCO2e,853.0,852.12,856.0,837.01,833.59,827.06,...,604.57,606.5,614.99,634.53,643.52,670.71,728.51,784.68,867.35,933.79
3,Russia,Total excluding LUCF,CH4,MTCO2e,849.57,850.17,852.55,835.56,830.22,825.64,...,598.7,597.72,610.38,622.22,639.73,666.92,724.72,780.89,863.52,929.97
4,China,Energy,CH4,MTCO2e,739.58,741.73,743.88,746.03,723.02,700.01,...,303.13,289.36,275.59,261.82,248.05,234.28,229.86,225.44,221.02,216.6


In [12]:
# Frequency of each level in the categorical 'Sector' column

data.loc[:, 'Sector'].value_counts()

Total including LUCF            194
Total excluding LUCF            194
Agriculture                     194
Land-Use Change and Forestry    194
Other Fuel Combustion           193
Industrial Processes            193
Energy                          192
Fugitive Emissions              192
Waste                           192
Name: Sector, dtype: int64

In [14]:
# Frequency of each level in the categorical 'Gas' column

data.loc[:, 'Gas'].value_counts()

CH4    1738
Name: Gas, dtype: int64

In [18]:
# Frequency of each level in the categorical 'Unit' column

data.loc[:, 'Unit'].value_counts()

MTCO2e    1738
Name: Unit, dtype: int64

In [19]:
# Split the column names into the prediction target and the remaining feature columns

target = 'Country'
features = [name for name in data.columns if name != target]

## One-Hot Encoding

In [16]:
# One-hot encoding: every level of each categorical feature gets its own indicator column;
# numeric (year) columns pass through unchanged.
ohe_df = pd.get_dummies(data.loc[:, features])
print(ohe_df.shape)
ohe_df.head()

(1738, 40)


Unnamed: 0,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,Sector_Energy,Sector_Fugitive Emissions,Sector_Industrial Processes,Sector_Land-Use Change and Forestry,Sector_Other Fuel Combustion,Sector_Total excluding LUCF,Sector_Total including LUCF,Sector_Waste,Gas_CH4,Unit_MTCO2e
0,1238.95,1239.28,1242.43,1237.79,1206.51,1178.21,1151.28,1106.41,1064.2,1019.28,...,0,0,0,0,0,0,1,0,1,1
1,1238.63,1239.13,1242.15,1237.52,1206.21,1178.02,1151.1,1106.19,1063.83,1018.91,...,0,0,0,0,0,1,0,0,1,1
2,853.0,852.12,856.0,837.01,833.59,827.06,827.98,817.7,804.22,776.75,...,0,0,0,0,0,0,1,0,1,1
3,849.57,850.17,852.55,835.56,830.22,825.64,824.27,814.67,801.36,774.47,...,0,0,0,0,0,1,0,0,1,1
4,739.58,741.73,743.88,746.03,723.02,700.01,677.0,635.32,593.65,552.07,...,1,0,0,0,0,0,0,0,1,1


## Dummy Coding

In [87]:
# Dummy coding: like one-hot encoding, but the first level of every categorical
# column is dropped (k levels -> k-1 indicator columns).
dummy_df = pd.get_dummies(data.loc[:, features], drop_first=True)
print(dummy_df.shape)
dummy_df.head()

(1738, 37)


Unnamed: 0,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,1991,1990,Sector_Energy,Sector_Fugitive Emissions,Sector_Industrial Processes,Sector_Land-Use Change and Forestry,Sector_Other Fuel Combustion,Sector_Total excluding LUCF,Sector_Total including LUCF,Sector_Waste
0,1238.95,1239.28,1242.43,1237.79,1206.51,1178.21,1151.28,1106.41,1064.2,1019.28,...,743.51,730.78,0,0,0,0,0,0,1,0
1,1238.63,1239.13,1242.15,1237.52,1206.21,1178.02,1151.1,1106.19,1063.83,1018.91,...,743.07,730.34,0,0,0,0,0,1,0,0
2,853.0,852.12,856.0,837.01,833.59,827.06,827.98,817.7,804.22,776.75,...,867.35,933.79,0,0,0,0,0,0,1,0
3,849.57,850.17,852.55,835.56,830.22,825.64,824.27,814.67,801.36,774.47,...,863.52,929.97,0,0,0,0,0,1,0,0
4,739.58,741.73,743.88,746.03,723.02,700.01,677.0,635.32,593.65,552.07,...,221.02,216.6,1,0,0,0,0,0,0,0


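Dummy coding (setting `drop_first=True` in `pd.get_dummies()`) drops the first level of every categorical column. Because 'Gas' and 'Unit' each contain only a single value, their only indicator column is dropped, so they disappear entirely. 'Sector' loses its first level ('Agriculture'), which becomes the reference category: a row with 0 in all remaining `Sector_*` columns belongs to 'Agriculture'.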

## Effect Coding

In [88]:
# Indicator columns are exactly the ones get_dummies created: present in
# dummy_df but absent from the raw data. This replaces the earlier
# `len(col) > 4` test, which only worked because the year labels happen to be
# four characters long and would misclassify any longer-named numeric column.
dummy_cols = [col for col in dummy_df.columns if col not in data.columns]
dummy_cols

['Sector_Energy',
 'Sector_Fugitive Emissions',
 'Sector_Industrial Processes',
 'Sector_Land-Use Change and Forestry',
 'Sector_Other Fuel Combustion',
 'Sector_Total excluding LUCF',
 'Sector_Total including LUCF',
 'Sector_Waste']

In [89]:
# Work on an independent (deep) copy so the dummy-coded frame stays untouched
effect_coding_df = dummy_df.copy(deep=True)

In [91]:
# Effect coding: rows that belong to the reference category (the level removed
# by drop_first=True) have 0 in every indicator column; recode those rows to -1
# across all indicator columns.

# Indicator columns may be bool or unsigned integers depending on the pandas
# version, and neither can hold -1, so cast them to a signed integer type first.
effect_coding_df = effect_coding_df.astype({col: int for col in dummy_cols})

# Keep the row mask in a local Series rather than a 'negative' helper column,
# so no extra boolean column leaks into the feature matrix.
is_reference = effect_coding_df[dummy_cols].sum(axis=1) == 0

# Assign -1 directly. The previous NaN + frame-wide fillna(-1) detour also
# overwrote any genuinely missing emission values with -1, and relied on the
# np.NaN alias, which was removed in NumPy 2.0.
effect_coding_df.loc[is_reference, dummy_cols] = -1