### Set up notebook and imports

In [1]:
import pandas as pd


In [4]:
#import dataset
emissions_data = pd.read_csv("Emissions_data.csv")
emissions_data

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature,gdp_per_capita,co2_per_capita,co2_per_unit_gdp,emission_ratio
0,Afghanistan,1950,0.084,7752000.0,1.949480e+10,13.043500,2514.808999,0.108359,4.308841,low
1,Afghanistan,1951,0.092,7840000.0,2.006385e+10,13.967750,2559.164343,0.117347,4.585362,low
2,Afghanistan,1952,0.092,7936000.0,2.074235e+10,14.175417,2613.703484,0.115927,4.435370,low
3,Afghanistan,1953,0.106,8040000.0,2.201546e+10,14.650750,2738.241719,0.131841,4.814798,low
4,Afghanistan,1954,0.106,8151000.0,2.248333e+10,13.691333,2758.352230,0.130045,4.714604,low
...,...,...,...,...,...,...,...,...,...,...
11730,Zimbabwe,2009,5.519,12527000.0,1.514130e+10,21.377250,1208.692995,4.405684,364.499814,high
11731,Zimbabwe,2010,7.707,12698000.0,1.660410e+10,21.986250,1307.615340,6.069460,464.162478,high
11732,Zimbabwe,2011,9.498,12894000.0,1.830726e+10,21.602417,1419.827859,7.366217,518.810558,high
11733,Zimbabwe,2012,7.695,13115000.0,2.048226e+10,21.521333,1561.743118,5.867327,375.690946,high


### Preprocessing data for modeling
In this section I worked to encode the categorical data so that it works for the model.  I used the Label Encoder method from the scikit learn library to encode any features where the data was a string, the 'country' and 'emission_ratio' columns.  The data was changed from a string to a number for each unique entry.

In [5]:
# encoding categorical data

from sklearn.preprocessing import LabelEncoder

#create labelencoder instance
le = LabelEncoder()

#encode data
emissions_data['country'] = le.fit_transform(emissions_data['country'])

emissions_data

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature,gdp_per_capita,co2_per_capita,co2_per_unit_gdp,emission_ratio
0,0,1950,0.084,7752000.0,1.949480e+10,13.043500,2514.808999,0.108359,4.308841,low
1,0,1951,0.092,7840000.0,2.006385e+10,13.967750,2559.164343,0.117347,4.585362,low
2,0,1952,0.092,7936000.0,2.074235e+10,14.175417,2613.703484,0.115927,4.435370,low
3,0,1953,0.106,8040000.0,2.201546e+10,14.650750,2738.241719,0.131841,4.814798,low
4,0,1954,0.106,8151000.0,2.248333e+10,13.691333,2758.352230,0.130045,4.714604,low
...,...,...,...,...,...,...,...,...,...,...
11730,153,2009,5.519,12527000.0,1.514130e+10,21.377250,1208.692995,4.405684,364.499814,high
11731,153,2010,7.707,12698000.0,1.660410e+10,21.986250,1307.615340,6.069460,464.162478,high
11732,153,2011,9.498,12894000.0,1.830726e+10,21.602417,1419.827859,7.366217,518.810558,high
11733,153,2012,7.695,13115000.0,2.048226e+10,21.521333,1561.743118,5.867327,375.690946,high


In [10]:
# encode target feature
from sklearn.preprocessing import LabelEncoder

#create labelencoder instance
le = LabelEncoder()

#encode data
emissions_data['emission_ratio'] = le.fit_transform(emissions_data['emission_ratio'])
emissions_data

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature,gdp_per_capita,co2_per_capita,co2_per_unit_gdp,emission_ratio
0,0,1950,0.084,7752000.0,1.949480e+10,13.043500,2514.808999,0.108359,4.308841,1
1,0,1951,0.092,7840000.0,2.006385e+10,13.967750,2559.164343,0.117347,4.585362,1
2,0,1952,0.092,7936000.0,2.074235e+10,14.175417,2613.703484,0.115927,4.435370,1
3,0,1953,0.106,8040000.0,2.201546e+10,14.650750,2738.241719,0.131841,4.814798,1
4,0,1954,0.106,8151000.0,2.248333e+10,13.691333,2758.352230,0.130045,4.714604,1
...,...,...,...,...,...,...,...,...,...,...
11730,153,2009,5.519,12527000.0,1.514130e+10,21.377250,1208.692995,4.405684,364.499814,0
11731,153,2010,7.707,12698000.0,1.660410e+10,21.986250,1307.615340,6.069460,464.162478,0
11732,153,2011,9.498,12894000.0,1.830726e+10,21.602417,1419.827859,7.366217,518.810558,0
11733,153,2012,7.695,13115000.0,2.048226e+10,21.521333,1561.743118,5.867327,375.690946,0


### Feature Selection
The X variable, or independent variable, holds all the features that effect the y variable, or dependent variable,  'emission_ratio.'  The X features include 'country', 'year', 'co2', 'population', 'gdp', 'AverageTemperature', 'gdp_per_capita', 'co2_per_capita', and 'co2_per_unit_gdp'.  The emission_ratio indicates the relationship for each country between gdp output and co2 emissions per person.  The emission_ratio is effected for each country by each feature held in the X variable.

In [13]:
# define the feature set
X = emissions_data.drop('emission_ratio', axis=1)
y = emissions_data['emission_ratio']

### Split data into training and testing
The data was split into training and testing sets using the train_test_split method from the scikit learn library.

In [14]:
# the dataset is split into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

### Model choice and testing
We chose a logistic regression for our first attempt, since our dependent variable is a binary of high and low, and logistic regression works well with binary classifications.  A logistic regression is easy to implement and train, fast, and not inclined to over-fitting.  A disadvantage of this model is that it is difficult to display complex relationships, so nuanced influences on the dependent variable may not be detectable with this model. 

In [15]:
# create the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [16]:
# next step is to create predictions and assemble the results into a Pandas DataFrame
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
6451,0,0
9135,0,0
1880,0,0
7545,0,0
6627,0,0
...,...,...
5289,0,0
3202,0,0
10723,0,0
5098,0,0


### Validate the model with test data
This quick model shows an accuracy rating of about 99% meaning its very good at predicting if a country is a polluter or not.

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.994546693933197