In [55]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split

# Load the cleaned dataset (path is relative to the notebook's working directory)
DATA_FILE = 'cleaned_data_COGS118A.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first rows; as the cell's last expression it is rendered as a table
df.head()

Unnamed: 0.1,Unnamed: 0,Entity,Code,Continent_Code,Year,Number of executions (Amnesty International),Meningitis,Neoplasms,"Fire, heat, and hot substances",Malaria,...,Protein-energy malnutrition,Terrorism (deaths),Cardiovascular diseases,Chronic kidney disease,Chronic respiratory diseases,Cirrhosis and other chronic liver diseases,Digestive diseases,Acute hepatitis,Alzheimer's disease and other dementias,Parkinson's disease
0,0,Afghanistan,AFG,AS,2007,15,2933.0,15925.0,481.0,393.0,...,2439.0,1199.0,53962.0,4490.0,7222.0,3346.0,6458.0,3437.0,1402.0,450.0
1,1,Afghanistan,AFG,AS,2008,17,2731.0,16148.0,462.0,255.0,...,2231.0,1092.0,54051.0,4534.0,7143.0,3316.0,6408.0,3005.0,1424.0,455.0
2,2,Afghanistan,AFG,AS,2009,0,2460.0,16383.0,448.0,239.0,...,1998.0,1065.0,53964.0,4597.0,7045.0,3291.0,6358.0,2663.0,1449.0,460.0
3,3,Afghanistan,AFG,AS,2011,2,2327.0,17094.0,448.0,390.0,...,1805.0,1525.0,54347.0,4785.0,6916.0,3318.0,6370.0,2365.0,1508.0,473.0
4,4,Afghanistan,AFG,AS,2012,14,2254.0,17522.0,445.0,94.0,...,1667.0,3521.0,54868.0,4846.0,6878.0,3353.0,6398.0,2264.0,1544.0,482.0


In [56]:
# Label (integer) encoding of the continent codes. Note this is NOT one-hot
# encoding: each continent becomes a single integer class label.
# 0 represents Asia | 1 represents Europe | 2 represents North America
# 3 represents South America | 4 represents Africa | 5 represents Oceania
continent_labels = {'AS': 0, 'EU': 1, 'SA': 3, 'AF': 4, 'OC': 5}

# North America's code 'NA' is presumably parsed as a missing value by
# pandas.read_csv's default NA handling, which is why it is recovered with
# fillna(2) instead of a string replacement -- TODO confirm against the CSV.
# The fill is restricted to Continent_Code so that missing values in the
# feature columns are not silently overwritten with the North America label.
df['Continent_Code'] = df['Continent_Code'].replace(continent_labels).fillna(2)

# Discard rows whose continent code is the literal string '0'
# (presumably a placeholder for "no continent" -- verify against the cleaning step).
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of the original.
df = df[df['Continent_Code'] != '0'].copy()

# Converting strings to floats: the censored counts '>1000' and '>1' are
# replaced by their lower bounds before the numeric cast.
executions_col = 'Number of executions (Amnesty International)'
df[executions_col] = (
    df[executions_col]
    .replace({'>1000': 1000, '>1': 1})
    .astype(float)
)

In [57]:
# Training and testing sets

# Select the feature columns BEFORE splitting. Calling drop(inplace=True) on
# the frames returned by train_test_split raises pandas' SettingWithCopyWarning
# because those frames are slices of df.
# The first five columns are removed by position; per the df.head() header these
# are the two saved index columns, Entity, Code and the label Continent_Code.
# NOTE(review): 'Year' is the sixth column in that header and is therefore kept
# as a feature -- confirm whether it should be excluded as well.
feature_df = df.drop(columns=df.columns[[0, 1, 2, 3, 4]])
labels = df.loc[:, "Continent_Code"]

# Guard against label leakage if the column order of the CSV ever changes.
assert "Continent_Code" not in feature_df.columns, "label column leaked into the features"

# Stratified 75/25 split with a fixed seed. The rows selected depend only on the
# number of samples, the stratify labels and random_state, so this yields the
# same partition as splitting the full frame and dropping columns afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42,
)

# Converting dataframes to arrays; labels are cast to plain integers
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy().astype('int')
y_test = y_test.to_numpy().astype('int')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [58]:
# Inspect the training feature matrix (a NumPy array after to_numpy())
X_train

array([[0.00000e+00, 1.40000e+01, 1.11200e+03, ..., 3.40000e+01,
        1.47000e+02, 5.40000e+01],
       [0.00000e+00, 1.04000e+03, 1.11493e+05, ..., 1.95000e+02,
        2.03800e+04, 3.91200e+03],
       [0.00000e+00, 0.00000e+00, 2.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [0.00000e+00, 7.70000e+01, 2.26000e+02, ..., 3.50000e+01,
        2.10000e+01, 7.00000e+00],
       [0.00000e+00, 8.30000e+01, 4.96330e+04, ..., 7.00000e+00,
        6.81800e+03, 1.84500e+03],
       [4.00000e+00, 5.20000e+01, 1.90250e+04, ..., 2.10000e+01,
        3.47400e+03, 6.40000e+02]])

In [59]:
# Inspect the integer continent labels of the training set
y_train

array([0, 0, 5, ..., 0, 1, 1])

In [60]:
# Inspect the test feature matrix (a NumPy array after to_numpy())
X_test

array([[0.0000e+00, 1.4700e+02, 1.5439e+04, ..., 1.2000e+01, 2.2900e+03,
        5.5200e+02],
       [0.0000e+00, 1.1350e+03, 2.3730e+03, ..., 1.1800e+02, 1.3500e+02,
        4.6000e+01],
       [0.0000e+00, 1.6000e+02, 4.0990e+04, ..., 6.0000e+00, 5.9100e+03,
        1.2790e+03],
       ...,
       [0.0000e+00, 2.0000e+00, 1.8400e+02, ..., 0.0000e+00, 2.1000e+01,
        7.0000e+00],
       [0.0000e+00, 7.0000e+00, 1.4100e+02, ..., 3.0000e+00, 1.1000e+01,
        7.0000e+00],
       [0.0000e+00, 3.8300e+02, 4.8033e+04, ..., 1.3000e+01, 1.3056e+04,
        1.8810e+03]])

In [61]:
# Inspect the integer continent labels of the test set
y_test

array([1, 4, 1, ..., 2, 5, 3])

In [71]:
import sklearn.metrics as metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SVC kernels to compare: the linear kernel plus three nonlinear ones
parameters = ['linear', 'poly', 'rbf', 'sigmoid']

# Fit one scaled SVM per kernel and report its accuracy on the held-out test set
for kernel_name in parameters:
    # Features are standardised inside the pipeline before reaching the SVM;
    # SVC handles the multi-class problem with a one-vs-one (OvO) scheme.
    clf = make_pipeline(StandardScaler(), SVC(kernel=kernel_name)).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Kernel name on one line, its test accuracy on the next
    print(kernel_name, metrics.accuracy_score(y_test, y_pred), sep='\n')
    


linear
0.6555337262606418
poly
0.3182711198428291
rbf
0.5376555337262606
sigmoid
0.3785199738048461


In [None]:
# Conclusion
# The most accurate kernel was linear; unfortunately, none of the kernels reached a very high accuracy.
# This may be because continents like North America and Europe share many similarities, since
# both contain many first world countries. As a result, the types of deaths in those two continents
# may be very similar. The same logic can be applied to South America and Africa for third world countries.