In [57]:
import pandas as pd
import numpy as np
import country_converter as coco
import re
import pandas_profiling as pp
import pymysql
from sqlalchemy import create_engine
import getpass
import math
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [58]:
pd.set_option('display.max_rows', 2000)

In [59]:
password = getpass.getpass()

········


### Importing data from SQL

In [60]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/p5'
engine = create_engine(connection_string)

In [61]:
query = '''SELECT i.continent, i.region, p.iso3, i.country_name, i.indicator_id, i.value, p.democ, p.autoc, p.polity, id.indicator_name
FROM in17 i
    LEFT JOIN p5_score p
        ON i.country_name = p.country
    JOIN indicator id
        USING (indicator_id)'''

data = pd.read_sql_query(query, engine)
data.head()

Unnamed: 0,continent,region,iso3,country_name,indicator_id,value,democ,autoc,polity,indicator_name
0,Africa,Eastern Africa,BDI,Burundi,1,0.0,2.0,3.0,-1.0,European Union Membership (True/False)
1,Africa,Eastern Africa,BDI,Burundi,2,1.0,2.0,3.0,-1.0,United Nations Membership (True/False)
2,Africa,Eastern Africa,BDI,Burundi,3,0.0,2.0,3.0,-1.0,OECD Membership (True/False)
3,Africa,Eastern Africa,BDI,Burundi,21806,439.3,2.0,3.0,-1.0,Refugees by country of origin (thousands)
4,Africa,Eastern Africa,BDI,Burundi,23806,9.3,2.0,3.0,-1.0,Population with at least some secondary educat...


### Getting information to fit the model

In [62]:
data2 = data.dropna(axis = 0)

In [63]:
p5 = data2[['country_name', 'value', 'polity', 'region', 'indicator_id']].copy()

In [64]:
p5['indicator_id'] = p5['indicator_id'].astype('str')

In [65]:
p5p = p5.pivot(index = ['country_name', 'region', 'polity'], columns='indicator_id', values = 'value').reset_index()

In [66]:
p5p.isna().sum()

indicator_id
country_name      0
region            0
polity            0
1                 0
100806            1
101006           67
101406           42
101606           13
101706           22
101806            0
102006           67
103006            2
103206            0
103606            2
103706            2
110806            0
110906            0
111106           11
111306           16
117806           67
117906           67
118006           67
120606            0
121106            0
121206            0
122006            0
123306            7
123406            7
123506            2
123606            2
123806           14
127606           13
128106            8
128306            4
132706            0
132806            0
133006           25
133206            4
135006           23
135106           23
135206           23
136706            6
136906           12
137006           12
137506            2
137906           12
138806           23
140606            0
141706            2
142506 

In [67]:
p5p.dropna(axis = 1, inplace = True)

In [74]:
p5p.isna().sum()

indicator_id
country_name    0
region          0
polity          0
1               0
101806          0
103206          0
110806          0
110906          0
120606          0
121106          0
121206          0
122006          0
132706          0
132806          0
140606          0
148206          0
148306          0
150606          0
150706          0
163906          0
169706          0
169806          0
175106          0
177206          0
181606          0
182206          0
2               0
21806           0
27706           0
3               0
31706           0
36806           0
43006           0
44206           0
45106           0
46006           0
47906           0
48706           0
48806           0
49006           0
57206           0
57506           0
63106           0
64306           0
64406           0
69206           0
71506           0
89006           0
dtype: int64

In [42]:
X = p5p.drop(['polity', 'country_name'],axis=1)
y = p5p['polity']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [44]:
numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train = X_train.select_dtypes(object)
categoricals_test = X_test.select_dtypes(object)

In [46]:
transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

In [47]:
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

In [48]:
X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

In [107]:
LR = LogisticRegression(random_state=42, solver='saga', max_iter = 10000)
LR.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=42, solver='saga')

In [108]:
LR.score(X_test, y_test)

0.23809523809523808

In [109]:
pred = LR.predict(X_test)
pred

array([-2,  9, -8, -6,  6,  7,  9,  6,  8, 10, -7, 10, -7,  4, -3, 10,  7,
        8,  8,  9, -3,  7,  6,  7,  8, -4,  7, 10,  9,  7, -3, 10,  6, -3,
       10,  8, -6, -2, 10, 10, 10,  9], dtype=int64)

In [113]:
KNN = KNeighborsRegressor(n_neighbors=5)
KNN.fit(X_train, y_train)

KNeighborsRegressor()

In [114]:
score = KNN.score(X_test, y_test)
score

0.19955852867599377