In [1]:
import pandas as pd
import numpy as np
import country_converter as coco
import re
import pandas_profiling as pp
import pymysql
from sqlalchemy import create_engine
import getpass
import math
import seaborn as sns
import matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)

In [3]:
password = getpass.getpass()

········


### Importing data from SQL

In [4]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/p5'
engine = create_engine(connection_string)

In [5]:
query = '''SELECT i.continent, i.region, p.iso3, i.country_name, i.indicator_id, i.value, p.democ, p.autoc, p.polity, id.indicator_name, d.dimension
FROM in17 i
    LEFT JOIN p5_score p
        ON i.country_name = p.country
    JOIN indicator id
        USING (indicator_id)
    LEFT JOIN dimensions d
        USING (indicator_id)
GROUP BY country_name, indicator_id
ORDER BY country_name'''

data = pd.read_sql_query(query, engine)
data.head()

Unnamed: 0,continent,region,iso3,country_name,indicator_id,value,democ,autoc,polity,indicator_name,dimension
0,Asia,Southern Asia,AFG,Afghanistan,1,0.0,1.0,2.0,-1.0,European Union Membership (True/False),International Organisations
1,Asia,Southern Asia,AFG,Afghanistan,2,1.0,1.0,2.0,-1.0,United Nations Membership (True/False),International Organisations
2,Asia,Southern Asia,AFG,Afghanistan,3,0.0,1.0,2.0,-1.0,OECD Membership (True/False),International Organisations
3,Asia,Southern Asia,AFG,Afghanistan,21806,2621.1,1.0,2.0,-1.0,Refugees by country of origin (thousands),Human Security
4,Asia,Southern Asia,AFG,Afghanistan,23806,25.1,1.0,2.0,-1.0,Population with at least some secondary educat...,Education


In [37]:
data = data[~data['polity'].isin([-77, -88, -99])]

### Getting information to fit the model

In [38]:
data2 = data.dropna(axis = 0)

In [39]:
p5 = data2[['country_name', 'value', 'polity', 'region', 'indicator_id']].copy()

In [40]:
p5['indicator_id'] = p5['indicator_id'].astype('str')

In [41]:
p5p = p5.pivot(index = ['country_name', 'region', 'polity'], columns='indicator_id', values = 'value').reset_index().copy()

In [42]:
p5p.isna().sum()

indicator_id
country_name      0
region            0
polity            0
1                 0
100806            0
101006           67
101406           40
101606           12
101706           21
101806            0
102006           67
103006            2
110806            0
110906            0
111106            9
111306           15
117806           67
117906           67
118006           67
121206            0
122006            0
123306            7
123406            7
123506            2
123606            2
123806           13
127606           13
128106            8
128306            4
132706            0
132806            0
133006           24
133206            4
135106           22
135206           22
136706            6
140606            0
141706            2
142506           67
143306            6
147206           38
147906           57
148206            0
148306            0
149206           42
150606            0
150706            0
153706           37
164406           19
167106 

### Dropping NaNs:

In [43]:
p5p_clean = p5p.dropna(axis = 1)

### Building the model

In [44]:
X = p5p_clean.drop(['polity', 'country_name'],axis=1)
y = p5p_clean['polity']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.145, random_state = 100)

In [46]:
numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train = X_train.select_dtypes(object)
categoricals_test = X_test.select_dtypes(object)

In [47]:
transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

In [48]:
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

In [49]:
X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

In [50]:
LR = LogisticRegression(random_state=42, solver='saga', max_iter = 10000)
LR.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=42, solver='saga')

In [51]:
LR.score(X_test, y_test)

0.3333333333333333

In [52]:
LR_pred = LR.predict(X_test)
LR_pred

array([  8.,   9.,  -7.,  10.,   6.,   7.,  10.,   7.,  10.,   7.,   7.,
         8.,   7.,   8.,  10.,  10.,   8.,   8.,   8.,   8., -66.,  10.,
         5.,  -7.])

In [53]:
print("precision: ",precision_score(y_test, LR_pred, average= 'weighted'))
print("recall: ",recall_score(y_test, LR_pred, average='weighted'))
print("f1: ",f1_score(y_test, LR_pred, average='weighted'))

precision:  0.2571428571428572
recall:  0.3333333333333333
f1:  0.2766203703703704


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
KNN = KNeighborsRegressor(n_neighbors=5)
KNN.fit(X_train, y_train)
KNN_pred = KNN.predict(X_test)

In [55]:
score = KNN.score(X_test, y_test)
score

0.1924112769485904

### Building the model (without region)

In [56]:
X = p5p_clean.drop(['polity', 'country_name', 'region'],axis=1)
y = p5p_clean['polity']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, 
                                                    random_state = 100)

numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train = X_train.select_dtypes(object)
categoricals_test = X_test.select_dtypes(object)

transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

LR = LogisticRegression(random_state=42, solver='saga', max_iter = 10000)
LR.fit(X_train, y_train)

print('LR score: ', LR.score(X_test, y_test))

LR_pred = LR.predict(X_test)

KNN = KNeighborsRegressor(n_neighbors=3)
KNN.fit(X_train, y_train)

print('KNN score: ', KNN.score(X_test, y_test))

LR score:  0.2727272727272727
KNN score:  -0.26368086147821823


In [57]:
print("precision: ",precision_score(y_test, LR_pred, average='weighted'))
print("recall: ",recall_score(y_test, LR_pred, average='weighted'))
print("f1: ",f1_score(y_test, LR_pred, average='weighted'))

precision:  0.23484848484848486
recall:  0.2727272727272727
f1:  0.2510822510822511


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Replacing NaNs with KNeighbors

In [58]:
def predict(df):
    
    nan = df.isna().sum().sort_values()
    nan = pd.DataFrame(nan).reset_index()
    nan.columns = ['indicator_id', 'nans']
    nans = []
    lst = nan['indicator_id'].to_list()

    for i in range(len(lst)):
        if nan['nans'][i] > 0:
            nans.append(nan['indicator_id'][i])
        
    for i in nans:
        
        num = df.select_dtypes(np.number).dropna(axis = 1)
        num[i] = df[i]
        
        id_with_num = num[num[i].notna()]
        X_id = id_with_num.drop([i],axis=1)
        y_id = id_with_num[i]
        X_normalized_id = pd.DataFrame(MinMaxScaler().fit_transform(X_id), columns=X_id.columns)
        
        LR = LinearRegression()
        LR.fit(X_normalized_id, y_id)
        
        X_all = num.drop([i],axis=1)
        X_normalized_all = pd.DataFrame(MinMaxScaler().fit_transform(X_all), columns=X_all.columns)
        y_all = num[i]
        
        i_pred = pd.Series(LR.predict(X_normalized_all), name='predicted_' + i)
        df = pd.concat([df, i_pred], axis=1)
        
        df['final_' + i] =  np.where(df[i].isna(), df['predicted_' + i], df[i])
        df.drop([i, 'predicted_' + i], axis = 1, inplace = True)
        
    return df

In [59]:
p5p_KNN = predict(p5p)
p5p_KNN.isna().sum()

country_name    0
region          0
polity          0
1               0
100806          0
101806          0
110806          0
110906          0
121206          0
122006          0
132706          0
132806          0
140606          0
148206          0
148306          0
150606          0
150706          0
181606          0
2               0
21806           0
27706           0
3               0
31706           0
36806           0
43006           0
44206           0
45106           0
46006           0
47906           0
48706           0
48806           0
49006           0
63106           0
69206           0
final_69706     0
final_181706    0
final_103006    0
final_123506    0
final_123606    0
final_141706    0
final_53506     0
final_128306    0
final_178306    0
final_133206    0
final_57806     0
final_174406    0
final_57906     0
final_143306    0
final_136706    0
final_123306    0
final_123406    0
final_24206     0
final_24106     0
final_128106    0
final_65606     0
final_6100

In [60]:
Xp = p5p_KNN.drop(['polity', 'country_name', 'region'],axis=1)
yp = p5p_KNN['polity']
X_train, X_test, y_train, y_test = train_test_split(Xp, yp, test_size = 0.145, 
                                                    random_state = 100)
numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train = X_train.select_dtypes(object)
categoricals_test = X_test.select_dtypes(object)

transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

LR = LogisticRegression(random_state=42, solver='saga', max_iter = 10000)
LR.fit(X_train, y_train)

print('LR score: ', LR.score(X_test, y_test))

LR_pred = LR.predict(X_test)

KNN = KNeighborsRegressor(n_neighbors=5)
KNN.fit(X_train, y_train)

print('KNN score: ', KNN.score(X_test, y_test))

LR score:  0.375
KNN score:  -0.28573134328358196


In [61]:
print("precision: ",precision_score(y_test, LR_pred, average='weighted'))
print("recall: ",recall_score(y_test, LR_pred, average='weighted'))
print("f1: ",f1_score(y_test, LR_pred, average='weighted'))

precision:  0.3958333333333333
recall:  0.375
f1:  0.3744949494949495


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
LR_pred = LR.predict(X_test)
y_test = pd.DataFrame(y_test).reset_index(drop = True)
LR_pred = pd.DataFrame(LR_pred).reset_index(drop = True)
pd.concat([y_test, LR_pred], axis = 1)

Unnamed: 0,polity,0
0,7.0,8.0
1,-7.0,4.0
2,6.0,7.0
3,10.0,10.0
4,-1.0,-3.0
5,-7.0,6.0
6,10.0,10.0
7,-4.0,-4.0
8,10.0,10.0
9,9.0,7.0
