In [453]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import metrics

data = pd.read_csv("pvq21CENTRED.csv", sep=',')
lib_dataframe = data.copy()
con_dataframe = data.copy()



Bin the data for each dataframe

In [454]:
def normalise_lib (value):

        if value < 3:
            return "liberal" #liberal
        else:
            return "moderate" #moderate

def normalise_con (value):
        
        if value > 6:
            return "conservative" #conservative
        else:
            return "moderate" #moderate
        

lib_dataframe['lrscale'] = lib_dataframe['lrscale'].apply(normalise_lib)
con_dataframe['lrscale'] = con_dataframe['lrscale'].apply(normalise_con)

Split into features and target

In [455]:
X_lib = lib_dataframe.iloc[:, 2:12].copy()
y_lib = lib_dataframe.iloc[:,1].copy()
#Now the data frame for conservative prediction
X_con = con_dataframe.iloc[:, 2:12].copy()
y_con = con_dataframe.iloc[:,1].copy()

In [456]:
list(y_con).count("conservative")

9391

One Hot encoding 

In [320]:
enc = OneHotEncoder(sparse=False)
y_lib_onehot = enc.fit_transform(y_lib.to_numpy().reshape(-1, 1))
y_con_onehot = enc.fit_transform(y_con.to_numpy().reshape(-1, 1))

Label Encoding instead

In [321]:
label_encoder = LabelEncoder()
y_lib_onehot = label_encoder.fit_transform(y_lib.copy())
y_con_onehot = label_encoder.fit_transform(y_con.copy())

Splitting the data

In [188]:
X_train_lib_oh, X_test_lib_oh, y_train_lib_oh, y_test_lib_oh = train_test_split(X_lib, y_lib_onehot, test_size=0.25, random_state=10)
X_train_con_oh, X_test_con_oh, y_train_con_oh, y_test_con_oh = train_test_split(X_con, y_con_onehot, test_size=0.25, random_state=10)

# Models 

Call this function to perform cross validation to the model

In [31]:
def cv(model):
    #return scores
    pass

### Lasso

In [32]:
# class Lasso():
lasso_LIB_model = Lasso(alpha=1.0)
#fitting the liberal model
lasso_LIB_model.fit(X_train_lib_oh,y_train_lib_oh)

lasso_CON_model = Lasso(alpha=1.0)
lasso_CON_model.fit(X_train_con_oh,y_train_con_oh)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [37]:
lasso_lib_pred = lasso_LIB_model.predict(X_test_lib_oh)
print(lasso_lib_pred)

#enc.inverse_transform([np.argmax(lasso_lib_pred[0, :])])
#print(lasso_lib_pred[0, :])
print("The ground truth labels:")
#print(np.argmax(y_test_lib_oh, axis = 1))
print("Liberal LASSO accuracy: ")
#print(accuracy_score(y_test_lib_oh, lasso_lib_pred))#np.argmax(y_test_lib_oh, axis = 1), np.argmax(lasso_LIB_model.predict(X_test_lib_oh), axis=1)))
print(lasso_LIB_model.coef_) #feature significance
print()
print(lasso_CON_model.predict(X_test_con_oh))
print("The ground truth labels:")
print(y_test_con_oh)

[0.96219289 0.96219289 0.96219289 ... 0.96219289 0.96219289 0.96219289]
The ground truth labels:
Liberal LASSO accuracy: 
[ 0. -0.  0.  0.  0.  0. -0. -0. -0. -0.]

[0.93714705 0.93714705 0.93714705 ... 0.93714705 0.93714705 0.93714705]
The ground truth labels:
[1 1 1 ... 1 0 1]


### Logistic Regression

TIP: One hot encoding does not work well with Logistic Regression; a one hot encoding will case the matrix of input data to become singular, meaning it cannot be inverted and the linear regression coefficients cannot be calculated using linear algebra. For these types of models a dummy variable encoding must be used instead.

So: Label encoding the data for the Logistic Regression (and the Random Forest)

In [57]:
y_lib_cat = y_lib.copy()        #NOT SURE WHAT THIS STEP DOES, MAYBE IT IS REDUNDANT
y_con_cat = y_con.copy()

y_lib_cat = y_lib_cat.astype('category')
y_con_cat = y_con_cat.astype('category')

In [265]:
label_encoder = LabelEncoder()
y_lib_labelencoded = label_encoder.fit_transform(y_lib_cat)
y_con_labelencoded = label_encoder.fit_transform(y_con_cat)

In [266]:
X_train_lib_l, X_test_lib_l, y_train_lib_l, y_test_lib_l = train_test_split(X_lib, y_lib_labelencoded, test_size=0.30, random_state=10)
X_train_con_l, X_test_con_l, y_train_con_l, y_test_con_l = train_test_split(X_con, y_con_labelencoded, test_size=0.30, random_state=10)

In [60]:
logistic_regression_LIB = LogisticRegression(random_state=0)
logistic_regression_LIB.fit(X_train_lib_l, y_train_lib_l)

logistic_regression_CON = LogisticRegression(random_state=0)
logistic_regression_CON.fit(X_train_con_l, y_train_con_l)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,lrscale,universalism,achievement,benevolence,self_direction,stimulation,hedonism,power,security,conformity,tradition,p_avg
0,0,0,1.5,2.5,1.5,3.0,3.5,4.0,4.0,4.5,6.0,4.5,3.5


In [65]:
LR_lib_pred = logistic_regression_LIB.predict(X_test_lib_l)
print("Liberal LR predicts: ",LR_lib_pred )
print("Ground truth: ", y_test_lib_l)

print(accuracy_score(y_test_lib_l, LR_lib_pred))

LR_con_pred = logistic_regression_CON.predict(X_test_con_l)
print("Conservative LR predicts: ", )
print("Ground truth: ", y_test_con_l)
print(accuracy_score(y_test_con_l, LR_con_pred))
print()
print("LIBERAL COEFFICIENTS")
print(logistic_regression_LIB.coef_)

print()
print("CONSERVATIVE COEFFICIENTS")
print(logistic_regression_CON.coef_)

# Liberal LR predicts:  [0 0 0 ... 0 0 0]
# Ground truth:  [1 0 0 ... 0 0 0]
# 0.8820682068206821
# Conservative LR predicts: 
# Ground truth:  [0 0 0 ... 0 1 0]
# 0.7358635863586359

Liberal LR predicts:  [1 1 1 ... 1 1 1]
Ground truth:  [0 1 1 ... 1 1 1]
0.8820682068206821
Conservative LR predicts: 
Ground truth:  [1 1 1 ... 1 0 1]
0.8481848184818482

LIBERAL COEFFICIENTS
[[ 0.64696149  0.10334159  0.00508842  0.03620765 -0.00407959  0.05359682
  -0.02097152 -0.05334792 -0.013917   -0.057601  ]]

CONSERVATIVE COEFFICIENTS
[[-0.39614516  0.027852    0.03136662  0.11140505  0.06367177 -0.00044922
   0.11708522  0.08849489  0.14891201  0.24233131]]


Result interpretation:

LIBERAL COEFFICIENTS
[[ 0.64696149  0.10334159  0.00508842  0.03620765 -0.00407959  0.05359682
  -0.02097152 -0.05334792 -0.013917   -0.057601  ]]

CONSERVATIVE COEFFICIENTS
[[-0.39614516  0.027852    0.03136662  0.11140505  0.06367177 -0.00044922
   0.11708522  0.08849489  0.14891201  0.24233131]]
   
The external correlations match those given at https://gosling.psy.utexas.edu/wp-content/uploads/2016/12/Sandy-et-al-JPA-2016-Brief-values-measures.pdf

Confusion matrix: 
Shows that the 

In [73]:
cm_lib = metrics.confusion_matrix(y_test_lib_l, LR_lib_pred)
print("liberal confusion matrix: \n", cm_lib)

cm_con = metrics.confusion_matrix(y_test_con_l, LR_con_pred)
print("conservative confusion matrix: \n", cm_con)

liberal confusion matrix: 
 [[   0 1072]
 [   0 8018]]
conservative confusion matrix: 
 [[   0 1380]
 [   0 7710]]


### Cross validation

In [64]:
logistic_regression_LIB_CVVVVV = LogisticRegression(random_state=0)

cv_results_lib = cross_validate(logistic_regression_LIB_CVVVVV, X_lib, y_lib_labelencoded, cv=10)

cv_results_con = cross_validate(logistic_regression_CON, X_con, y_con_labelencoded, cv=3)

print(cv_results_lib['test_score'])

print()

[0.88476348 0.88476348 0.88476348 0.88476348 0.88476348 0.88476348
 0.88476348 0.88476348 0.8850385  0.8850385 ]



## Random Forest

… there are occasions when a complete set of dummy variables is useful. For example, the splits in a tree-based model are more interpretable when the dummy variables encode all the information for that predictor. We recommend using the full set if dummy variables when working with tree-based models.

In [74]:
rf_lib = RandomForestClassifier(max_depth=None, random_state=0)
rf_lib.fit(X_train_lib_l, y_train_lib_l)

rf_con = RandomForestClassifier(max_depth= None, random_state=0)
rf_lib.fit(X_train_con_l, y_train_con_l)


rf_lib_predicted = rf_lib.predict(X_test_lib_l)
rf_con_predicted = rf_lib.predict(X_test_con_l)

print("Accuracy of liberal Random Forest ", accuracy_score(y_test_lib_l,rf_lib_predicted))
print("Accuracy of conservative Random Forest ", accuracy_score(y_test_con_l,rf_con_predicted))


Accuracy of liberal Random Forest  0.8797579757975797
Accuracy of conservative Random Forest  0.8467546754675468


In [75]:
rf_cm_lib = metrics.confusion_matrix(y_test_lib_l, rf_lib_predicted)
print("liberal confusion matrix: \n", rf_cm_lib)

rf_cm_con = metrics.confusion_matrix(y_test_con_l, rf_con_predicted)
print("conservative confusion matrix: \n", rf_cm_con)

liberal confusion matrix: 
 [[   3 1069]
 [  24 7994]]
conservative confusion matrix: 
 [[   7 1373]
 [  20 7690]]


## Naive Bayes (ToDo)

In [86]:
lib_gnb = GaussianNB()

lib_gnb.fit(X_train_lib_l, y_train_lib_l)


con_gnb = GaussianNB()

con_gnb.fit(X_train_con_l, y_train_con_l)

GaussianNB(priors=None, var_smoothing=1e-09)

In [91]:
lib_gnb_predicted = lib_gnb.predict(X_test_lib_l)
gnb_cm_lib = metrics.confusion_matrix(y_test_lib_l,lib_gnb_predicted,labels = )
tn, fp, fn, tp = metrics.confusion_matrix(y_test_lib_l,lib_gnb_predicted).ravel()

con_gnb_predicted = con_gnb.predict(X_test_con_l)
gnb_cm_con = metrics.confusion_matrix(y_test_con_l,con_gnb_predicted)
print(accuracy_score(y_test_lib_l,lib_gnb_predicted))
print(gnb_cm_lib)
print(tn, fp, fn, tp)
print(accuracy_score(y_test_con_l,con_gnb_predicted))
print(gnb_cm_con)



0.8729372937293729
[[  82 1199]
 [ 187 9440]]
82 1199 187 9440
0.8504767143381005
[[  10 1621]
 [  10 9267]]


## Feature selection - experiments: 
*first attempt on the conservative classifier*

Liberal significant features: 

Conservative significant features: Conformity, Tradition, Universalism, Self direction, Stimulation, Hedonism, Achievement (power), Security



In [206]:
X_con_featureslected = X_con.copy().drop(['benevolence'], axis = 1)

X_con_featureslected.head()

Unnamed: 0,universalism,achievement,self_direction,stimulation,hedonism,power,security,conformity,tradition
0,1.5,2.5,3.0,3.5,4.0,4.0,4.5,6.0,4.5
1,2.2,2.866667,2.366667,3.866667,3.866667,3.866667,4.366667,4.866667,4.366667
2,2.35,5.35,2.85,6.35,2.85,4.35,1.35,4.35,2.85
3,3.4,3.566667,3.066667,4.066667,3.066667,3.566667,3.066667,4.566667,3.066667
4,2.55,4.383333,2.883333,4.883333,1.883333,4.383333,4.383333,4.383333,3.383333


In [207]:
X_train_con_oh_fs, X_test_con_oh_fs, y_train_con_oh_fs, y_test_con_oh_fs = train_test_split(X_con_featureslected, y_con_onehot, test_size=0.25, random_state=10)

In [208]:
fs_con_gnb = GaussianNB()

fs_con_gnb.fit(X_train_con_oh_fs, y_train_con_oh_fs)

GaussianNB(priors=None, var_smoothing=1e-09)

In [209]:
fs_con_gnb_predicted = fs_con_gnb.predict(X_test_con_oh_fs)
fs_gnb_cm_con = metrics.confusion_matrix(y_test_con_oh_fs,fs_con_gnb_predicted)
print(metrics.confusion_matrix(y_test_con_oh_fs,fs_con_gnb_predicted).ravel())
print(accuracy_score(y_test_con_oh_fs,fs_con_gnb_predicted))

print(label_encoder.inverse_transform(fs_con_gnb_predicted))
print(metrics.confusion_matrix(y_test_con_oh_fs,fs_con_gnb_predicted))

print(list(fs_con_gnb_predicted).count(0))
print(list(label_encoder.inverse_transform(fs_con_gnb_predicted)).count('conservative'))

[   2  605    4 8479]
0.933003300330033
['moderate' 'moderate' 'moderate' ... 'moderate' 'moderate' 'moderate']
[[   2  605]
 [   4 8479]]
6
6


Observations: 
(with normalisations of conservatives over 7 (i.e. > 7)
- NB using OneHot encoding and having removed Benevolence: 93% accuracy
[   2  605]
 [   4 8479]]
 
- NB using OneHot encoding and having removed Benevelonece & Power: 93% accuracy
[[   0  607]
 [   2 8481]]

-  NB using Label encoding and having removed Benevolence: 84.8% accuracy
[[  10 1370]
 [   8 7702]]
 
- NB using Label encoding and having removed Benevolence & Power: 84.8%
[[   6 1374]
 [   3 7707]]

Logistic regression: 

In [219]:
fs_rf_con = RandomForestClassifier(max_depth=None, random_state=0)
fs_rf_con.fit(X_train_con_oh_fs, y_train_con_oh_fs)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [220]:
fs_rf_con_predicted = fs_con_gnb.predict(X_test_con_oh_fs)
print(accuracy_score(y_test_con_oh_fs,fs_rf_con_predicted))
print(metrics.confusion_matrix(y_test_con_oh_fs,fs_rf_con_predicted))

0.933003300330033
[[   2  605]
 [   4 8479]]


# Ranking Algorithms

- Training data consists of lists of items with some partial order specified between items in each list. This order is typically induced by giving a numerical or ordinal score or a binary judgment (e.g. "relevant" or "not relevant") for each item. The ranking model purposes to rank, i.e. producing a permutation of items in new, unseen lists in a similar way to rankings in the training data.

Ideas: 
    - Rank based on 'higher value = most important' (i.e. based on the assumption that conservatives have similar higher values  and similar lower values)
    - Rank based  on ''



In [457]:
X_lib_RANK1 = X_lib.copy()
X_con_RANK1 = X_con.copy()

# X_lib_RANK1

In [458]:
def relative_value (dataframe):
    df = dataframe.copy()
    df_comps = pd.DataFrame()
    col_name =''
    for column_name, data in df.iteritems():
        for other_column_name, other_data in df.iteritems():
            if column_name != other_column_name:
                comp_col_name = column_name + ' < ' + other_column_name
                df_comps[comp_col_name] = df[column_name] < df[other_column_name]

    return df_comps

In [459]:
X_con_RANK = relative_value(X_con_RANK1)
X_lib_RANK = X_con_RANK.copy()

## Encodings

In [460]:
# X_lib = lib_dataframe.iloc[:, 2:12].copy()
# y_lib = lib_dataframe.iloc[:,1].copy()
# #Now the data frame for conservative prediction
# X_con = con_dataframe.iloc[:, 2:12].copy()
# y_con = con_dataframe.iloc[:,1].copy()

In [461]:
label_encoder = LabelEncoder()
y_lib_labelencoded = label_encoder.fit_transform(y_lib)
y_con_labelencoded = label_encoder.fit_transform(y_con)

In [462]:
enc = OneHotEncoder(sparse=False)
y_lib_onehot = enc.fit_transform(y_lib.to_numpy().reshape(-1, 1))
y_con_onehot = enc.fit_transform(y_con.to_numpy().reshape(-1, 1))

## Data split

In [488]:
X_train_lib, X_test_lib, y_train_lib, y_test_lib = train_test_split(X_lib_RANK, y_lib_labelencoded, test_size=0.30, random_state=10)
X_train_con, X_test_con, y_train_con, y_test_con = train_test_split(X_con_RANK, y_con_labelencoded, test_size=0.30, random_state=10)

In [489]:
list(y_test_con).count("conservative")

0

## Naive Bayes

In [490]:
con_gnb_RANK = GaussianNB()
con_gnb_RANK.fit(X_train_con, y_train_con_l)

GaussianNB(priors=None, var_smoothing=1e-09)

In [491]:
con_RANK_predicted = con_gnb_RANK.predict(X_test_con)

print(accuracy_score(y_test_con,con_RANK_predicted))
print(metrics.confusion_matrix(y_test_con,con_RANK_predicted))

0.6491565823248991
[[ 932 1927]
 [1900 6149]]


## Random Forest

In [495]:
rf_con_RANK = RandomForestClassifier(max_depth= None, random_state=0)
rf_con_RANK.fit(X_train_con, y_train_con_l)




RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [500]:
rf_con_RANK_predicted = rf_con_RANK.predict(X_test_con)


print("Accuracy of conservative Random Forest ", accuracy_score(y_test_con,rf_con_RANK_predicted))

tn, fp, fn, tp = metrics.confusion_matrix(y_test_con,rf_con_RANK_predicted).ravel()

print(metrics.confusion_matrix(y_test_con,rf_con_RANK_predicted))

# print("tn",tn, "fp",fp, "fn",fn,"tp", tp)



Accuracy of conservative Random Forest  0.7326732673267327
[[  45 2814]
 [ 102 7947]]
tn 45 fp 2814 fn 102 tp 7947
[1 1 1 ... 1 1 1]


In [502]:
print(rf_con_RANK_predicted)
print(label_encoder.inverse_transform(rf_con_RANK_predicted))

[1 1 1 ... 1 1 1]
['moderate' 'moderate' 'moderate' ... 'moderate' 'moderate' 'moderate']


Random Forest:
_Normalistion point @ 8_
Accuracy of conservative Random Forest  0.9242757609094243

Sensitivity = 0.9872486512996567
Specificity = 0.023842917251051893

[   17   696]
[  130 10065]

---------------------------------------------------------------
 
_Normalisation point @ 7_
Accuracy of conservative Random Forest  0.8423175650898423

Sensitivity = 0.9872803708095289
Specificity = 0.04595879556259905

[  29 1602]
[ 118 9159]

---------------------------------------------------------------
 
_Normalisatin point @ 6
Accuracy of conservative Random Forest  0.7326732673267327

Sensitivity = 0.9873276183376817
Specificity = 0.015739769150052464

[  45 2814]
[ 102 7947]

In [494]:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Sensitivity = TP / (TP + FN)
Specificity = TN / (TN + FP)

NameError: name 'TP' is not defined

In [497]:
def sensitivity(TP, FN):
    return (TP / (TP + FN))

def specificity (TN, FP):
    return (TN / (TN + FP))

print(sensitivity(tp,fn))
print(specificity( tn,fp))

0.9873276183376817
0.015739769150052464
