## Importing Essential Libraries

In [None]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, balanced_accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import re
from sklearn.experimental import enable_iterative_imputer
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import IterativeImputer
warnings.filterwarnings('ignore')
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.svm import SVC

## Loading the dataset

In [None]:
# Load the raw survey export; the path is resolved relative to the working directory.
df = pd.read_csv('Survey.csv')

## Remove unwanted columns

In [None]:
## Columns are removed either because they are redundant (test questions, metadata etc.)
## or because the question is deemed too similar to the target variable.
## NOTE(review): "dsbldp" is listed twice below; this appears harmless for drop() — confirm.

df = df.drop(['name','essround','edition','proddate','idno', 'cntry','dweight', 'pweight','domain',
         'stratum','prob','psu','inwds', 'ainws', 'ainwe','binwe','cinwe','vteubcmb', 'dinwe', 'finwe', 'ginwe', 'hinwe', 'iinwe',
         'kinwe', 'vinwe', 'inwde', 'jinws', 'jinwe', 'inwtm', 'vdcond', 'vdovexre', 'vdtype', 'vdtpsvre', 'vdtpitre',
         'vdtpscre', 'vdtpaure', 'vdtpvire', 'vdtpoire', 'vdtpntre', 'vdtpapre', 'vdtprere', 'vdtpdkre', 'vdtpnare',
         'panpriph','govmonpb','cntbrthd', 'lnghom1', 'lnghom2', 'fbrncntc', 'mbrncntc', 'region',"admit","showcv","eisced","pdwrk","edctn", 
              "uempli","dsbld","rtrd","cmsrv","hswrk","dngoth","dngref","dngdk","dngna","pdwrkp","edctnp","uemplap",
              "uemplip","dsbldp","dsbldp","rtrdp","cmsrvp","hswrkp","dngothp","dngdkp","dngnapp","dngrefp","dngnap",
              "dscrsex","dscrdsb","dscroth","dscrdk","dscrref","dscrnap","dscrna","uempla",
             "eiscedf","edulvlfb","emprf14","occf14b","eiscedm","emprm14","atncrse","anctry1","regunit","acchome","accwrk","accmove",
              "accoth","accnone","accref","accdk","accna","admc19","hapljc19","hapirc19","hapwrc19","hapfuc19","hapfoc19",
              "hapnoc19","hapnwc19","hapnpc19","haprec19","hapdkc19","hapnac19","edulvlb", "nacer2", "isco08", "dscrgnd"], axis=1)

## Dealing with NA's

In [None]:
## NA's are denoted by a repeated digit: 6/7/8/9, 66/77/88/99, 666/777/888/999,
## 6666/7777/8888/9999 etc., depending on the question (three-digit questions
## additionally use 555). These numbers are changed to NaN.
##
## The width of the codes is taken from the width of the column maximum, so only
## codes as wide as the largest value are blanked. The previous cascading
## `max() >= 6 / 66 / 666 ...` test wiped legitimate answers whenever a column
## happened to contain no NA code: e.g. a 0-10 scale without any 77/88/99 lost
## its answers 6-9.
## NOTE(review): single-digit questions whose valid answers include 6 or 7 cannot
## be told apart from NA codes without the codebook — behaviour for those columns
## is unchanged.

def _missing_codes_for(col_max):
    """Return the NA sentinel codes matching a column whose largest value is col_max.

    An empty list is returned when the maximum is NaN/infinite or below 6,
    i.e. when no sentinel code can be present in the column.
    """
    if not np.isfinite(col_max) or col_max < 6:
        return []
    # Codes wider than six digits are not distinguished, as before.
    width = min(len(str(int(col_max))), 6)
    codes = [int(str(digit) * width) for digit in (6, 7, 8, 9)]
    # 555 is only treated as an NA code when a 3-digit sentinel is actually present.
    if width == 3 and col_max >= 666:
        codes.insert(0, 555)
    return codes


for column in df.columns:
    # Text columns cannot hold numeric sentinel codes; comparing them to ints would raise.
    if not pd.api.types.is_numeric_dtype(df[column]):
        continue
    sentinel_codes = _missing_codes_for(df[column].max())
    if sentinel_codes:
        df[column] = df[column].replace(sentinel_codes, np.nan)

## Some Basic EDA

In [None]:
## Horizontal bar chart showing how often each answer of the target occurs

def countplot(df):
    """Draw the answer counts of ``panmonpb`` for ``df`` and return the seaborn Axes."""
    plt.figure(figsize=(15, 5))
    axes = sns.countplot(y="panmonpb", data=df)
    plt.title("Label distribution for panmonpb variable")
    return axes


countplot(df)

## Splitting the target into 2 classes

In [None]:
## answers 0-5 collapse into class 0
## answers 6-10 collapse into class 1

low_answers = list(range(0, 6))
high_answers = list(range(6, 11))
df["panmonpb"] = df["panmonpb"].replace(low_answers, 0).replace(high_answers, 1)

In [None]:
## class frequencies of the binarised target, drawn as a bar chart

target_counts = df['panmonpb'].value_counts()
target_counts.plot.bar()

## Dropping all columns with more than 20% NAs & dropping all rows with NA in the target variable

In [None]:
## keep only the respondents that actually answered the target question
has_target = df['panmonpb'].notna()
data_cleaned = df[has_target]

In [None]:
## All feature columns with more than 20% missing values are dropped.
## The share of missing values is measured on data_cleaned (rows without a
## target answer are already removed), i.e. on the rows that are actually
## modelled, not on the unfiltered df.

cols = data_cleaned.drop(['panmonpb'], axis=1).columns

na_share = data_cleaned[cols].isna().mean()
cols_high_nans = cols[(na_share > 0.2).values]
data_cleaned = data_cleaned.drop(cols_high_nans, axis=1)

## EDA after dropping NAs and making the target variable binary

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Kernel density of the binary target, one curve per gender group.
## NOTE(review): the legend assumes the groups come out as male first, female second — confirm the gndr coding.

class_labels = ["Surveillance over Privacy", "Privacy over Surveillance"]

fig = data_cleaned.groupby('gndr')['panmonpb'].plot(kind='kde')
plt.title("Distribution of Genders Between the Groups")
plt.legend(["Male", "Female"], title='Distribution')
plt.xticks(ticks=[0, 1], labels=class_labels)
plt.savefig('panmonpbgenderdistr.png')

In [None]:
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Pie chart of the two target classes.
## Labels are derived from the class value (0 / 1) instead of being attached to
## value_counts() in frequency order, so they stay correct whichever class is
## the majority.

class_names = {0: 'Surveillance over Privacy', 1: 'Privacy over Surveillance'}
class_counts = data_cleaned.panmonpb.value_counts().sort_index()
slice_labels = [class_names.get(value, str(value)) for value in class_counts.index]

fig, ax = plt.subplots(figsize=(10, 5))
ax.pie(class_counts, labels=slice_labels, shadow=True, autopct='%1.0f%%')
ax.set_title("Distribution of attitudes toward Privacy versus Surveillance",
             fontdict={'fontsize': 20})
fig.savefig('pie_panmonpb.png')

## Dealing with Categorical Values

In [None]:
# One-hot encoding
# These columns are stored as numbers in the dataset, but the answers in the
# questionnaire are categorical, so each of them is expanded into dummies.
# dummy_na=True adds a "<column>_nan" indicator per column.

categorical_questions = [
    "gndr", "vote", "contplt", "donprty", "badge", "sgnptit", "pbldmna", "bctprd",
    "pstplonl", "volunfp", "clsprty", "crmvct", "hlthhmp", "rlgblg", "dscrgrp", "ctzcntr",
    "brncntr", "feethngr", "facntr", "mocntr", "chpldm", "lvgptnea", "dvrcdeva", "maritalb",
    "domicil", "mnactic", "emplrel", "wrkctra", "estsz", "jbspv", "tporgwk", "wrkac6m", "uemp3m",
    "mbtru", "hincsrca", "livpnt",
]

df1_ohe = pd.get_dummies(
    data=data_cleaned,
    columns=categorical_questions,
    dummy_na=True,
    drop_first=True,
)

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## View of the "<column>_nan" indicator columns created by the dummy coding

is_nan_indicator = df1_ohe.columns.str.endswith("_nan")
nan_df = df1_ohe.loc[:, is_nan_indicator]

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py

nan_df = df1_ohe.loc[:, df1_ohe.columns.str.endswith("_nan")]

# Where the "<column>_nan" indicator is set, the original answer was missing:
# every dummy of that question is set back to NaN so that the imputation
# can fill it in later.
pattern = "^([^_]*)_"
regex = re.compile(pattern)

# One boolean row mask per indicator column replaces the former
# row-by-row / cell-by-cell .loc loop; the resulting frame is the same.
for col_nan in nan_df.columns:
    rows_missing = df1_ohe[col_nan] == 1
    if not rows_missing.any():
        continue
    # Question name = everything before the first underscore of the dummy name.
    col_id = regex.search(col_nan).group(1)
    targets = df1_ohe.columns[df1_ohe.columns.str.startswith(col_id + '_')]
    df1_ohe.loc[rows_missing, targets] = np.nan

# The indicator columns have served their purpose.
df1_ohe.drop(df1_ohe.columns[df1_ohe.columns.str.endswith('_nan')], axis=1, inplace=True)

data_cleaned = df1_ohe

## Check for the problem of multicollinearity

In [None]:
## Lists the features whose correlation with the target exceeds 0.80 in
## absolute value. The target itself is excluded (its self-correlation is
## always 1) and negative correlations are covered via abs().
## NOTE(review): this inspects feature-vs-target correlation only; correlation
## between the features themselves is not checked here.

cor_with_dep = data_cleaned.corr()['panmonpb']
feature_cor = cor_with_dep.drop('panmonpb')
cols_high_cor_with_dep = feature_cor.index[(feature_cor.abs() > 0.8).values]
cols_high_cor_with_dep

## Checking for the LogOdds assumption

In [None]:
# Logistic-regression assumption: linearity of a feature with the log-odds.
# One logistic fit of the target against 'panfolru' is drawn and saved.
no_dummies = pd.DataFrame()

logit_axes = sns.regplot(x='panfolru', y='panmonpb', data=df, logistic=True)
# set_title returns the title artist; the figure is saved through it
gre = logit_axes.set_title("GRE Log Odds Linear Plot")
gre.figure.savefig("gre log lin.png")

In [None]:
## collect every column that takes more than 2 distinct non-NaN values

not_binary_cols = [
    col
    for col in data_cleaned.columns
    if len(data_cleaned[col].dropna().unique()) > 2
]

In [None]:
## number of non-binary columns, i.e. how many log-odds plots have to be created

len(not_binary_cols)

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## One logistic regplot of the target per non-binary column, spread over
## four figures of 6 x 6 panels each.

not_binary_cols = []
for feature in data_cleaned.columns:
    distinct_values = data_cleaned[feature].dropna().unique()
    if len(distinct_values) > 2:  # more than 2 non-NaN entries
        not_binary_cols.append(feature)

fsize = 8
plt.rcParams.update({'font.size': fsize})
sns.set_context(rc={'axes.labelsize': fsize, 'axes.titlesize': fsize})
fig1, axes1 = plt.subplots(6, 6, sharey=True)
fig2, axes2 = plt.subplots(6, 6, sharey=True)
fig3, axes3 = plt.subplots(6, 6, sharey=True)
fig4, axes4 = plt.subplots(6, 6, sharey=True)

i = 0
n_plots = len(not_binary_cols)
for axes in (axes1, axes2, axes3, axes4):
    for axy in axes:
        for j, axx in enumerate(axy):
            if i < n_plots:
                for axis_name in ('x', 'y'):
                    axx.tick_params(axis=axis_name, labelsize=fsize)
                sns.regplot(x=not_binary_cols[i], y='panmonpb', data=data_cleaned,
                            logistic=True, ax=axx)
                i += 1
            if j != 0:
                # panels outside the first column of a grid lose their y label
                axx.set(ylabel=None)


# NOTE(review): the backend switch happens after the figures were created — confirm this is intended.
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'qt')  # to make figures pop out

## Splitting the Data


In [None]:
# target vector and feature matrix
y = data_cleaned['panmonpb']
X = data_cleaned.drop(columns=['panmonpb'])

In [None]:
# 80% of the rows go to the training set, the rest to the test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=42)

In [None]:
# Column groups by dtype (target excluded)
features_only = data_cleaned.drop(['panmonpb'], axis=1)
num_cols_cleaned = features_only.select_dtypes('number').columns
cat_cols_cleaned = features_only.select_dtypes('object').columns

# Numeric columns with / without at least one missing value in the training split
has_na_in_train = X_train[num_cols_cleaned].isna().mean() > 0
num_cols_with_na = num_cols_cleaned[has_na_in_train]
num_cols_no_na = num_cols_cleaned[~has_na_in_train]

## Missing Data Imputation

In [None]:
# Missing-value imputation
imputer = IterativeImputer(random_state=0, max_iter=10)

# The imputer is fitted on the training split only, restricted to the numeric
# columns that contain missing values.
train_numeric_with_gaps = X_train[num_cols_with_na]
imputer.fit(train_numeric_with_gaps)

In [None]:
# Apply the fitted imputer to both splits (same columns as used in fit)
X_train_impute_num, X_test_impute_num = (
    imputer.transform(split[num_cols_with_na]) for split in (X_train, X_test)
)

In [None]:
# The imputer returns plain arrays; wrap them in DataFrames again, reusing the
# column names that were passed to fit/transform.
imputed_columns = num_cols_with_na
X_train_impute_num = pd.DataFrame(data=X_train_impute_num, columns=imputed_columns)
X_test_impute_num = pd.DataFrame(data=X_test_impute_num, columns=imputed_columns)

In [None]:
# Rebuild the feature sets: take the columns that were not imputed, reset their
# index so that it lines up with the freshly built imputed frames, and join
# both parts (imputed columns come first).
cols_no_na_train = X_train.drop(columns=num_cols_with_na)
cols_no_na_test = X_test.drop(columns=num_cols_with_na)

untouched_train = cols_no_na_train.reset_index(drop=True)
untouched_test = cols_no_na_test.reset_index(drop=True)
X_train = X_train_impute_num.join(untouched_train)
X_test = X_test_impute_num.join(untouched_test)

## Rescaling the Data

In [None]:
# Rescaling the data to the [0, 1] range.
# The scaler learns min/max from the training split only; the test split is
# transformed with those same parameters. Re-fitting on the test split would
# scale both sets differently and leak test statistics into the evaluation.
scaler = MinMaxScaler()
X_train_rescaled = scaler.fit_transform(X_train)
X_test_rescaled = scaler.transform(X_test)

In [None]:
# Persist the rescaled feature sets, with their column names, as CSV files
train_frame = pd.DataFrame(X_train_rescaled, columns=X_train.columns)
train_frame.to_csv('X_train_rescaled.csv', index=False)

test_frame = pd.DataFrame(X_test_rescaled, columns=X_test.columns)
test_frame.to_csv('X_test_rescaled.csv', index=False)

In [None]:
# Persist both target vectors as CSV files (without the index)
for target_part, target_path in ((y_train, 'y_train.csv'), (y_test, 'y_test.csv')):
    target_part.to_csv(target_path, index=False)

In [None]:
# Reload the rescaled feature sets from disk as DataFrames
X_train_rescaled, X_test_rescaled = (
    pd.read_csv(path) for path in ('X_train_rescaled.csv', 'X_test_rescaled.csv')
)

In [None]:
# Reload the target vectors from disk (each comes back as a one-column DataFrame)
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

## RFE

In [None]:
## from https://machinelearningmastery.com/rfe-feature-selection-in-python/
# Cross-validated accuracy of: RFE (10 features kept) -> random forest
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

rfe10 = RFE(estimator=RandomForestClassifier(), n_features_to_select=10, step=10)
model10 = RandomForestClassifier()
pipeline = Pipeline(steps=[('s', rfe10), ('m', model10)])

n_scores10 = cross_val_score(
    pipeline,
    X_train_rescaled,
    y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# mean and standard deviation over all folds and repeats
print('Accuracy: %.3f (%.3f)' % (n_scores10.mean(), n_scores10.std()))

In [None]:
## from https://machinelearningmastery.com/rfe-feature-selection-in-python/
# Same evaluation as before, this time with 20 features kept by RFE
rfe20 = RFE(RandomForestClassifier(), n_features_to_select=20, step=10)
model20 = RandomForestClassifier()
pipeline = Pipeline([('s', rfe20), ('m', model20)])

# repeated stratified 5-fold cross-validation (3 repeats)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
cv_options = dict(scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
n_scores20 = cross_val_score(pipeline, X_train_rescaled, y_train, **cv_options)

accuracy_mean, accuracy_std = mean(n_scores20), std(n_scores20)
print('Accuracy: %.3f (%.3f)' % (accuracy_mean, accuracy_std))

In [None]:
## from https://machinelearningmastery.com/rfe-feature-selection-in-python/
# RFE keeping 30 features, followed by a random forest, scored by cross-validation
selection_step = RFE(estimator=RandomForestClassifier(), n_features_to_select=30, step=10)
rfe30 = selection_step
model30 = RandomForestClassifier()
pipeline = Pipeline(steps=[('s', rfe30), ('m', model30)])

cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=1)
n_scores30 = cross_val_score(pipeline, X_train_rescaled, y_train,
                             cv=cv, scoring='accuracy',
                             error_score='raise', n_jobs=-1)

# report mean accuracy with its standard deviation
summary_values = (mean(n_scores30), std(n_scores30))
print('Accuracy: %.3f (%.3f)' % summary_values)

In [None]:
## from https://machinelearningmastery.com/rfe-feature-selection-in-python/
# RFE keeping 40 features, followed by a random forest, scored by cross-validation

from numpy import mean
from numpy import std

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

rfe40 = RFE(estimator=RandomForestClassifier(), step=10, n_features_to_select=40)
model40 = RandomForestClassifier()
pipeline_steps = [('s', rfe40), ('m', model40)]
pipeline = Pipeline(steps=pipeline_steps)

n_scores40 = cross_val_score(pipeline, X_train_rescaled, y_train,
                             scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# mean accuracy (standard deviation) over all folds
print('Accuracy: %.3f (%.3f)' % (mean(n_scores40), std(n_scores40)))

In [None]:
## from https://machinelearningmastery.com/rfe-feature-selection-in-python/
# Last setting of the comparison: RFE keeps 50 features
rfe50 = RFE(estimator=RandomForestClassifier(), n_features_to_select=50, step=10)
model50 = RandomForestClassifier()
pipeline = Pipeline(steps=[('s', rfe50), ('m', model50)])

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores50 = cross_val_score(
    pipeline, X_train_rescaled, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)

print('Accuracy: %.3f (%.3f)' % (n_scores50.mean(), n_scores50.std()))

In [None]:
# Mean cross-validated accuracy per number of selected features
feature_counts = [10, 20, 30, 40, 50]
score_sets = [n_scores10, n_scores20, n_scores30, n_scores40, n_scores50]
data = {
    'Features': feature_counts,
    'Accuracy': [round(mean(scores), 3) for scores in score_sets],
}

df23 = pd.DataFrame(data)

## bar chart: one bar per feature count
ax = sns.barplot(data=df23, x='Features', y='Accuracy', color="blue", errwidth=0)

# write the accuracy value on top of each bar
bars = ax.containers[0]
ax.bar_label(bars)


ax.figure.savefig("Features versus accuracy.png")

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Write the rescaled feature sets to CSV, labelled with the feature names

for rescaled, names, out_path in (
    (X_train_rescaled, X_train.columns, 'X_train_rescaled.csv'),
    (X_test_rescaled, X_test.columns, 'X_test_rescaled.csv'),
):
    pd.DataFrame(rescaled, columns=names).to_csv(out_path, index=False)

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Read the rescaled feature sets back in as DataFrames

train_csv, test_csv = 'X_train_rescaled.csv', 'X_test_rescaled.csv'
X_train_rescaled = pd.read_csv(train_csv)
X_test_rescaled = pd.read_csv(test_csv)

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Write both target vectors to CSV (index omitted)

csv_options = {'index': False}
y_train.to_csv('y_train.csv', **csv_options)
y_test.to_csv('y_test.csv', **csv_options)

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Read both target vectors back in

y_test, y_train = (pd.read_csv(path) for path in ('y_test.csv', 'y_train.csv'))

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Fit the 40-feature RFE on the full training set

rfe40 = RFE(RandomForestClassifier(), n_features_to_select=40, step=10)
y_train_flat = y_train.values.ravel()  # 1-D view of the target
selector1 = rfe40.fit(X_train_rescaled, y_train_flat)

In [None]:
## copied from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Reduce both feature sets to the columns kept by the fitted RFE

kept_train = X_train_rescaled.columns[selector1.support_]
dropped_train = X_train_rescaled.columns.difference(kept_train)
X_train_rfe40 = X_train_rescaled.drop(dropped_train, axis=1)

kept_test = X_test_rescaled.columns[selector1.support_]
dropped_test = X_test_rescaled.columns.difference(kept_test)
X_test_40 = X_test_rescaled.drop(dropped_test, axis=1)

In [None]:
## Plot the most important features of a random forest fitted on the
## 40 RFE-selected columns.
rcParams['figure.figsize'] = 12, 8

model = RandomForestClassifier()
model.fit(X_train_rfe40, y_train.values.ravel())

# Importance table: one row per feature, most important first
importance = model.feature_importances_
imp = pd.DataFrame({"var": X_train_rfe40.columns, "imp": importance})
imp = imp.sort_values("imp", ascending=False).reset_index(drop=True)

# imp is already sorted and re-indexed, so it serves as the ranking table.
table = imp

### Select the (at most) 40 most important features.
### iloc is positional and end-exclusive; the former label slice .loc[0:40]
### was end-inclusive and could return 41 rows.
selected_features_tree = list(table.iloc[:40]['var'])
reduced_variables = X_train_rfe40[selected_features_tree]

fig = table[0:10].plot.barh(x='var', y='imp')

plt.title("Top 10 features selected with Random Forest algorithm")
# Save BEFORE show(): after show() the current figure may already be released,
# in which case savefig writes an empty image.
fig.figure.savefig('featureimp.png')
plt.show()

## log reg

In [None]:
#LOGISTIC REGRESSION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py

# Model without tuning, all features
clf = LogisticRegression(random_state = 0, class_weight = None, max_iter=500)
model_res = clf.fit(X_train_rescaled, y_train)

test_log_no_tun = model_res.predict(X_test_rescaled)

# The reported score is the BALANCED accuracy; the message now says so
# (it used to claim plain accuracy).
print("The balanced accuracy for the logistic regression model without tuning is:", 
      balanced_accuracy_score(y_test, test_log_no_tun))
      
print("The confusion matrix for the logistic regression model without tuning is:", 
       confusion_matrix(y_test, test_log_no_tun))

In [None]:
#FINE TUNING LOGISTIC REGRESSION

#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
# Define model and candidate parameters
model = LogisticRegression(random_state = 0, class_weight = None, max_iter=1000)
c_values = [100, 50, 20, 5, 1.0, 0.5, 0.1, 0.05, 0.01]
# NOTE(review): with the default solver presumably not every penalty below can
# be fitted; failing candidates would be scored as NaN by the search — confirm
# against the installed scikit-learn version.
penalty = ["l1", "l2", "elasticnet", None]

# Define the randomized search (20 sampled candidates, repeated stratified CV)
grid = dict(C=c_values, penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)

#Fitting the model
random_search = RandomizedSearchCV(estimator = model, param_distributions = grid, 
                                   cv = cv, scoring = "accuracy", n_iter = 20)
random_result = random_search.fit(X_train_rescaled, y_train)

#Summarizing the results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
means = random_result.cv_results_['mean_test_score']
stds = random_result.cv_results_['std_test_score']
params = random_result.cv_results_['params']
# The loop variable must not be called `mean`: that would overwrite the
# numpy `mean` function imported at the top and break every later mean(...) call.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))
    
#Selecting the best performing model and testing it on test data

final_model_lg = random_result.best_estimator_
final_model_fitting = final_model_lg.fit(X_train_rescaled, y_train)

#Predicting the data on the test set

test_log_tun = final_model_fitting.predict(X_test_rescaled)


print("The accuracy for the logistic regression model with tuning is:", 
      accuracy_score(y_test, test_log_tun))
      
print("The confusion matrix for the logistic regression model with tuning is:", 
       confusion_matrix(y_test, test_log_tun))

In [None]:
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Normalised confusion matrix of the TUNED LOGISTIC REGRESSION (all features).
## The predictions of this section (test_log_tun) are plotted; the random
## forest predictions used here before do not exist yet at this point.

cm_log_tun = confusion_matrix(y_test, test_log_tun)
ax = sns.heatmap(cm_log_tun / np.sum(cm_log_tun), annot=True,
                 fmt='.2%', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

# class 0 = surveillance preferred, class 1 = privacy preferred
ax.xaxis.set_ticklabels(['Surveillance over Privacy','Privacy over Surveillance'])
ax.yaxis.set_ticklabels(['Surveillance over Privacy','Privacy over Surveillance'])
sns.set(font_scale=0.5)

## Display the visualization of the Confusion Matrix.
plt.show()

In [None]:
#LOGISTIC REGRESSION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
# Model without tuning, 40 RFE-selected features
clf = LogisticRegression(random_state = 0, class_weight = None, max_iter=500)
model_res = clf.fit(X_train_rfe40, y_train)

test_log_no_tun = model_res.predict(X_test_40)

# The reported score is the BALANCED accuracy; the message now says so
# (it used to claim plain accuracy).
print("The balanced accuracy for the logistic regression model without tuning is:", 
      balanced_accuracy_score(y_test, test_log_no_tun))
      
print("The confusion matrix for the logistic regression model without tuning is:", 
       confusion_matrix(y_test, test_log_no_tun))

In [None]:
#FINE TUNING LOGISTIC REGRESSION
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
# Define model and candidate parameters (40 RFE-selected features)
model = LogisticRegression(random_state = 1, class_weight = None, max_iter=1000)
c_values = [100, 50, 20, 5, 1.0, 0.5, 0.1, 0.05, 0.01]
# NOTE(review): with the default solver presumably not every penalty below can
# be fitted; failing candidates would be scored as NaN by the search — confirm
# against the installed scikit-learn version.
penalty = ["l1", "l2", "elasticnet", None]
# Define the randomized search (20 sampled candidates, repeated stratified CV)
grid = dict(C=c_values, penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)

#Fitting the model
random_search = RandomizedSearchCV(estimator = model, param_distributions = grid, 
                                   cv = cv, scoring = "accuracy", n_iter = 20)
random_result = random_search.fit(X_train_rfe40, y_train)

#Summarizing the results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
means = random_result.cv_results_['mean_test_score']
stds = random_result.cv_results_['std_test_score']
params = random_result.cv_results_['params']
# The loop variable must not be called `mean`: that would overwrite the
# numpy `mean` function imported at the top and break every later mean(...) call.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))
    
#Selecting the best performing model and testing it on test data

final_model_lg = random_result.best_estimator_
final_model_fitting = final_model_lg.fit(X_train_rfe40, y_train)

#Predicting the data on the test set

test_log_tun = final_model_fitting.predict(X_test_40)


print("The accuracy for the logistic regression model with tuning is:", 
      accuracy_score(y_test, test_log_tun))
      
print("The confusion matrix for the logistic regression model with tuning is:", 
       confusion_matrix(y_test, test_log_tun))

In [None]:
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py

# Run the IPython line magic `%matplotlib qt` programmatically.
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'qt') #to make figures pop out

In [None]:
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
## Confusion matrix of the tuned logistic regression, shown as shares of all test rows

class_labels = ['Surveillance over Privacy', 'Privacy over Surveillance']

cm_log = confusion_matrix(y_test, test_log_tun)
cm_log_share = cm_log / np.sum(cm_log)
ax = sns.heatmap(cm_log_share, annot=True, fmt='.2%', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

for tick_axis in (ax.xaxis, ax.yaxis):
    tick_axis.set_ticklabels(class_labels)
sns.set(font_scale=0.50)

## Show the figure
plt.show()

In [None]:
#RANDOM FOREST~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Baseline random forest (no tuning), all features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
clf_rf = RandomForestClassifier(class_weight=None, random_state=0)
model_res_rf = clf_rf.fit(X_train_rescaled, y_train)

test_rf_no_tun = model_res_rf.predict(X_test_rescaled)

rf_baseline_accuracy = accuracy_score(y_test, test_rf_no_tun)
print("The accuracy for the Random Forests model without tuning is:", rf_baseline_accuracy)

rf_baseline_cm = confusion_matrix(y_test, test_rf_no_tun)
print("The confusion matrix for the Random Forests model without tuning is:", rf_baseline_cm)

In [None]:
#FINE TUNING RANDOM FOREST~~~~~~~~~~~~
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
# Randomized hyper-parameter search for the random forest, all features
model_rf = RandomForestClassifier(random_state=0, class_weight=None)

# candidate values
n_estimators = [10, 50, 100, 200, 500, 750, 1000]
max_features = [1, X_train_rescaled.shape[1]]
param_grid_rf = {'n_estimators': n_estimators, 'max_features': max_features}
cv_rf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)

# run the search
random_search_rf = RandomizedSearchCV(
    model_rf,
    param_distributions=param_grid_rf,
    n_iter=20,
    scoring='accuracy',
    cv=cv_rf,
    n_jobs=-1,
)
random_result_rf = random_search_rf.fit(X_train_rescaled, y_train)

# best candidate, then one line per evaluated candidate
print("Best: %f using %s" % (random_result_rf.best_score_, random_result_rf.best_params_))
search_log_rf = random_result_rf.cv_results_
means_rf = search_log_rf['mean_test_score']
stds_rf = search_log_rf['std_test_score']
params_rf = search_log_rf['params']
for mean_rf, stdev_rf, param_rf in zip(means_rf, stds_rf, params_rf):
    print("%f (%f) with: %r" % (mean_rf, stdev_rf, param_rf))

# fit the best candidate on the training data
final_model_rf = random_result_rf.best_estimator_
final_model_fitting_rf = final_model_rf.fit(X_train_rescaled, y_train)

# evaluate it on the test data
test_rf_tun = final_model_fitting_rf.predict(X_test_rescaled)

rf_tuned_accuracy = accuracy_score(y_test, test_rf_tun)
print("The accuracy for the Random Forest model with tuning is:", rf_tuned_accuracy)

rf_tuned_cm = confusion_matrix(y_test, test_rf_tun)
print("The confusion matrix for the Random Forest model with tuning is:", rf_tuned_cm)

In [None]:
#RANDOM FOREST~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Baseline random forest (no tuning) on the 40 RFE-selected features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
clf_rf = RandomForestClassifier(random_state=0, class_weight=None)
model_res_rf = clf_rf.fit(X_train_rfe40, y_train)
test_rf_no_tun = model_res_rf.predict(X_test_40)

print(
    "The accuracy for the Random Forests model without tuning is:",
    accuracy_score(y_test, test_rf_no_tun),
)
print(
    "The confusion matrix for the Random Forests model without tuning is:",
    confusion_matrix(y_test, test_rf_no_tun),
)

In [None]:
#FINE TUNING RANDOM FOREST~~~~~~~~~~~~
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
# Randomized hyper-parameter search for the random forest, 40 features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
model_rf = RandomForestClassifier(class_weight=None, random_state=0)

# search space and cross-validation scheme
n_estimators = [10, 50, 100, 200, 500, 750, 1000]
max_features = [1, X_train_rfe40.shape[1]]
param_grid_rf = dict(n_estimators=n_estimators, max_features=max_features)
cv_rf = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=0)

# run the search on the reduced training set
search_options_rf = dict(n_jobs=-1, cv=cv_rf, scoring='accuracy', n_iter=20)
random_search_rf = RandomizedSearchCV(model_rf, param_distributions=param_grid_rf,
                                      **search_options_rf)
random_result_rf = random_search_rf.fit(X_train_rfe40, y_train)

# report the winner and every evaluated candidate
print("Best: %f using %s" % (random_result_rf.best_score_, random_result_rf.best_params_))
means_rf = random_result_rf.cv_results_['mean_test_score']
stds_rf = random_result_rf.cv_results_['std_test_score']
params_rf = random_result_rf.cv_results_['params']
for mean_rf, stdev_rf, param_rf in zip(means_rf, stds_rf, params_rf):
    candidate_line = "%f (%f) with: %r" % (mean_rf, stdev_rf, param_rf)
    print(candidate_line)

# best candidate, fitted on the training data
final_model_rf = random_result_rf.best_estimator_
final_model_fitting_rf = final_model_rf.fit(X_train_rfe40, y_train)

# predictions for the test data
test_rf_tun = final_model_fitting_rf.predict(X_test_40)

print("The accuracy for the Random Forest model with tuning is:",
      accuracy_score(y_test, test_rf_tun))

print("The confusion matrix for the Random Forest model with tuning is:",
      confusion_matrix(y_test, test_rf_tun))

In [None]:
#### adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
#### Confusion matrix of the tuned random forest as shares of all test rows

cm_rf = confusion_matrix(y_test, test_rf_tun)
ax = sns.heatmap(cm_rf / np.sum(cm_rf), cmap='Blues', fmt='.2%', annot=True)

ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

rf_tick_labels = ['Surveillance over Privacy', 'Privacy over Surveillance']
ax.xaxis.set_ticklabels(rf_tick_labels)
ax.yaxis.set_ticklabels(rf_tick_labels)
sns.set(font_scale=0.5)

## Show the figure
plt.show()

In [None]:
#SUPPORT VECTOR MACHINE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The model without fine tuning, all features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
clf_svm= SVC(random_state = 0, class_weight = None)
model_res_svm = clf_svm.fit(X_train_rescaled, y_train.values.ravel())

test_svm_no_tun = model_res_svm.predict(X_test_rescaled)

# The reported score is the BALANCED accuracy; the message now says so
# (it used to claim plain accuracy).
print("The balanced accuracy for the SVM model without tuning is:", 
      balanced_accuracy_score(y_test.values.ravel(), test_svm_no_tun))
      
print("The confusion matrix for the SVM model without tuning is:", 
       confusion_matrix(y_test.values.ravel(), test_svm_no_tun))

In [None]:
#FINE TUNING SUPPORT VECTOR MACHINE~~~~~~~~~~~~~~
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
# Randomized hyper-parameter search for the SVM, all features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
model_svm = SVC(class_weight=None, random_state=0)

# search space: kernel type and regularisation strength
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [50, 30, 40, 20, 10, 5, 1.0, 0.5, 0.1, 0.05, 0.01]
grid_svm = {'C': C, 'kernel': kernel}

cv_svm = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)

# run the search; candidates that fail to fit are scored 0
random_search_svm = RandomizedSearchCV(
    estimator=model_svm,
    param_distributions=grid_svm,
    n_iter=40,
    scoring='accuracy',
    cv=cv_svm,
    n_jobs=-1,
    error_score=0,
)
random_result_svm = random_search_svm.fit(X_train_rescaled, y_train.values.ravel())

# report the winner and every evaluated candidate

print("Best: %f using %s" % (random_result_svm.best_score_, random_result_svm.best_params_))
search_log_svm = random_result_svm.cv_results_
means_svm = search_log_svm['mean_test_score']
stds_svm = search_log_svm['std_test_score']
params_svm = search_log_svm['params']
for mean_svm, stdev_svm, param_svm in zip(means_svm, stds_svm, params_svm):
    print("%f (%f) with: %r" % (mean_svm, stdev_svm, param_svm))

In [None]:
#SUPPORT VECTOR MACHINE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Baseline SVM (no tuning) on the 40 RFE-selected features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
clf_svm = SVC(class_weight=None, random_state=0)
model_res_svm = clf_svm.fit(X_train_rfe40, y_train.values.ravel())

test_svm_no_tun = model_res_svm.predict(X_test_40)

# the metrics receive the test target as a flat array
y_test_flat = y_test.values.ravel()

print("The accuracy for the SVM model without tuning is:",
      accuracy_score(y_test_flat, test_svm_no_tun))

print("The confusion matrix for the SVM model without tuning is:",
      confusion_matrix(y_test_flat, test_svm_no_tun))

In [None]:
#FINE TUNING SUPPORT VECTOR MACHINE~~~~~~~~~~~~~~
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
## The model with fine tuning, 40 features
## adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
# Define model and parameters
model_svm = SVC(random_state = 0, class_weight = None)
kernel = ['linear', 'poly' , 'rbf', 'sigmoid']
C = [50, 30,40, 20,10, 5, 1.0, 0.5, 0.1, 0.05, 0.01]

# Define the randomized search (40 sampled candidates, repeated stratified CV)
grid_svm = dict(C = C, kernel=kernel)
cv_svm = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 0)
#Fitting the model
random_search_svm = RandomizedSearchCV(estimator = model_svm, 
        param_distributions = grid_svm, n_jobs = -1, cv = cv_svm, scoring = 'accuracy',
        error_score = 0, n_iter = 40)
random_result_svm = random_search_svm.fit(X_train_rfe40, y_train.values.ravel())

# Summarize results

print("Best: %f using %s" % (random_result_svm.best_score_, random_result_svm.best_params_))
means_svm = random_result_svm.cv_results_['mean_test_score']
stds_svm = random_result_svm.cv_results_['std_test_score']
params_svm = random_result_svm.cv_results_['params']
for mean_svm, stdev_svm, param_svm in zip(means_svm, stds_svm, params_svm):
    print("%f (%f) with: %r" % (mean_svm, stdev_svm, param_svm))
    
#Fitting the best model in the training data
# (the target is passed as a flat array, like in the search above)

final_model_svm = random_result_svm.best_estimator_
final_model_fitting_svm = final_model_svm.fit(X_train_rfe40, y_train.values.ravel())

#Predicting the data on the test set

test_svm_tun = final_model_fitting_svm.predict(X_test_40)

# These are SVM results; the messages used to label them as Random Forest.
print("The accuracy for the SVM model with tuning is:", 
      accuracy_score(y_test, test_svm_tun))
      
print("The confusion matrix for the SVM model with tuning is:", 
       confusion_matrix(y_test, test_svm_tun))

In [None]:
#### adapted from https://github.com/kantarafr/Thesis/blob/main/Pre-processing.py
#### Confusion matrix of the tuned SVM as shares of all test rows

cm_svm = confusion_matrix(y_test, test_svm_tun)
cm_svm_total = np.sum(cm_svm)
ax = sns.heatmap(cm_svm / cm_svm_total, annot=True, fmt='.2%', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

svm_tick_labels = ['Surveillance over Privacy', 'Privacy over Surveillance']
for svm_tick_axis in (ax.xaxis, ax.yaxis):
    svm_tick_axis.set_ticklabels(svm_tick_labels)
sns.set(font_scale=0.5)

## Show the figure
plt.show()