In [None]:
# Mount Google Drive under /content/drive so the paths used below resolve (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

# Intelligent Mobility Classifiers

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline
#import sys
#sys.path.append('/content/drive/MyDrive/MIDA2/')

In [None]:
# Location of the survey CSV on the mounted Drive; the whole file is loaded into `df`.
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/2_feature_encoding_no_region.csv'

df = pd.read_csv(ds_path)

In [None]:
# Print the row count, then display summary statistics for every column
# (include='all' also covers the non-numeric ones).
print("dataset size: ",len(df))
df.describe(include='all')

In [None]:
# Name of the column the classifiers have to predict.
target_variable = "Considering_electric_or_hybrid_vehicle_next_purchase"
# Every other column is used as a model input.
input_variables = [column for column in df.columns if column != target_variable]

# Fixed-seed 80/20 split so the partition is the same on every run.
train_df, test_df = train_test_split(df, random_state=1111, test_size=0.2)
print("train_len: ", train_df.shape[0])
print("test_len: ", test_df.shape[0])

# Encodings

In [None]:
def one_hot_encode(train_data, test_data, columns):
    """One-hot encode `columns` over train and test together.

    Both frames are stacked before calling `pd.get_dummies` so that they end
    up with exactly the same dummy columns; the first level of every column is
    dropped and the result is stored sparsely.

    Returns a `(train_encoded, test_encoded)` tuple, split positionally at the
    number of training rows.
    """
    n_train = len(train_data)
    stacked = pd.concat([train_data, test_data], axis=0)
    dummies = pd.get_dummies(stacked.loc[:, columns], sparse=True, drop_first=True)
    train_part = dummies.iloc[:n_train, :]
    test_part = dummies.iloc[n_train:, :]
    return (train_part, test_part)

def label_encode(train_data, test_data, columns):
    """Label-encode `columns` using the categories seen in the training data.

    Each category gets the integer position at which it first appears in
    `train_data[col]`. Values that have no label (missing values, or test
    categories never seen in training) are encoded as -1.

    Train and test are encoded separately, so the result is correct even when
    the two frames share index labels (e.g. both use a default RangeIndex).

    Parameters
    ----------
    train_data, test_data : pd.DataFrame
        Frames containing every column listed in `columns`.
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Encoded train and test frames, indexed like their inputs, with one
        column named ``'label_' + col`` per input column.
    """
    train_cols = []
    test_cols = []
    for col in columns:
        # factorize()[1] holds the distinct training values in order of appearance.
        categories = pd.factorize(train_data[col])[1]
        labels = pd.Series(range(len(categories)), index=categories)
        encoded_name = 'label_' + col
        train_cols.append(train_data[col].map(labels).fillna(-1).rename(encoded_name))
        test_cols.append(test_data[col].map(labels).fillna(-1).rename(encoded_name))
    if not train_cols:
        # Nothing to encode: return empty frames that still carry the right index.
        return (pd.DataFrame(index=train_data.index),
                pd.DataFrame(index=test_data.index))
    return (pd.concat(train_cols, axis=1),
            pd.concat(test_cols, axis=1))

def freq_encode(train_data, test_data, columns):
    """Frequency-encode `columns` using relative frequencies from the training data.

    Each category is replaced by the share of training rows that carry it.
    Values without a training frequency (missing values, or test categories
    never seen in training) are encoded as 0.

    Train and test are encoded separately, so the result is correct even when
    the two frames share index labels (e.g. both use a default RangeIndex).

    Parameters
    ----------
    train_data, test_data : pd.DataFrame
        Frames containing every column listed in `columns`.
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Encoded train and test frames, indexed like their inputs, with one
        column named ``'freq_' + col`` per input column.
    """
    train_cols = []
    test_cols = []
    nsamples = train_data.shape[0]
    for col in columns:
        # Per-category row count divided by the total number of training rows.
        freqs_cat = train_data.groupby(col)[col].count() / nsamples
        encoded_name = 'freq_' + col
        train_cols.append(train_data[col].map(freqs_cat).fillna(0).rename(encoded_name))
        test_cols.append(test_data[col].map(freqs_cat).fillna(0).rename(encoded_name))
    if not train_cols:
        # Nothing to encode: return empty frames that still carry the right index.
        return (pd.DataFrame(index=train_data.index),
                pd.DataFrame(index=test_data.index))
    return (pd.concat(train_cols, axis=1),
            pd.concat(test_cols, axis=1))
    
def encoding_stats(train_data, test_data, X_train, X_test, target_col, encoding_function, feature_cols_to_use):
    """Summarise how an encoding spreads values within and between categories.

    For every original column that has an encoded counterpart in `X_train`,
    two numbers are computed per split:

    * the mean over categories of the within-category standard deviation of
      the encoded value ("how much randomisation was added");
    * the standard deviation over categories of the within-category mean
      ("how distinguishable the categories are").

    Both are then averaged over all columns.

    Parameters
    ----------
    train_data, test_data : pd.DataFrame
        Original frames holding the raw categorical columns.
    X_train, X_test : pd.DataFrame
        Encoded frames, row-aligned with `train_data` / `test_data`.
    target_col : str
        Target column name; part of the column prefix used by `mean_encode`.
    encoding_function : callable
        The encoder that produced `X_train` / `X_test`; only its `__name__`
        is inspected.
    feature_cols_to_use : list of str
        Names of the raw columns that were encoded.

    Returns
    -------
    tuple of 4 floats
        ``(mean_std_train, std_mean_train, mean_std_test, std_mean_test)``;
        four NaNs for one-hot encoding, where the statistics are not computed.

    Raises
    ------
    ValueError
        If `encoding_function` is not one of the known encoders.
    """
    encoder_name = encoding_function.__name__
    if encoder_name == 'one_hot_encode':
        return np.nan, np.nan, np.nan, np.nan

    # Prefix each encoder puts in front of the original column name.
    prefixes = {
        'mean_encode': 'mean_' + target_col + '_',
        'freq_encode': 'freq_',
        'label_encode': 'label_',
    }
    if encoder_name not in prefixes:
        raise ValueError('Unknown encoding function: {}'.format(encoder_name))
    enc_prefix = prefixes[encoder_name]

    # Pair every raw column with its encoded column. Matching on the full
    # prefixed name avoids pairing with an unrelated column whose tail just
    # happens to equal the raw column name.
    encoded_names = set(X_train.columns)
    cols_to_encoded_mapping = {col: enc_prefix + col
                               for col in feature_cols_to_use
                               if enc_prefix + col in encoded_names}

    train_conc = pd.concat([train_data, X_train], axis=1)
    test_conc = pd.concat([test_data, X_test], axis=1)
    mean_stds_train = []
    std_means_train = []
    mean_stds_test = []
    std_means_test = []
    for raw_col, enc_col in cols_to_encoded_mapping.items():
        train_groups = train_conc.groupby(raw_col)[enc_col]
        test_groups = test_conc.groupby(raw_col)[enc_col]
        # how much randomisation added
        mean_stds_train.append(train_groups.std().mean())
        mean_stds_test.append(test_groups.std().mean())
        # how distinguishable are categories with that encoding
        std_means_train.append(train_groups.mean().std())
        std_means_test.append(test_groups.mean().std())

    return (np.mean(mean_stds_train), np.mean(std_means_train),
            np.mean(mean_stds_test), np.mean(std_means_test))

def test_all_encodings(train_data, test_data, target_col, testing_params, 
                       test_one_hot=False, regression=False, skip_first_iters_graph=0,
                      max_features_one_hot=0.01):
    """Score a model on every encoding, plot the test curves and tabulate the results.

    For each encoding (label, frequency, several mean-encoding variants and,
    optionally, one-hot) all columns except `target_col` are encoded, the
    scoring function is run on the encoded data, the test-score curve is
    plotted, and the best scores plus the `encoding_stats` summary are stored.

    `mean_encode`, `scoring_gbr_sklern` and `scoring_gbc_sklern` are not
    defined in the visible part of this notebook and must exist in the kernel.
    NOTE(review): the scoring functions are assumed to return
    ``(train_scores, test_scores, iters, model)`` with list-like scores, as
    that is how the result is unpacked and indexed below - confirm.

    Parameters
    ----------
    train_data, test_data : pd.DataFrame
        Frames holding the feature columns and `target_col`.
    target_col : str
        Name of the target column.
    testing_params : dict
        Extra keyword arguments forwarded to the scoring function. The dict is
        copied before `max_features` is set, so the caller's dict is untouched.
    test_one_hot : bool, default False
        Also evaluate one-hot encoding.
    regression : bool, default False
        Use the regression scorer (lower is better) instead of the
        classification scorer (higher is better).
    skip_first_iters_graph : int, default 0
        Number of leading iterations left out of the plot.
    max_features_one_hot : float, default 0.01
        `max_features` passed to the scorer for one-hot encoding only.

    Returns
    -------
    pd.DataFrame
        One row per encoding with the best scores, the best iteration and the
        four `encoding_stats` values.
    """
    def mean_setting(alpha, folds, reg_method, label, color):
        # All mean-encoding variants share the same keyword-argument layout.
        params = {'alpha': alpha, 'folds': folds, 'reg_method': reg_method,
                  'add_random': False, 'rmean': 0, 'rstd': 0.0,
                  'target_col': target_col}
        return [mean_encode, params, label, color]

    # Each entry: [encoder, encoder kwargs, display name, line colour].
    # The k-fold labels state the fold count that is actually passed.
    encoding_settings = [
        [label_encode, {}, 'Label encoding', '#960000'],
        [freq_encode, {}, 'Frequency encoding', '#FF2F02'],
        mean_setting(0, None, None, 'Mean encoding, alpha=0', '#A4C400'),
        mean_setting(2, None, None, 'Mean encoding, alpha=2', '#73B100'),
        mean_setting(5, None, None, 'Mean encoding, alpha=5', '#2B8E00'),
        mean_setting(5, 3, 'k_fold', 'Mean encoding, alpha=5, 3 folds', '#00F5F2'),
        mean_setting(5, 5, 'k_fold', 'Mean encoding, alpha=5, 5 folds', '#00BAD3'),
        mean_setting(5, None, 'expanding_mean',
                     'Mean encoding, alpha=5, expanding mean', '#B22BFA'),
    ]
    review_rows = []
    if test_one_hot:
        oh_settings = [[one_hot_encode, {}, 'One hot encoding', '#E7E005']]
        encoding_settings = oh_settings + encoding_settings
    feature_cols_to_use = list(train_data.columns)
    feature_cols_to_use.remove(target_col)
    if regression:
        scoring_function = scoring_gbr_sklern
        best_score_function = min
    else:
        scoring_function = scoring_gbc_sklern
        best_score_function = max
    skip_it = int(skip_first_iters_graph)
    plt.figure(figsize=(10,7))
    for encoding_function, encoding_params, str_name, color in encoding_settings:
        # Work on a copy so the caller's dict is not modified between calls.
        run_params = dict(testing_params)
        if encoding_function.__name__ == 'one_hot_encode':
            run_params['max_features'] = max_features_one_hot
        else:
            run_params['max_features'] = None
        X_train, X_test = encoding_function(train_data, test_data, feature_cols_to_use,
                                            **encoding_params)
        scores = scoring_function(X_train, train_data[target_col], X_test, 
                                    test_data[target_col], 
                                    min_samples_leaf=1, max_depth=3, **run_params)
        train_scores, test_scores, iters, model_ = scores
        plt.plot(iters[skip_it:], 
                 test_scores[skip_it:], 
                 label='Test, ' + str_name, linewidth=1.5, color=color)
        best_score_test = best_score_function(test_scores)
        best_iter_test = iters[test_scores.index(best_score_test)]
        # Best train score among the iterations before the best test iteration.
        # NOTE(review): this slice is empty if best_iter_test is 0 - confirm that
        # `iters` is 1-based.
        best_score_train = best_score_function(train_scores[:best_iter_test])
        print('Best score for {}: is {}, on iteration {} (train score: {})'.format(
            str_name, best_score_test, best_iter_test, best_score_train))
        enc_stats = encoding_stats(train_data, test_data, X_train, X_test, 
                                   target_col, encoding_function, feature_cols_to_use)
        review_rows.append([str_name, best_score_train, best_score_test, best_iter_test] + list(enc_stats))
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    if regression:
        columns=['Encoding', 'Train RMSE score on best iteration', 
             'Best RMSE score (test)', 'Best iteration (test)',
             'EV (train)', 'ED (train)', 'EV (test)', 'ED (test)']
    else:
        columns=['Encoding', 'Train AUC score on best iteration', 
             'Best AUC score (test)', 'Best iteration (test)',
             'EV (train)', 'ED (train)', 'EV (test)', 'ED (test)']
    return pd.DataFrame(review_rows, columns=columns)

In [None]:
# "Country" is used as the reference for what a categorical dtype looks like.
object_type = df["Country"].dtype

# Collect every column sharing that dtype, then drop the target from the list.
categorical_var = [name for name, column_dtype in df.dtypes.items()
                   if column_dtype == object_type]
categorical_var.remove(target_variable)
categorical_var

In [None]:
# Frequency-encode the categorical columns; the frequencies are taken from the training split only.
train, test = freq_encode(train_df, test_df, categorical_var)

In [None]:
# train_test_split returns selections of `df`; take explicit copies so the
# column assignments below act on independent frames and do not raise
# SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

# Overwrite each categorical column with its frequency-encoded version
# (assignment aligns on the row index).
for var in categorical_var:
    train_df[var] = train["freq_" + var]
    test_df[var] = test["freq_" + var]

In [None]:
#train_df = train_df.drop(categorical_var, axis=1)
#test_df = test_df.drop(categorical_var, axis=1)

In [None]:
train_df.dtypes

In [None]:
# Convert the frames to plain arrays: inputs keep the column order of
# `input_variables`, targets are the raw label values.
X_train = train_df[input_variables].values
y_train = train_df[target_variable].values

X_test = test_df[input_variables].values
y_test = test_df[target_variable].values

In [None]:
train_df.dtypes

In [None]:
# Registry of the classifiers to evaluate, keyed by display name (a single entry for now).
models = {'Gradient Boost':GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)}

In [None]:
# Seed intended for the cross-validation shuffle; only referenced by the commented-out cell below.
random_seed=1111

In [None]:

#scores = {}
#for model_name in models:
#    clf = models[model_name];
#    #score = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=random_seed))
#    
#    scores[model_name]=(np.average(score),np.std(score))
#    print('%26s %3.1f %3.1f'%(model_name,100.0*np.average(score),100.0*np.std(score)))

In [None]:
# Fit the gradient-boosting classifier on the training split.
clf = models["Gradient Boost"]
clf.fit(X_train, y_train)

In [None]:
X_train

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [None]:
clf.predict_proba(X_train)[:, 1]

In [None]:
# Predict once per split and reuse the labels for every metric.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# The unfiltered target has more than two answer categories, so the
# one-vs-rest AUC needs the full (n_samples, n_classes) probability matrix;
# a single probability column is rejected for multi-class targets.
# `labels` ties the matrix columns to the classifier's class order.
train_auc = roc_auc_score(y_train, clf.predict_proba(X_train),
                          multi_class='ovr', labels=clf.classes_)
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test),
                         multi_class='ovr', labels=clf.classes_)
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
# Same averaging on both splits so the two recall values are comparable.
train_recall = recall_score(y_train, train_pred, average='macro')
test_recall = recall_score(y_test, test_pred, average='macro')

print(f'train_auc: {train_auc}')
print(f'test_auc: {test_auc}')
print(f'train_accuracy: {train_accuracy}')
print(f'test_accuracy: {test_accuracy}')
print(f'train_recall: {train_recall}')
print(f'test_recall: {test_recall}')

In [None]:
# Built-in feature importances of the fitted classifier (not used by the later cells shown here).
importances = clf.feature_importances_

In [None]:
from sklearn.inspection import permutation_importance
import time

# Permutation importance on the held-out split: 10 repeats per feature,
# fixed seed, two parallel jobs. The wall-clock time is reported afterwards.
start_time = time.time()
result = permutation_importance(
    clf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")



In [None]:
result.importances_std

In [None]:
# Label the mean permutation importances with the feature names, then rank
# them from most to least important. (The model is a gradient-boosting
# classifier; the "forest" in the variable names is only a label.)
forest_importances = pd.Series(result.importances_mean, index=input_variables)
forest_importances_sorted = forest_importances.sort_values(ascending=False)
#forest_importances_sorted

In [None]:
forest_importances_sorted[:10]

In [None]:
# result.importances_std is in the original feature order, while the bars are
# sorted by importance. Label the standard deviations with the feature names
# and reorder them to match, so each error bar sits on its own feature.
importances_std = pd.Series(result.importances_std, index=input_variables)
importances_std_sorted = importances_std.loc[forest_importances_sorted.index]

fig, ax = plt.subplots(figsize=(20,12))
forest_importances_sorted.plot.bar(yerr=importances_std_sorted, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# confusion_matrix orders its rows/columns by the `labels` argument (sorted
# labels when omitted), not by order of appearance in the data. Use the
# classifier's own class order for both the matrix and the frame labels so
# every count is shown under the right class.
class_labels = clf.classes_
cm = confusion_matrix(y_test, clf.predict(X_test), labels=class_labels)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)

In [None]:
# Rows of the confusion matrix are the true classes, columns the predicted
# ones. fmt="d" prints the cell counts as plain integers instead of the
# default short float format, which turns larger counts into e-notation.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, fmt="d", ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()

In [None]:
df[target_variable].value_counts()

In [None]:
# Distinct target answers, in order of first appearance in `df`.
target_values = list(df[target_variable].unique())

In [None]:
# Count how often each class occurs in the test targets.
# The first occurrence counts as 1: get() defaults to 0 before incrementing.
target_value_counts_test = {}
for val in y_test:
    target_value_counts_test[val] = target_value_counts_test.get(val, 0) + 1
target_value_counts_test

In [None]:
# Count how often each class occurs in the training targets.
# The first occurrence counts as 1: get() defaults to 0 before incrementing.
target_value_counts_train = {}
for val in y_train:
    target_value_counts_train[val] = target_value_counts_train.get(val, 0) + 1
target_value_counts_train

## Trying with only 2 labels

In [None]:
df[target_variable].unique()

In [None]:
# Keep only respondents who gave a definite answer: drop the undecided and
# the "no answer" rows in a single membership test.
excluded_answers = ['Maybe yes maybe not', "Don't know/no answer"]
df_2 = df[~df[target_variable].isin(excluded_answers)]
df_2

In [None]:
df_2[target_variable].value_counts()

In [None]:
# Collapse the four remaining answers into a binary YES / NO target.
target_map = {answer: "YES" for answer in ("Probably yes", "Certainly yes")}
target_map.update({answer: "NO" for answer in ("Probably not", "Certainly not")})

# Work on a copy so the filtered frame keeps the original answers.
df_2_fin = df_2.copy()
df_2_fin[target_variable] = df_2[target_variable].replace(target_map)

In [None]:
df_2_fin[target_variable].value_counts()

In [None]:
# Persist the binary-target dataset next to the input file (row index not written).
df_2_fin.to_csv('/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_target_yes_no.csv', index=False)

In [None]:
# NOTE(review): `df` is rebound to the binary-target frame here, so any
# earlier cell re-run after this point sees the filtered data, not the
# original CSV contents.
df = df_2_fin
# Same fixed-seed 80/20 split as in the multi-class experiment.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1111)
print("train_len: ", len(train_df))
print("test_len: ", len(test_df))

In [None]:
# "Country" is used as the reference for what a categorical dtype looks like.
object_type = df["Country"].dtype

# Collect every column sharing that dtype, then drop the target from the list.
categorical_var = [name for name, column_dtype in df.dtypes.items()
                   if column_dtype == object_type]
categorical_var.remove(target_variable)
categorical_var

In [None]:
# Frequency-encode the categorical columns; the frequencies are taken from the training split only.
train, test = freq_encode(train_df, test_df, categorical_var)

In [None]:
# train_test_split returns selections of `df`; take explicit copies so the
# column assignments below act on independent frames and do not raise
# SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

# Overwrite each categorical column with its frequency-encoded version
# (assignment aligns on the row index).
for var in categorical_var:
    train_df[var] = train["freq_" + var]
    test_df[var] = test["freq_" + var]

In [None]:
#train_df = train_df.drop(categorical_var, axis=1)
#test_df = test_df.drop(categorical_var, axis=1)

In [None]:
train_df.dtypes

In [None]:
# Convert the frames to plain arrays: inputs keep the column order of
# `input_variables`, targets are the raw label values.
X_train = train_df[input_variables].values
y_train = train_df[target_variable].values

X_test = test_df[input_variables].values
y_test = test_df[target_variable].values

In [None]:
train_df.dtypes

In [None]:
# Fresh classifier instance for the binary experiment, with the same hyper-parameters as before.
models = {'Gradient Boost':GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)}

In [None]:
# Seed intended for the cross-validation shuffle; only referenced by the commented-out cell below.
random_seed=1111

In [None]:

#scores = {}
#for model_name in models:
#    clf = models[model_name];
#    #score = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=random_seed))
#    
#    scores[model_name]=(np.average(score),np.std(score))
#    print('%26s %3.1f %3.1f'%(model_name,100.0*np.average(score),100.0*np.std(score)))

In [None]:
# Fit the gradient-boosting classifier on the binary-target training split.
clf = models["Gradient Boost"]
clf.fit(X_train, y_train)

In [None]:
X_train

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [None]:
clf.predict_proba(X_train)

In [None]:
# Predict once per split and reuse the labels for every metric.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# The target is binary (YES / NO) here, so roc_auc_score expects a 1-D score
# for the positive class; passing the full (n_samples, 2) matrix is rejected.
# Column 1 of predict_proba belongs to clf.classes_[1], the greater label.
train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
train_recall = recall_score(y_train, train_pred, average='macro')
test_recall = recall_score(y_test, test_pred, average='macro')

print(f'train_auc: {train_auc}')
print(f'test_auc: {test_auc}')
print(f'train_accuracy: {train_accuracy}')
print(f'test_accuracy: {test_accuracy}')
print(f'train_recall: {train_recall}')
print(f'test_recall: {test_recall}')

In [None]:
# Built-in feature importances of the fitted classifier (not used by the later cells shown here).
importances = clf.feature_importances_

In [None]:
from sklearn.inspection import permutation_importance
import time

# Permutation importance on the held-out split: 10 repeats per feature,
# fixed seed, two parallel jobs. The wall-clock time is reported afterwards.
start_time = time.time()
result = permutation_importance(
    clf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")



In [None]:
result.importances_std

In [None]:
# Label the mean permutation importances with the feature names, then rank
# them from most to least important. (The model is a gradient-boosting
# classifier; the "forest" in the variable names is only a label.)
forest_importances = pd.Series(result.importances_mean, index=input_variables)
forest_importances_sorted = forest_importances.sort_values(ascending=False)
#forest_importances_sorted

In [None]:
# result.importances_std is in the original feature order, while the bars are
# sorted by importance. Label the standard deviations with the feature names
# and reorder them to match, so each error bar sits on its own feature.
importances_std = pd.Series(result.importances_std, index=input_variables)
importances_std_sorted = importances_std.loc[forest_importances_sorted.index]

fig, ax = plt.subplots(figsize=(20,12))
forest_importances_sorted.plot.bar(yerr=importances_std_sorted, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# confusion_matrix orders its rows/columns by the `labels` argument (sorted
# labels when omitted), not by order of appearance in the data. Use the
# classifier's own class order for both the matrix and the frame labels so
# every count is shown under the right class.
class_labels = clf.classes_
cm = confusion_matrix(y_test, clf.predict(X_test), labels=class_labels)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)

In [None]:
# Rows of the confusion matrix are the true classes, columns the predicted
# ones. fmt="d" prints the cell counts as plain integers instead of the
# default short float format, which turns larger counts into e-notation.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, fmt="d", ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()