# Public non-image dataset

In [None]:
# Mount Google Drive in the Colab runtime; the CSV files loaded further down
# are read from a path under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn import preprocessing
from sklearn.metrics import *
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import *
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sb

### Variables

In [None]:
# Load the four ';'-separated bank CSV files from the mounted drive.
bank = pd.read_csv('drive/MyDrive/AI_1/bank.csv',sep=';')
bank_a = pd.read_csv('drive/MyDrive/AI_1/bank-additional.csv',sep=';')
bank_f = pd.read_csv('drive/MyDrive/AI_1/bank-full.csv',sep=';')
bank_af = pd.read_csv('drive/MyDrive/AI_1/bank-additional-full.csv',sep=';')
# data_list and data_name are parallel: data_name[i] is the display name of data_list[i].
data_list = [bank, bank_a, bank_f, bank_af]
data_name = ['bank', 'bank-additional', 'bank-full', 'bank-additional-full']
# Row labels of the score tables: the seven metrics returned by ML(), followed by
# the 'points' row that Rank() appends. ('accuracy' was previously misspelled.)
score_idx = ['accuracy','precision (no)','precision (yes)','recall (no)','recall (yes)','f1 (no)','f1 (yes)','points']

# Classifiers with default hyper-parameters, shared by the cells below.
dec_tree = tree.DecisionTreeClassifier()
rnd_forest = RandomForestClassifier()
nb = GaussianNB()

### Data Checking & Visualization

In [None]:
def null_check(data):
  """Report whether a DataFrame contains any missing value.

  Args:
    data: pandas DataFrame to inspect.

  Returns:
    -1 if at least one cell is null (as defined by ``pd.isnull``), else 0.
  """
  # One vectorised reduction instead of a Python loop over every cell.
  has_null = pd.isnull(data).to_numpy().any()
  return -1 if has_null else 0

def visual(data,name):
  """Show a bar chart with the number of rows per label in column 'y'.

  Rows whose label equals 'no' are counted in the first bar; every other
  row is counted in the 'yes' bar.

  Args:
    data: DataFrame containing a 'y' column.
    name: text appended to the chart title.
  """
  labels = np.array(data['y'])

  no_total = sum(1 for label in labels if label == 'no')
  yes_total = len(labels) - no_total

  positions = [0,1]
  plt.figure(figsize=(5,5))
  plt.bar(positions,[no_total,yes_total])
  plt.xticks(positions,['no','yes'])
  plt.title('Label Count '+name,fontsize=20)
  plt.show()

In [None]:
# Validate every dataset and plot its label distribution; stop at the first
# dataset that contains a missing value.
for d,n in zip(data_list,data_name):
  flag = null_check(d)
  if flag == -1:
    # Report the offending dataset's own name, not the whole list of names.
    print(n,': null detected')
    break
  visual(d,n)

### Data Preprocessing, Model Construction, Training & Acquiring Results

In [None]:
def preprocess(data,fit_data):
  """Split off the label column and one-hot encode the categorical features.

  The encoder is fitted on ``fit_data`` and then applied to ``data``, so the
  encoded width is determined by the categories present in ``fit_data``.

  Args:
    data: DataFrame to transform; must contain the label column 'y'.
    fit_data: DataFrame (also containing 'y') used to decide which columns
      are categorical and to fit the one-hot encoder.

  Returns:
    (Y, X): ``Y`` is the 'y' column of ``data`` as an array; ``X`` is a 2-D
    array holding the numeric columns of ``data`` followed by the one-hot
    encoded categorical columns.
  """
  Y = np.array(data['y'])
  data = data.drop(columns=['y'])
  fit_data = fit_data.drop(columns=['y'])

  # A column counts as categorical when its first value is a string.
  # iloc is positional, so this also works when the index has no label 0
  # (e.g. a filtered or sampled frame), where loc[0] would raise KeyError.
  cat_idx = []
  num_idx = []
  for col, value in zip(fit_data.columns,fit_data.iloc[0]):
    if isinstance(value,str):
      cat_idx.append(col)
    else:
      num_idx.append(col)

  # encode categorical attributes with OneHotEncoder
  enc = OneHotEncoder()
  enc.fit(np.array(fit_data[cat_idx]))
  encoded_data = enc.transform(np.array(data[cat_idx])).toarray()
  data = np.hstack((np.array(data[num_idx]),encoded_data))

  return Y, data

def ML(data,clf,*args,n_splits=3):
  """Cross-validate ``clf`` and return averaged scores plus a summed confusion matrix.

  Args:
    data: DataFrame with a 'y' label column holding 'no'/'yes'. It is used
      to fit the one-hot encoder and, unless a separate frame is passed in
      ``args``, it is also the data that gets cross-validated.
    clf: classifier exposing ``fit`` and ``predict``. It is fitted in place,
      so after the call it holds the model trained on the last fold.
    *args: optional; ``args[0]`` is a DataFrame to cross-validate instead of
      ``data`` (the encoder is still fitted on ``data``).
    n_splits: number of shuffled K-fold splits (default 3).

  Returns:
    (form, matrix): ``form`` is a list of seven one-element lists, in the
    order accuracy, precision (no), precision (yes), recall (no),
    recall (yes), f1 (no), f1 (yes), each averaged over the folds.
    ``matrix`` is the 2x2 integer confusion matrix summed (not averaged)
    over the folds, with labels ordered ['no','yes'].
  """
  labels = ['no','yes']
  train_data = args[0] if args else data
  Y, X = preprocess(train_data,data)

  # Explicit numpy accumulators: the per-class scores are added element-wise.
  acc = 0.0
  pre = np.zeros(2)
  rec = np.zeros(2)
  f1 = np.zeros(2)
  matrix = np.zeros((2,2),dtype=np.int64)

  # shuffled k-fold cross validation; train and score one fold at a time
  kf = KFold(n_splits=n_splits,shuffle=True)
  for train, test in kf.split(X):
    clf.fit(X[train],Y[train])
    result = clf.predict(X[test])
    acc += accuracy_score(Y[test],result)
    pre += precision_score(Y[test],result,average=None,labels=labels)
    rec += recall_score(Y[test],result,average=None,labels=labels)
    f1 += f1_score(Y[test],result,average=None,labels=labels)
    matrix += confusion_matrix(Y[test],result,labels=labels)

  # One single-element row per metric so callers can np.hstack() the results.
  form = [[acc/n_splits],
          [pre[0]/n_splits],[pre[1]/n_splits],
          [rec[0]/n_splits],[rec[1]/n_splits],
          [f1[0]/n_splits],[f1[1]/n_splits]]
  return form, matrix

def Rank(scores):
  """Append per-row rankings and an overall points row to ``scores`` in place.

  Each row of ``scores`` holds one value per column. For every row the
  column indices are ordered from the highest to the lowest value; that
  order is appended to the row as a string such as '2>0>1', and each column
  earns ``len(row) - position - 1`` points. Finally a new row is appended
  holding every column's total points (as strings) followed by the overall
  ranking string derived from those totals.

  Args:
    scores: non-empty list of lists of comparable values; modified in place.

  Returns:
    None.
  """
  # One slot per column plus a trailing placeholder slot that never earns points.
  totals = [0] * (len(scores[0]) + 1)

  for row in scores:
    width = len(row)
    order = sorted(range(width),key=row.__getitem__,reverse=True)
    for place, col in enumerate(order):
      totals[col] += width - place - 1
    row.append('>'.join(map(str,order)))

  # The sort is stable and points are never negative, so the placeholder slot
  # ends up last and is cut off by the slice below.
  overall = sorted(range(len(totals)),key=totals.__getitem__,reverse=True)
  summary = '>'.join(str(col) for col in overall[:-1])

  scores.append([str(total) for total in totals[:-1]] + [summary])

### Naive Bayes

In [None]:
# Evaluate Gaussian naive Bayes on every dataset: one score column and one
# confusion matrix per dataset.
results = [ML(d,nb) for d in data_list]
matrics = [matrix for _, matrix in results]
scores = np.hstack([column for column, _ in results]).tolist()

# Rank() appends the per-metric ranking column and the points row in place.
Rank(scores)
form = pd.DataFrame(scores,columns=data_name+['rank'],index=score_idx)
display(form)

# One heat map of the summed confusion matrix per dataset.
class_labels = ['no','yes']
for m, n in zip(matrics,data_name):
  form = pd.DataFrame(m,index=class_labels,columns=class_labels)
  sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
  plt.xlabel('predicted')
  plt.ylabel('real')
  plt.title(n)
  plt.show()

### Decision Tree

In [None]:
# Evaluate the decision tree on every dataset and record the depth of the
# tree left in dec_tree after each run (the tree fitted on the last fold).
matrics = []
depth = []
metric_columns = []
for d in data_list:
  column, matrix = ML(d,dec_tree)
  metric_columns.append(column)
  matrics.append(matrix)
  depth.append(dec_tree.get_depth())

# Rank() appends the per-metric ranking column and the points row in place.
scores = np.hstack(metric_columns).tolist()
Rank(scores)
form = pd.DataFrame(scores,columns=data_name+['rank'],index=score_idx)
display(form)

# One heat map of the summed confusion matrix per dataset.
class_labels = ['no','yes']
for m, n in zip(matrics,data_name):
  form = pd.DataFrame(m,index=class_labels,columns=class_labels)
  sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
  plt.xlabel('predicted')
  plt.ylabel('real')
  plt.title(n)
  plt.show()

### Random Forest

In [None]:
# Evaluate the random forest on every dataset: one score column and one
# confusion matrix per dataset.
results = [ML(d,rnd_forest) for d in data_list]
matrics = [matrix for _, matrix in results]
scores = np.hstack([column for column, _ in results]).tolist()

# Rank() appends the per-metric ranking column and the points row in place.
Rank(scores)
form = pd.DataFrame(scores,columns=data_name+['rank'],index=score_idx)
display(form)

# One heat map of the summed confusion matrix per dataset.
class_labels = ['no','yes']
for m, n in zip(matrics,data_name):
  form = pd.DataFrame(m,index=class_labels,columns=class_labels)
  sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
  plt.xlabel('predicted')
  plt.ylabel('real')
  plt.title(n)
  plt.show()

# Experiment

### Fewer layers (limited depth) for the decision tree

In [None]:
# Print the tree depth recorded for each dataset in the Decision Tree cell.
for n, tree_depth in zip(data_name,depth):
  print('depth of',n,':',tree_depth)

In [None]:
# Compare decision trees of different maximum depth on the bank-full data.
DEPTH = [2,5,10,15,20,25,30,35]
depth_labels = ['depth '+str(d) for d in DEPTH]

matrics = []
scores = [[] for i in range(7)]
for d in DEPTH:
  clf = tree.DecisionTreeClassifier(max_depth=d)
  result = ML(bank_f,clf)
  scores = np.hstack((scores,result[0]))
  matrics.append(result[1])

# Rank() appends the per-metric ranking column and the points row in place.
scores = scores.tolist()
Rank(scores)
form = pd.DataFrame(scores,columns=depth_labels+['rank'],index=score_idx)
display(form)

# One heat map per depth. Pairing with data_name (4 entries) would draw only
# the first four matrices and title them with dataset names.
for m, n in zip(matrics,depth_labels):
  form = pd.DataFrame(m,['no','yes'],['no','yes'])
  sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
  plt.xlabel('predicted')
  plt.ylabel('real')
  plt.title(n)
  plt.show()

### more trees in random forest

In [None]:
# Compare random forests with different numbers of trees on the bank-full data.
tree_num = [100,200,300,400]
tree_labels = ['tree_num '+str(t) for t in tree_num]

matrics = []
scores = [[] for i in range(7)]
for t in tree_num:
  more_tree = RandomForestClassifier(n_estimators=t)
  result = ML(bank_f,more_tree)
  scores = np.hstack((scores,result[0]))
  matrics.append(result[1])

# Rank() appends the per-metric ranking column and the points row in place.
scores = scores.tolist()
Rank(scores)
form = pd.DataFrame(scores,columns=tree_labels+['rank'],index=score_idx)
display(form)

# One heat map per forest size, titled with the tree count it belongs to
# rather than with a dataset name.
for m, n in zip(matrics,tree_labels):
  form = pd.DataFrame(m,['no','yes'],['no','yes'])
  sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
  plt.xlabel('predicted')
  plt.ylabel('real')
  plt.title(n)
  plt.show()

### more balanced data

In [None]:
# Undersample the majority class: train on every 'yes' row plus an equally
# large random sample of 'no' rows; the remaining 'no' rows form a held-out set.
yn_cnt = bank_af['y'].value_counts()

yes_data = bank_af[bank_af['y'] == 'yes'].reset_index(drop=True)
no_data = bank_af[bank_af['y'] == 'no'].reset_index(drop=True)
rnd_no = no_data.sample(n=yn_cnt['yes'])
rest_data = no_data.drop(index=rnd_no.index)
# Reuse rnd_no here: drawing a second random sample would put rows of
# rest_data into the training set, so the held-out rows would not be unseen.
train_data = pd.concat([yes_data,rnd_no],ignore_index=True)
all_data = pd.concat([yes_data,no_data],ignore_index=True)

model = RandomForestClassifier()

matrics = []
scores = [[] for i in range(7)]

# Cross-validate on the balanced sample; the encoder is fitted on all rows so
# every category gets a column.
result = ML(all_data,model,train_data)
scores = np.hstack((scores,result[0]))
matrics.append(result[1])

form = pd.DataFrame(scores,columns=['score'],index=score_idx[0:7])
display(form)

# Score the model left over from the last fold on the held-out 'no' rows.
all_data = pd.concat([rest_data,train_data],ignore_index=True)
y, x = preprocess(rest_data,all_data)
print('-------------------------')
print(model.score(x,y))

# Only one matrix is produced here, so give it its own title instead of
# borrowing the first dataset name.
for m in matrics:
  form = pd.DataFrame(m,['no','yes'],['no','yes'])
  sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
  plt.xlabel('predicted')
  plt.ylabel('real')
  plt.title('bank-additional-full (balanced)')
  plt.show()
# NOTE: the results were very poor at first; I suspect the data fed into
# training was missing a few attributes, so I want to reshuffle the data.