# init

In [None]:
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# CSV that is later read into `recommendation_table`.
recommendation_table_path = (
  "drive/My Drive/Mestrado/data/recommendation table/recommendation-table.csv"
)

# Algorithm names and their meta-table CSVs, index-aligned:
# pathList[k] is the table belonging to algList[k].
algList = ['CC', 'DyS', 'ACC', 'MS']
pathList = [
  "drive/My Drive/Mestrado/data/recommender/meta-table-cc.csv",
  "drive/My Drive/Mestrado/data/recommender/meta-table-dys.csv",
  "drive/My Drive/Mestrado/data/recommender/meta-table-acc.csv",
  "drive/My Drive/Mestrado/data/recommender/meta-table-ms.csv",
]
ccTable, dysTable, accTable, msTable = pathList

# Directories holding the ARFF datasets and (path_index) their index files.
path_arff = "drive/My Drive/Mestrado/data/datasets/arff/"
path_index = "drive/My Drive/Mestrado/data/index/arff/"

# Directory holding the cleaned CSV datasets.
path = "drive/My Drive/Mestrado/data/experimento/datasets/selected/cleaned/"

# NOTE(review): os.listdir returns entries in arbitrary order, while the
# loading code addresses datasets by position -- confirm the order is stable
# in the environment where the meta-tables were produced.
files_arff = os.listdir(path_arff)
files = os.listdir(path)

# load datasets

In [None]:
# Scratch globals, initialised up front as in the original cell.
dataframe = X = y = None

# Parallel lists: X_list[k] holds the features and y_list[k] the target of
# the k-th dataset that was kept.
X_list, y_list = [], []

for i, f in enumerate(files):
  # The files at positions 9 and 14 of the directory listing are not loaded.
  if i in (9, 14):
    continue

  df = pd.read_csv(path + f).dropna()

  # Only the first listed file has its 'author' column removed.
  if i == 0:
    df = df.drop(columns='author')

  # The last column is the target; every remaining column is a feature.
  y = df.pop(df.columns[-1])
  X = df

  X_list.append(X)
  y_list.append(y)

# Kept for backward compatibility; nothing in the visible code fills this list.
df_list = []

# Load every ARFF dataset, binarise its 'class' column (most frequent
# label -> 1, every other label -> 0) and append features / target to the
# X_list / y_list built from the CSV datasets.
for f in files_arff:
  records, _meta = loadarff(path_arff + f)
  dataframe = pd.DataFrame(records)

  # loadarff yields nominal/string attributes as bytes; decode them to str.
  catCols = [col for col in dataframe.columns if dataframe[col].dtype == "O"]
  if catCols:
    dataframe[catCols] = dataframe[catCols].apply(lambda col: col.str.decode('utf8'))

  # Binarise using the class column only. The previous implementation ran
  # DataFrame.replace over the whole frame, which also rewrote feature values
  # equal to the majority label and, for numeric labels, could merge the
  # majority class with an existing label 1. It then relied on a chained
  # in-place `mask`, which does not modify the frame under pandas
  # Copy-on-Write.
  labels = dataframe.pop('class')
  majority_label = labels.mode()[0]

  y = (labels == majority_label).astype(int)
  X = dataframe

  X_list.append(X)
  y_list.append(y)

# Positions, within the combined X_list / y_list, of the datasets to discard.
ignore_list = [24, 27, 28, 34, 35, 38, 50, 51, 61, 64, 71, 74, 80, 82, 98, 111, 116, 119, 129, 143, 144, 148, 149, 155, 156, 157, 159, 160]

# Delete from the highest position downwards so that removing one entry never
# shifts the positions still waiting to be removed. Positions past the end of
# the list are skipped; the previous countdown started at len(X_list), one
# past the last valid index, and raised IndexError if that value was listed.
dataset_count = len(X_list)
for position in sorted(set(ignore_list), reverse=True):
  if position < dataset_count:
    del X_list[position]
    del y_list[position]

# Reset the scratch counter, as the original cell did.
i = 0

# This is necessary because the instance at position 82 of the meta-table
# has values that are invalid for the decision-tree algorithm.
del X_list[82]
del y_list[82]

# Final expression of the cell: displays how many datasets remain.
len(y_list)

# load absolute error for each dataset from each algorithm

In [None]:
# Maps each algorithm name in algList to the last column of its meta-table
# (one value per meta-table row).
abs_error_dict = {}

# The loop variable is deliberately not called `path`: that name holds the
# CSV dataset directory used by the loading cell, and overwriting it here
# made that cell fail when re-run afterwards.
for alg_name, table_path in zip(algList, pathList):
  table = pd.read_csv(table_path)

  # Drop the row labelled 82, matching the entry removed at position 82 of
  # X_list / y_list.
  table.drop(82, inplace=True)

  abs_error_dict[str(alg_name)] = table.values[:, -1]

In [None]:
recommendation_table = pd.read_csv(recommendation_table_path)

# Ideal choice: its absolute error and numeric quantifier id, as arrays.
abs_error_ideal_list, quantifier_ideal_list = (
  recommendation_table[column].values
  for column in ('abs-error-ideal', 'quantifier-ideal-num')
)

# Recommended choice: the same two columns for the recommended quantifier.
abs_error_recommended_list, quantifier_recommended_list = (
  recommendation_table[column].values
  for column in ('abs-error-recommended', 'quantifier-recommended-num')
)

In [None]:
# Re-order the rows by ascending ideal absolute error. The trailing bare
# expression makes the notebook display the sorted table.
recommendation_table = recommendation_table.sort_values(by='abs-error-ideal')
recommendation_table

# calculate the AUC of each dataset and plot it against the absolute error of each algorithm (e.g. AUC x CC, AUC x DyS)

In [None]:
# One classifier object is reused; each fit() call retrains it from scratch.
clf = RandomForestClassifier(n_estimators=500, random_state=42)

# auc_list[k] is the ROC AUC obtained on the held-out split of dataset k.
auc_list = []
for X, y in zip(X_list, y_list):
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
  )

  clf.fit(X_train, y_train)

  # Column 1 of predict_proba is the probability of the second entry of
  # clf.classes_.
  y_prediction_proba = clf.predict_proba(X_test)[:, 1]

  auc_list.append(metrics.roc_auc_score(y_test, y_prediction_proba))

In [None]:
# Scatter of AUC against the absolute error of the ideal quantifier, with
# each point coloured by the numeric id of that quantifier.
x = auc_list
y = abs_error_ideal_list
label = quantifier_ideal_list

# colors[k] and names[k] both belong to quantifier id k.
colors = ['red', 'green', 'blue', 'yellow']
names = ['CC', 'DyS', 'ACC', 'MS']

fig = plt.figure(figsize=(30,10))

# Pin the colour range so that id k always maps to colors[k], with each
# colour band centred on its integer id. Without vmin/vmax the range was
# taken from the observed labels, so colours and tick labels drifted apart
# whenever a quantifier had no points.
plt.scatter(x, y, c=label, cmap=matplotlib.colors.ListedColormap(colors),
            vmin=-0.5, vmax=len(colors) - 0.5)

plt.title("AUC x Absolute Error (Melhor algoritmo)")
plt.xlabel("AUC")
plt.ylabel("ABS ERROR")

cb = plt.colorbar()
# One tick per quantifier id, placed at the centre of its colour band.
cb.set_ticks(np.arange(len(colors)))

unique, counts = np.unique(label, return_counts=True)
counts = dict(zip(unique, counts))

# counts.get(..., 0): a quantifier with no points is reported as 0 instead
# of raising KeyError.
cb.set_ticklabels([f"{name} ({counts.get(code, 0)} pontos)"
                   for code, name in enumerate(names)])

In [None]:
# Scatter of AUC against the absolute error of the recommended quantifier,
# with each point coloured by the numeric id of that quantifier.
x = auc_list
y = abs_error_recommended_list
label = quantifier_recommended_list

# colors[k] and names[k] both belong to quantifier id k.
colors = ['red', 'green', 'blue', 'yellow']
names = ['CC', 'DyS', 'ACC', 'MS']

fig = plt.figure(figsize=(30,10))

# Pin the colour range so that id k always maps to colors[k], with each
# colour band centred on its integer id. Without vmin/vmax the range was
# taken from the observed labels, so colours and tick labels drifted apart
# whenever a quantifier was never recommended.
plt.scatter(x, y, c=label, cmap=matplotlib.colors.ListedColormap(colors),
            vmin=-0.5, vmax=len(colors) - 0.5)

plt.title("AUC x Absolute Error (Algoritmo predito)")
plt.xlabel("AUC")
plt.ylabel("ABS ERROR")

cb = plt.colorbar()
# One tick per quantifier id, placed at the centre of its colour band.
cb.set_ticks(np.arange(len(colors)))

unique, counts = np.unique(label, return_counts=True)
counts = dict(zip(unique, counts))

# counts.get(..., 0): a quantifier with no points is reported as 0 instead
# of raising KeyError.
cb.set_ticklabels([f"{name} ({counts.get(code, 0)} pontos)"
                   for code, name in enumerate(names)])

In [None]:
# Scatter of AUC against |ideal error - recommended error|, coloured by
# whether the recommended quantifier equals the ideal one.
x = auc_list

y1 = abs_error_ideal_list
y2 = abs_error_recommended_list

# Element-wise comparison: True where recommended and ideal ids are equal.
y3 = quantifier_ideal_list == quantifier_recommended_list
# Two-colour map for the boolean y3: False -> 'red', True -> 'blue'.
colors = ['red', 'blue']

# Gap between the two absolute errors, computed once and reused below.
error_gap = abs(y1 - y2)

print(y3)

plt.figure(figsize=(30,10))
plt.scatter(x, error_gap, c=y3, cmap=matplotlib.colors.ListedColormap(colors))

plt.title('AUC x ABSOLUTE ERROR')
# NOTE(review): this legend text says blue = ideal and orange = regressor,
# but the points are coloured red/blue by y3 -- confirm and update the text.
plt.xlabel('AUC\nAzul = Ideal\nLaranja = Regressor')
plt.ylabel('ABS ERROR')

plt.show()

# First ideal and recommended errors, then mean and std of the gap.
print(y1[0])
print(y2[0])
print(np.mean(error_gap))
print(np.std(error_gap))

In [None]:
# AUC x CC: values stored under 'CC' in abs_error_dict, plotted against AUC.
x, y = auc_list, abs_error_dict['CC']

plt.title("AUC x Absolute Error (CC)")
plt.plot(x, y, 'o')  # markers only, no connecting line

plt.xlabel("AUC")
plt.ylabel("ABS ERROR")

In [None]:
# AUC x DyS: values stored under 'DyS' in abs_error_dict, plotted against AUC.
x, y = auc_list, abs_error_dict['DyS']

plt.title("AUC x Absolute Error (DyS)")
plt.plot(x, y, 'o')  # markers only, no connecting line

plt.xlabel("AUC")
plt.ylabel("ABS ERROR")

In [None]:
# AUC x ACC: values stored under 'ACC' in abs_error_dict, plotted against AUC.
x, y = auc_list, abs_error_dict['ACC']

plt.title("AUC x Absolute Error (ACC)")
plt.plot(x, y, 'o')  # markers only, no connecting line

plt.xlabel("AUC")
plt.ylabel("ABS ERROR")

In [None]:
# AUC x MS: values stored under 'MS' in abs_error_dict, plotted against AUC.
x, y = auc_list, abs_error_dict['MS']

plt.title("AUC x Absolute Error (MS)")
plt.plot(x, y, 'o')  # markers only, no connecting line

plt.xlabel("AUC")
plt.ylabel("ABS ERROR")

In [None]:
# Bare expression: the notebook displays the array stored under 'CC'.
abs_error_dict['CC']