In [1]:
# -*- coding: utf-8 -*-
# Notebook setup: every import used by the cells below, the pandas display
# options, and the Bokeh notebook output hook.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
# Let rendered DataFrames show up to 500 rows and 20 columns before truncating.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 20)

import numpy as np

from scipy import stats

from bokeh.io import show, output_file, export_png
from bokeh.plotting import figure
# NOTE(review): `figure` is imported a second time on the next line (and `show`
# was already imported from bokeh.io above); harmless but redundant.
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20b
from bokeh.models import LabelSet
import math

# Route Bokeh `show()` output to the notebook cells.
output_notebook()



In [2]:
# Identify the Google Sheet and the tab that holds the selected papers.
sheet_id = "1s4qnV7d6VqVk-m-VEZyASdyt7K6RvR8-"
sheet_name = "Final_papers"

# The gviz endpoint returns the requested tab as CSV, which pandas can read
# straight from the URL.
csv_export_template = "https://docs.google.com/spreadsheets/d/{0}/gviz/tq?tqx=out:csv&sheet={1}"
url = csv_export_template.format(sheet_id, sheet_name)
df = pd.read_csv(url)

In [3]:
# Preview the first rows of the loaded sheet as a quick check of the import.
df.head()

Unnamed: 0,Title,Authors,Year,Source (name of publication),"Type of publication (Journal, Book, Conference proceedings, etc)",DOI,Factors,Detailed factors,Methods,Detailed methods,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,A Comparative Study for the Selection of Machi...,"Kumar, C.; Käppel, M.; Schützenmeier, N.;Eisen...",2019,DATA 2019 - Proceedings of the 8th Internation...,Conference Paper,10.5220/0008117404080415,Data,20 dataset characteristics,Meta-learning,C5.0,...,,,,,,,,,,
1,Metalearning for choosing feature selection al...,"Parmezan, A.R.S.; Lee, H.D.; Wu, F.C.",2017,Expert Systems with Applications,Article,10.1016/j.eswa.2017.01.013,Data,"Simple, statistical and information theoretical",Meta-learning,C5.0,...,,,,,,,,,,
2,Model complexity and algorithm selection in cl...,"Hilario, M.",2002,5th International Conference on Discovery Science,Conference Paper,10.1007/3-540-36182-0_12,Data,"21 (five simple, six of statistical, six of in...",Meta-learning,J48,...,,,,,,,,,,
3,On learning algorithm selection for classifica...,"Ali, S; Smith, KA",2006,Applied Soft Computing,Article,10.1016/j.asoc.2004.12.002,Data + Algorithm characteristics,"Type of data, dimension of the dataset, outlie...",Review,Literature review,...,,,,,,,,,,
4,Towards a unified model representation of mach...,"Martínez-Rojas, A.; Jiménez-Ramírez, A.; Enríq...",2019,WEBIST 2019 - Proceedings of the 15th Internat...,Conference Paper,10.5220/0008559204700476,Data + Task,Not detailed,Review,Cheat sheet review,...,,,,,,,,,,


In [4]:
# Report how many rows (one per paper) the loaded sheet contains.
paper_count = len(df)
print("Number of papers resulting the SLR: ", paper_count)

Number of papers resulting the SLR:  9


In [5]:
# Flatten the semicolon-separated "Authors" column into one entry per author.
# Missing cells are dropped first: a NaN is a float and has no `.split`, so a
# single empty cell would otherwise abort the whole loop.
authors = df["Authors"].dropna().values

# One pass builds the list directly (no list comprehension run only for its
# side effects); surrounding whitespace is trimmed and blank fragments, such as
# the one left by a trailing ';', are skipped.
authors_list = [
    name.strip()
    for cell in authors
    for name in str(cell).split(';')
    if name.strip()
]

# Building the frame from a dict keeps the 'Name' column even when no author
# was found; assigning `.columns` on an empty frame raises a length mismatch.
dfAuthors = pd.DataFrame({'Name': authors_list})

In [6]:
# Count how often each author name occurs; `value_counts` already orders the
# result from most to least frequent.
author_tally = dfAuthors['Name'].value_counts()

# Move the names out of the index into a regular column and label both columns.
resultsAuthors = author_tally.reset_index()
resultsAuthors.columns = ['Name', 'Count']

In [7]:
# Bar chart of how many times each author name occurs in the "Authors" column.
author_names = resultsAuthors['Name'].tolist()
author_counts = resultsAuthors['Count'].tolist()

p = figure(x_range=author_names, plot_width=990, toolbar_location=None, title="Authors Counts")
p.vbar(x=author_names, top=author_counts, width=0.3, color='#571742')
p.xgrid.grid_line_color = None
p.y_range.start = 0
# Keep the original ceiling of 4 while counts are small, but let it grow with
# the data so a bar is never clipped by a fixed axis limit.
p.y_range.end = max(4, max(author_counts, default=0) + 1)
# Rotate the author names by 90 degrees so long names do not overlap.
p.xaxis.major_label_orientation = math.pi/2

show(p)

In [8]:
# Tally the papers per publication year.
year_tally = df['Year'].value_counts().reset_index()
year_tally.columns = ['Year', 'Count']

# Most recent year first; the original row labels from the tally are kept.
resultsYears = year_tally.sort_values(by=['Year', 'Count'], ascending=[False, False])
resultsYears

Unnamed: 0,Year,Count
1,2022,2
4,2021,1
0,2019,2
2,2018,1
6,2017,1
3,2006,1
5,2002,1


In [9]:
# Bar chart of papers per year on a continuous year axis.
years = resultsYears['Year'].astype('int')
bar_years = years.astype('str').tolist()
bar_counts = resultsYears['Count'].astype('int').tolist()

# The categorical axis spans every year from the earliest to the latest paper,
# so years without papers show up as gaps. Deriving the span from the data
# (instead of a hand-typed list of the missing years) keeps the axis correct
# and free of duplicate factors when papers are added to the sheet.
if len(years):
    x = [str(year) for year in range(int(years.min()), int(years.max()) + 1)]
else:
    x = []

p = figure(x_range=x, plot_width=900, plot_height=400, toolbar_location=None, title="Papers per year")
p.vbar(x=bar_years, top=bar_counts, color='#571742', width=0.9)
p.xgrid.grid_line_color = None
p.yaxis[0].ticker.desired_num_ticks = 7
p.y_range.start = 0
# Keep the original ceiling of 6.5 while counts are small, but let it grow
# with the data so the tallest bar is never clipped.
p.y_range.end = max(6.5, max(bar_counts, default=0) + 0.5)
p.title.text_font_size = '12pt'
p.xaxis.major_label_text_font_size = '12px'
p.yaxis.major_label_text_font_size = '12px'

show(p)

In [10]:
# Tally the papers per publication type.
publication_type_column = 'Type of publication (Journal, Book, Conference proceedings, etc)'
type_tally = df[publication_type_column].value_counts().reset_index()
type_tally.columns = ['Type', 'Count']

# Largest count first; ties are ordered by type name, descending.
resultsType = type_tally.sort_values(by=['Count', 'Type'], ascending=[False, False])
resultsType

Unnamed: 0,Type,Count
0,Conference Paper,5
1,Article,4


In [11]:
# Bar chart of paper counts per publication type.
type_labels = resultsType['Type'].tolist()
type_counts = resultsType['Count'].tolist()

p = figure(
    x_range=type_labels,
    plot_width=600,
    plot_height=600,
    title="Publication type",
)
p.vbar(x=type_labels, top=type_counts, width=0.9, color='#571742')

# Axis styling: y axis anchored at zero, about ten ticks, no vertical grid lines.
p.y_range.start = 0
p.yaxis[0].ticker.desired_num_ticks = 10
p.xgrid.grid_line_color = None

show(p)

In [12]:
# Tally the semicolon-separated labels of the "Factors" column.
# Missing cells are dropped first: a NaN is a float and has no `.split`, so a
# single empty cell would otherwise abort the whole cell.
factors = df["Factors"].dropna().values

# Echo the raw values so they can be compared with the normalised labels below.
for raw_value in factors:
    print(raw_value)

# Normalise each label (trimmed, first letter upper-case, rest lower-case) and
# skip blank fragments such as the one left by a trailing ';'. The list is
# built directly rather than through a comprehension run for its side effects.
factors_list = [
    label.strip().lower().capitalize()
    for cell in factors
    for label in str(cell).split(';')
    if label.strip()
]

# Building the frame from a dict keeps the 'Name' column even when no label
# was found; assigning `.columns` on an empty frame raises a length mismatch.
dfFactors = pd.DataFrame({'Name': factors_list})

resultsFactors = dfFactors['Name'].value_counts().reset_index()
resultsFactors.columns = ['Name', 'Count']
# Smallest count first; ties are ordered by name, descending.
resultsFactors = resultsFactors.sort_values(['Count', 'Name'], ascending=[True, False])
resultsFactors

Data
Data
Data
Data + Algorithm characteristics
Data + Task
Data
Data + Algorithm requirements
Data + Problem type
Data


Unnamed: 0,Name,Count
1,Data + task,1
2,Data + problem type,1
3,Data + algorithm requirements,1
4,Data + algorithm characteristics,1
0,Data,5


In [13]:
# Bar chart of paper counts per factor label.
factor_labels = resultsFactors['Name'].astype('str').tolist()
factor_counts = resultsFactors['Count'].astype('int').tolist()

p = figure(
    x_range=factor_labels,
    plot_width=800,
    plot_height=500,
    title="Factors",
)
p.vbar(x=factor_labels, top=factor_counts, width=0.5, color='#571742')

# Axis styling: y axis anchored at zero, about twelve ticks, tilted x labels,
# no vertical grid lines.
p.y_range.start = 0
p.yaxis[0].ticker.desired_num_ticks = 12
p.xaxis.major_label_orientation = math.pi/3.8
p.xgrid.grid_line_color = None

show(p)

In [14]:
# Tally the papers per method.
method_tally = df['Methods'].value_counts().reset_index()
method_tally.columns = ['Name', 'Count']

# Smallest count first; ties are ordered by name, descending.
resultsMethods = method_tally.sort_values(by=['Count', 'Name'], ascending=[True, False])
resultsMethods

Unnamed: 0,Name,Count
2,Ontology,1
0,Review,4
1,Meta-learning,4


In [15]:
# Bar chart of paper counts per method.
method_labels = resultsMethods['Name'].astype('str').tolist()
method_counts = resultsMethods['Count'].astype('int').tolist()

p = figure(
    x_range=method_labels,
    plot_width=600,
    plot_height=500,
    title="Methods",
)
p.vbar(x=method_labels, top=method_counts, width=0.5, color='#571742')

# Axis styling: y axis anchored at zero, about twelve ticks, x labels tilted
# by 45 degrees in a 9pt font, no vertical grid lines.
p.y_range.start = 0
p.yaxis[0].ticker.desired_num_ticks = 12
p.xaxis.major_label_text_font_size = "9pt"
p.xaxis.major_label_orientation = math.pi/4
p.xgrid.grid_line_color = None

show(p)

In [16]:
# Tally the semicolon-separated labels of the "Problems supported" column.
# Missing cells are dropped first: a NaN is a float and has no `.split`, so a
# single empty cell would otherwise abort the whole cell.
problem = df["Problems supported"].dropna().values

# Normalise each label (trimmed, first letter upper-case, rest lower-case) and
# skip blank fragments such as the one left by a trailing ';'. Using a
# comprehension also avoids a loop variable named `p`, which would overwrite
# the Bokeh figure `p` from the plotting cells with a plain string.
problem_list = [
    label.strip().lower().capitalize()
    for cell in problem
    for label in str(cell).split(';')
    if label.strip()
]

# Building the frame from a dict keeps the 'Name' column even when no label
# was found; assigning `.columns` on an empty frame raises a length mismatch.
dfProblem = pd.DataFrame({'Name': problem_list})

resultsProblem = dfProblem['Name'].value_counts().reset_index()
resultsProblem.columns = ['Name', 'Count']
# Smallest count first; ties are ordered by name, descending.
resultsProblem = resultsProblem.sort_values(['Count', 'Name'], ascending=[True, False])
resultsProblem

Unnamed: 0,Name,Count
5,Encoding,1
4,Feature selection,2
2,Dimensionality reduction,2
3,Clustering,2
1,Regression,3
0,Classification,7


In [17]:
# Bar chart of paper counts per supported problem.
problem_labels = resultsProblem['Name'].astype('str').tolist()
problem_counts = resultsProblem['Count'].astype('int').tolist()

p = figure(
    x_range=problem_labels,
    plot_width=600,
    plot_height=500,
    title="Problems supported",
)
p.vbar(x=problem_labels, top=problem_counts, width=0.5, color='#571742')

# Axis styling: y axis anchored at zero, x labels tilted by 45 degrees,
# no vertical grid lines.
p.y_range.start = 0
p.xaxis.major_label_orientation = math.pi/4
p.xgrid.grid_line_color = None

show(p)

In [18]:
# Tally the values of the "Tested?" column.
tested_tally = df['Tested?'].value_counts().reset_index()
tested_tally.columns = ['Name', 'Count']

# Alphabetical by answer; ties are ordered by count, descending.
resultsTest = tested_tally.sort_values(by=['Name', 'Count'], ascending=[True, False])
resultsTest

Unnamed: 0,Name,Count
1,No,2
0,Yes,7


In [19]:
# Bar chart of paper counts per "Tested?" answer.
tested_labels = resultsTest['Name'].astype('str').tolist()
tested_counts = resultsTest['Count'].astype('int').tolist()

p = figure(
    x_range=tested_labels,
    plot_width=600,
    plot_height=500,
    title="Tested?",
)
p.vbar(x=tested_labels, top=tested_counts, width=0.5, color='#571742')

# Axis styling: y axis anchored at zero, x labels tilted by 45 degrees,
# no vertical grid lines.
p.y_range.start = 0
p.xaxis.major_label_orientation = math.pi/4
p.xgrid.grid_line_color = None

show(p)