# Data analysis
En este notebook se procederá con la exploración de los datos obtenidos al clasificar los artículos (n=99) en las tablas que se encuentran en la carpeta /data. Primero se configura el directorio de trabajo, se importan las librerías y funciones a usar, y se crean los dataframes para el análisis exploratorio de los datos.

## Primary steps

Import libraries and functions

In [None]:
#Import libraries to be used
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import zepid
from zepid.graphics import EffectMeasurePlot
import networkx as nx
from numpy import genfromtxt
from scipy import stats
from IPython.display import Image
from thefuzz import fuzz

#Import functions to be used
from scripts import functions as fn

DataFrame creation

In [None]:
# DataFrames to be used, one for every sheet.
# Forward slashes are used because they work on Windows, Linux and macOS;
# the former '.\data\...' literals depended on invalid escape sequences
# ('\d', '\T') and only resolved on Windows.
df_metadata = pd.read_csv('./data/Tabla Normalizada - Metadata.csv')
df_data_type = pd.read_csv('./data/Tabla Normalizada - Data type.csv')
df_participants = pd.read_csv('./data/Tabla Normalizada - Participants.csv')
df_self_report = pd.read_csv('./data/Tabla Normalizada - Self report.csv')
df_eet = pd.read_csv('./data/Tabla Normalizada - Emotion elicitation techniques.csv')
df_eda = pd.read_csv('./data/Tabla Normalizada - EDA.csv')
df_slm = pd.read_csv('./data/Tabla Normalizada - Statistical Learning model.csv')
df_performances = pd.read_csv('./data/Tabla Normalizada - Performances.csv')
df_alg_perf = pd.read_csv('./data/Tabla Normalizada - Alg_Perf.csv')

Set default plot characteristics

In [None]:
sns.set_context('notebook')

## Analysis
Teniendo en cuenta las supracategorías con las que fueron clasificados los artículos, se procede con el análisis de las mismas en orden.

### 1. Metadata

In [None]:
df_metadata=df_metadata.fillna('-')
df_metadata_without_duplicates = df_metadata.drop_duplicates(subset='paper_id')

#### 1.1. Countries

In [None]:
# Distinct first-author affiliation countries among the de-duplicated papers.
# NOTE(review): countries_1 is not read again in the cells shown in this notebook.
countries_1 = df_metadata_without_duplicates["first_author_country_affiliation"].unique()

# Number of papers per first-author country (group sizes), wrapped in a DataFrame indexed by country.
countries = df_metadata_without_duplicates.pivot_table(columns=['first_author_country_affiliation'], aggfunc='size')
df_countries = pd.DataFrame(countries)

# Hand-written display order for the bar chart.
# NOTE(review): .loc[order] raises KeyError if a label listed here is missing from the data;
# confirm the list still matches the CSV, or derive the order from the counts.
order = ['China','USA', 'Germany', 'India','Turkey','Italy', 'Malaysia','Spain','Iran', 'Switzerland','Romania','Pakistan', 'Taiwan','Greece', 'Japan',
'Austria', 'Tunisia','Macedonian', 'Finland', 'Slovenia', 'Portugal', 'Korea',
'UK', 'Indonesia','Canada', 'France', 'Lithuania','Egypt','Colombia', 'Australia', 'Poland']

df_countries.loc[order].plot(kind='bar', xlabel='country', ylabel='paper quantity')
plt.show()

# Papers per continent - plot
# NOTE(review): these totals are typed by hand, not computed from df_countries. They add up to 101
# while the introduction of this notebook reports n=99 articles; confirm the figures.
papers_continents = {'continents' : ['Asia', 'Europa','America','Africa','Australia'],
'quantity' : [49, 39, 9, 3, 1]}
df_continents = pd.DataFrame(papers_continents)
print(df_continents)

df_continents.set_index('continents').plot(kind='bar',xlabel='continent', ylabel='paper quantity')
plt.show()

In [None]:
# Countries relabelled as 'WEIRD'; every country not listed here keeps its own name.
# (The variable keeps its historical name `mapping_others`; later cells reassign it.)
mapping_others ={
        'Australia': 'WEIRD',
        'Austria': 'WEIRD',
        'Canada': 'WEIRD',
        'Finland': 'WEIRD',
        'France': 'WEIRD',
        'Germany': 'WEIRD',
        'Italy': 'WEIRD',
        'Lithuania': 'WEIRD',
        'Poland': 'WEIRD',
        'Portugal': 'WEIRD',
        'Slovenia': 'WEIRD',
        'Spain': 'WEIRD',
        'Switzerland': 'WEIRD',
        'UK': 'WEIRD',
        'USA': 'WEIRD'}

# One row per country: the country label plus its paper count (last column).
df_countries_with_weird = df_countries.reset_index()
paper_count_column = df_countries_with_weird.columns[-1]
df_countries_with_weird['first_author_country_affiliation'] = (
    df_countries_with_weird['first_author_country_affiliation'].replace(mapping_others)
)

# Add up the papers per label. DataFrame.value_counts() would only count how many
# distinct (label, count) rows exist, which is not the number of papers.
(df_countries_with_weird
    .groupby('first_author_country_affiliation')[paper_count_column]
    .sum()
    .sort_values(ascending=False))

In [None]:
df_countries

#### 1.2. Source type

In [None]:
# Columns at positions 7-9 of the de-duplicated metadata (presumably the source-type flags - verify).
# Uses df_metadata_without_duplicates, the de-duplicated frame that already exists at this point;
# df_metadata_sin_duplicates is only created in a later cell.
df_sources = df_metadata_without_duplicates.iloc[:,7:10]
def get_value(row):
    """Return the label of the first entry of ``row`` that is marked with 'x' or 'X'.

    Parameters
    ----------
    row : pandas.Series
        One row of a flag table, as passed by ``DataFrame.apply(..., axis=1)``;
        the Series index holds the column names.

    Returns
    -------
    The column label of the first marked entry, or ``None`` when nothing is marked.
    """
    # Iterate over the row itself instead of a global frame, so the function
    # works for any flag table and is unaffected by later reassignments.
    for column, value in row.items():
        if value in ('x', 'X'):
            return column
    return None

df_sources = df_sources.apply(get_value, axis=1)
df_sources = pd.DataFrame(df_sources)
df_sources.columns = ['Source type']

#ploteo
quantity = df_sources['Source type'].value_counts()
df_quantity = pd.DataFrame(quantity)


sns.countplot(x='Source type', data=df_sources)

In [None]:
df_quantity

#### ¿De que revistas cientificas provinieron la mayoria de los papers? ¿Cual era su orientacion (psicologica o ingenieria de datos)?
7. Gráfico frecuencia de papers según revista científica de origen, distinguiendo entre las que poseen orientación en ingeniería de datos y las que no

Nota: falta filtrar bien cuales son journal, y cuales de estas son de ingenieria o no (diferenciarlas con color o hue)

In [None]:
# By journal
df_metadata=df_metadata.fillna('-')
df_metadata_sin_duplicates = df_metadata.drop_duplicates(subset='paper_id')

# Keep only the papers flagged as journal articles; .copy() makes the result an
# independent frame so later edits do not trigger chained-assignment warnings.
df_source_title = df_metadata_sin_duplicates.loc[
    df_metadata_sin_duplicates['source_type_journal'].isin(['x', 'X']),
    ['paper_id','source_title','source_type_journal'],
].copy()

titulos = [' ', 'Journal', 'Cantidad']
# bar_plot is not defined in this notebook; the project helpers are imported as `fn`.
fn.bar_plot('source_title',df_source_title,titulos)

In [None]:
df_source_title["source_title"].value_counts()

In [None]:
df_source_title["source_title"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
# Papers per journal, as a two-column frame: journal title first, count second.
# Columns are named explicitly because the labels produced by
# value_counts().reset_index() differ between pandas versions.
serie_sources = df_source_title["source_title"].value_counts()
df_serie_sources = serie_sources.rename_axis('source_title').reset_index(name='count')

In [None]:
# First column holds the journal title, second column its number of papers.
col1, col2 = df_serie_sources.columns[0], df_serie_sources.columns[1]

# Every journal that appears exactly once is folded into a single bucket.
mapping_others = {
    title: "Others (=1)"
    for title, count in zip(df_serie_sources[col1], df_serie_sources[col2])
    if count == 1
}

In [None]:
# Replace the journals listed in mapping_others by their bucket label.
sources_with_others = df_source_title.copy()
sources_with_others['source_title'] = sources_with_others['source_title'].replace(mapping_others)

titulos = [' ', 'Journal', 'Cantidad']
# bar_plot is not defined in this notebook; the project helpers are imported as `fn`.
fn.bar_plot('source_title',sources_with_others,titulos)

In [None]:
sources_with_others["source_title"].value_counts()

In [None]:
sources_with_others["source_title"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

### 2. Data type

In [None]:
df_data_type = df_data_type.fillna('-')

#### ¿Cual es la evolucion temporal (2010-2020) del uso de las bases de datos privadas y publicas?
5. Gráfico de barra por año (2010-2020) según tipos de base de datos (privada, pública)

In [None]:
# Keep the first row of every (paper, access type) pair.
# drop_duplicates gives the same rows on every pandas version; groupby().nth(0)
# became a filter in pandas 2.0, and the following reset_index() then added an
# 'index' column that makes later reset_index() calls on this frame fail.
df_data_type = (
    df_data_type
    .drop_duplicates(subset=['paper_id', 'db_access'])
    .reset_index(drop=True)
)

In [None]:
df_data_type

In [None]:
# Rename db_access -> Access and capitalise its values.
# The guard makes the cell safe to re-run: once renamed, 'db_access' no longer exists.
if "db_access" in df_data_type.columns:
    df_data_type = df_data_type.rename(columns={"db_access":"Access"})
df_data_type["Access"] = df_data_type["Access"].str.capitalize()

In [None]:
# Years 2010..2020, each exactly once (the hand-written list contained 2015 twice,
# which draws a duplicated 2015 group on the x axis).
category_order = list(range(2010, 2021))
g = sns.countplot(x='year',
    data= df_data_type,
    hue='Access',
    order=category_order)
g.set(xlabel = 'Year', ylabel = 'Quantity')
# Forward slashes keep the output path valid on every operating system.
plt.savefig('./Emmanuel/figures/NUEVOS. Frecuencia de uso de bases de datos públicas y privadas por año (2010 - 2020).png')

In [None]:
df_data_type["Access"].value_counts()

In [None]:
df_data_type["Access"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
# One row per (paper, access type); drop_duplicates behaves the same on every
# pandas version, unlike groupby().nth(0) followed by reset_index().
df_access = (
    df_data_type[["paper_id","apa_citation",'Access', "year", "model_id"]]
    .drop_duplicates(subset=["paper_id",'Access'])
    .reset_index(drop=True)
)

# Share of each access type within every year (each row sums to 1).
df_access_crosstab = pd.crosstab(index=df_access['year'], columns=df_access['Access'],normalize='index')

g = df_access_crosstab.plot(kind='bar',
                        stacked=True,
                        rot=0)
g.set_ylim([0, 1])
# The bars show row-normalised shares, not counts.
g.set(xlabel = 'Year', ylabel = 'Proportion')

#### ¿Cuál es la frecuencia de uso de cada base de datos encontrada?

6. Gráfico de frecuencia de uso de cada base de datos pública encontrada
- Interpretacion: Un predominio de pocas bases de datos. Estamos todo el tiempo sacando conclusiones sobre los mismos sujetos? Ver predominio de bases de datos publicas por sobre las privadas
- No se ha tenido en cuenta el dato aportado por Lorenzo (mas de una db por paper)

In [None]:
df_data_type_sin_duplicates = df_data_type.drop_duplicates(subset='paper_id')

In [None]:
# First row of every distinct combination of paper and database-flag columns.
# drop_duplicates avoids the pandas-version-dependent groupby().nth(0) + reset_index() pair.
df_db = (
    df_data_type
    .drop_duplicates(subset=['paper_id',"public_database",'use_multiple_db', 'db_private', 'db_public','db_private_and_public', 'db_uppon_request', 'is_database'])
    .reset_index(drop=True)
)

In [None]:
# multi_reversing is not defined in this notebook; the project helpers are imported as `fn`.
# Its result exposes a 'variable' column that the next cells tabulate.
df_db = fn.multi_reversing(df_db, 'model_id', df_db[["public_database",'use_multiple_db', 'db_private', 'db_public','db_private_and_public', 'db_uppon_request', 'is_database']])
# Shorten the long database name for display.
df_db = df_db.replace('Multimodal Dyadic Behavior (MMDB)', 'MMDB')

In [None]:
df_db["variable"].value_counts()

In [None]:
df_db["variable"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
df_data_type[["Access", "model_id", "public_database",'use_multiple_db', 'db_private', 'db_public','db_private_and_public', 'db_uppon_request', 'is_database']]
df_data_type[["public_database",'use_multiple_db', 'db_private', 'db_public','db_private_and_public', 'db_uppon_request', 'is_database']].value_counts()


In [None]:
df_data_type["Access"].value_counts()

In [None]:
# Flag columns, one per public database; declared once so both steps below stay in sync.
public_database_columns = ['DEAP',
       'AMIGOS', 'MAHNOB', 'CASE', 'Ascertain', 'Cog.load',
       'Multimodal Dyadic Behavior (MMDB)', 'RECOLA', 'DECAF',
       'Driving Workload', 'Liris', 'SenseEmotion', 'PMEmo',
       'Hazumi1911', 'Bio Vid Emo DB', 'DREAMER',
       'Non-EEG Biosignals Data Set for Assessment and Visualization of Neurological Status',
       'Stress Recognition in Automobile Drivers Data Set', 'PsPM-HRA1']

# First row of every distinct (paper, database flags) combination; drop_duplicates
# behaves identically on every pandas version, unlike groupby().nth(0) + reset_index().
freq_data_base = (
    df_data_type
    .drop_duplicates(subset=['paper_id'] + public_database_columns)
    .reset_index(drop=True)
)

# multi_reversing is not defined in this notebook; the project helpers are imported as `fn`.
db_freq = fn.multi_reversing(freq_data_base, 'model_id', freq_data_base[public_database_columns])
db_freq['variable'].value_counts()

In [None]:
db_freq['variable'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
db_freq['variable'] = db_freq['variable'].replace(
    to_replace=['Multimodal Dyadic Behavior (MMDB)'], 
    value='MMDB')

In [None]:
titulos = [' ', 'Databases', 'Quantity']
# bar_plot is not defined in this notebook; the project helpers are imported as `fn`.
fn.bar_plot('variable',db_freq,titulos)

### 3. Participants

In [None]:
df_participants= df_participants.fillna('-')
df_participants.head()

#### country

In [None]:
# One row per (paper, country); drop_duplicates gives the same rows on every pandas
# version, whereas groupby().nth(0) + reset_index() changed behaviour in pandas 2.0.
df_participants_country = (
    df_participants
    .drop_duplicates(subset=['paper_id', 'country'])
    .reset_index(drop=True)
)
df_participants_country["country"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
# One row per (paper, sample size N), independent of the pandas version in use.
participants_total = (
    df_participants
    .drop_duplicates(subset=['paper_id', 'N'])
    .reset_index(drop=True)
)

In [None]:
participants_total["N"].unique()

In [None]:
participants_total["N"].value_counts()

In [None]:
# Rows of participants_total whose N equals the string '27'.
# NOTE(review): the variable name suggests "papers without N", but the filter selects N == '27';
# the missing-value marker used elsewhere in this notebook is '-' (see participants_no_n). Confirm intent.
participants_without_n= participants_total[participants_total['N']== '27']
participants_without_n

In [None]:
participants_n= participants_total[participants_total['N']!= '-']
participants_n

In [None]:
# Convert N to integers on a new frame; assigning into the filtered slice directly
# triggers pandas' SettingWithCopyWarning.
participants_n = participants_n.assign(N=participants_n["N"].astype(int))
print(participants_n['N'].mean())
g = sns.boxplot(y="N", data=participants_n)
plt.show()
g2 = participants_n['N'].plot(bins = 20,kind ='hist', xlim = [0,500])
plt.show()

In [None]:
participants_no_n= participants_total[participants_total['N']== '-']
participants_no_n

In [None]:
# One row per (paper, number of female participants), independent of the pandas version in use.
participants_female = (
    df_participants
    .drop_duplicates(subset=['paper_id', 'n_female'])
    .reset_index(drop=True)
)

In [None]:
participants_without_female= participants_female[participants_female['n_female'] == '-']
participants_without_female

In [None]:
participants_female= participants_female[participants_female['n_female'] != '-']
participants_female

In [None]:
participants_female["n_female"].unique()

In [None]:
# Convert n_female to integers on a new frame; assigning into the filtered slice
# directly triggers pandas' SettingWithCopyWarning.
participants_female = participants_female.assign(n_female=participants_female["n_female"].astype(int))
print(participants_female['n_female'].mean())
g = sns.boxplot(y="n_female", data=participants_female)
plt.show()
participants_female['n_female'].plot(bins = 20,kind ='hist', xlim = [0,250])
plt.show()

In [None]:
# One row per (paper, age range), independent of the pandas version in use.
df_p_range_age = (
    df_participants
    .drop_duplicates(subset=['paper_id', 'range_age'])
    .reset_index(drop=True)
)

In [None]:
df_p_range_age['range_age'].unique()

In [None]:
df_p_range_age_no_cero= df_p_range_age[df_p_range_age['range_age'] != '-']

In [None]:
df_p_range_age_no_cero

In [None]:
df_p_range_age_no_cero["range_age"].value_counts()

In [None]:
df_p_range_age_cero_na= df_p_range_age[df_p_range_age['range_age'] == '-']
df_p_range_age_cero_na
#48

In [None]:
# One row per (paper, mean age), independent of the pandas version in use.
df_p_mean_age = (
    df_participants
    .drop_duplicates(subset=['paper_id', 'mean_age'])
    .reset_index(drop=True)
)

In [None]:
df_p_mean_with_age= df_p_mean_age[df_p_mean_age['mean_age'] != '-']
df_p_mean_with_age

In [None]:
df_p_mean_with_no_age= df_p_mean_age[df_p_mean_age['mean_age'] == '-']
df_p_mean_with_no_age
#44

In [None]:
# Convert mean_age to floats on a new frame; assigning into the filtered slice
# directly triggers pandas' SettingWithCopyWarning.
df_p_mean_with_age = df_p_mean_with_age.assign(mean_age=df_p_mean_with_age["mean_age"].astype(float))
print(df_p_mean_with_age['mean_age'].mean())
print(df_p_mean_with_age['mean_age'].min())
print(df_p_mean_with_age['mean_age'].max())
g = sns.boxplot(y="mean_age", data=df_p_mean_with_age)
plt.show()
df_p_mean_with_age['mean_age'].plot(bins = 10,kind ='hist', xlim = [15,40])
plt.show()

In [None]:
# Recode the missing-value marker '-' of range_age as the string "0" on the full participants table.
# NOTE(review): the cells below read range_age from df_p_range_age, which is built before this line
# runs, so on a top-to-bottom run this recoding does not reach them; confirm it is still needed.
df_participants.loc[df_participants['range_age'] == "-", 'range_age'] = "0"

In [None]:
range_age = df_p_range_age[df_p_range_age['range_age'] != '-']
range_age

In [None]:
# Turn "min-max" into "min max" on a new frame (no assignment into a filtered slice),
# treating '-' as a literal character.
range_age = range_age.assign(
    range_age=range_age['range_age'].str.replace('-', ' ', regex=False)
)
# Split into one column per token; rows with fewer tokens than the widest row
# contain missing cells and are dropped.
df_ranges = range_age['range_age'].str.split(pat = ' ', expand = True)
df_ranges = df_ranges.dropna()
df_ranges

In [None]:
df_ranges[0] = df_ranges[0].astype(int)
df_ranges[1] = df_ranges[1].astype(int)

In [None]:
df_ranges[0].min()

In [None]:
df_ranges[1].max()

In [None]:
first = df_ranges[0].to_list()
second = df_ranges[1].to_list()
final_list = first + second
df_range_merged = pd.DataFrame(final_list, columns = ['ages'])
df_range_merged

In [None]:
print(df_range_merged['ages'].mean())

In [None]:
g = sns.boxplot(y="ages", data=df_range_merged)
plt.show()

### 4. Self-report

In [None]:
df_self_report = df_self_report.fillna('-')

#### ¿Cuantas veces fueron testeadas juntas las distintas dimensiones y categorias emocionales?

##### Primero obtenemos las categorias emocionales usadas y su frecuencia

* Hubo 55 modelos que usaron categorias emocionales, siendo disgust, fear, y sadness las mas repetidas.
* Tener en cuenta que un modelo puede usar mas de una categoria, por lo que la funcion get_values no funciona en este caso, y un paper puede concentrar la mayoria de los usos de un conjunto de categorias

In [None]:
# First row of every distinct (paper, emotion-category flags) combination.
# drop_duplicates behaves identically on every pandas version, unlike
# groupby().nth(0) followed by reset_index().
# NOTE(review): the adjacency-matrix cell further down also uses an 'Anger' column
# that is absent from this list; confirm whether it should be included here.
emotional_categories_grouped = (
    df_self_report
    .drop_duplicates(subset=['paper_id','Stress', 'Disgust', 'Fear', 'Sadness', 'Surprise' ,'Happiness', 'Pleasant',
                            'Anxiety', 'Neutral', 'Funny', 'Boredom', 'Relaxation', 'Amusement', 'Joy'])
    .reset_index(drop=True)
)

In [None]:
# multi_reversing is not defined in this notebook; the project helpers are imported as `fn`.
# Its result exposes a 'variable' column that the next cells tabulate.
emotional_categories_grouped = fn.multi_reversing(
    emotional_categories_grouped, 'paper_id',
    emotional_categories_grouped
    [[
    'Stress', 'Disgust', 'Fear', 'Sadness', 'Surprise' ,'Happiness', 'Pleasant',
    'Anxiety', 'Neutral', 'Funny', 'Boredom', 'Relaxation', 'Amusement', 'Joy'
    ]])

In [None]:
emotional_categories_grouped

In [None]:
print(emotional_categories_grouped['variable'].value_counts())
print(emotional_categories_grouped['variable'].unique())

In [None]:
titulos = [' ', 'Categoria', 'Frecuencia']
# bar_plot is not defined in this notebook; the project helpers are imported as `fn`.
fn.bar_plot('variable',emotional_categories_grouped,titulos)

In [None]:
# First row of every distinct (paper, emotion-dimension flags) combination;
# drop_duplicates behaves identically on every pandas version, unlike
# groupby().nth(0) followed by reset_index().
emotional_dimensions_grouped = (
    df_self_report
    .drop_duplicates(subset=[
        'paper_id','valence',
        'arousal', 'dominance', 'like / dislike', 'familiarity', 'engagement',
        'predictability'])
    .reset_index(drop=True)
)

In [None]:
# multi_reversing is not defined in this notebook; the project helpers are imported as `fn`.
# Its result exposes a 'variable' column that the next cells tabulate.
emotional_dimensions_grouped = fn.multi_reversing(
    emotional_dimensions_grouped, 'paper_id',
    emotional_dimensions_grouped
    [[
    'valence',
    'arousal', 'dominance', 'like / dislike', 'familiarity', 'engagement',
    'predictability'
    ]])

In [None]:
emotional_dimensions_grouped

In [None]:
print(emotional_dimensions_grouped['variable'].value_counts())
print(emotional_dimensions_grouped['variable'].unique())

In [None]:
titulos = [' ', 'Dimension', 'Frequency']
# bar_plot is not defined in this notebook; the project helpers are imported as `fn`.
fn.bar_plot('variable',emotional_dimensions_grouped,titulos)

In [None]:
# Count plot of df_models rows per year, coloured by the 'model' column, saved as a PNG.
# NOTE(review): df_models is not created in any earlier cell shown in this notebook, and
# category_order is defined in the data-type section; confirm both exist on a fresh kernel.
g= sns.countplot(x='year', 
    data= df_models, 
    hue='model', 
    order=category_order)
g.set(xlabel = 'Año', ylabel = 'Cantidad de modelos')
plt.savefig('.\\Emmanuel\\figures\\NUEVOS. Cantidad de modelos de regresion y clasificacion por año (2010 - 2020).png')

In [None]:
df_models["model"].value_counts()

In [None]:
df_models["model"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

#### Grafo

In [None]:
# Build the co-occurrence (adjacency) matrix of emotion categories.
emotion_columns = ['Anger',
        'Stress', 'Disgust', 'Fear', 'Sadness', 'Surprise', 'Happiness',
        'Pleasant', 'Anxiety', 'Neutral', 'Funny', 'Boredom', 'Relaxation', 'Amusement', 'Joy']

# Rows whose 'is_categorial' field is filled in ('-' marks a missing value).
df_self_report = df_self_report.fillna('-')
self_0 = df_self_report[df_self_report['is_categorial']!= '-']

# First row of every distinct (paper, category flags) combination; drop_duplicates
# behaves identically on every pandas version, unlike groupby().nth(0) + reset_index().
df_matrix_con_paperid_0 = (
    self_0
    .drop_duplicates(subset=["paper_id"] + emotion_columns)
    .reset_index(drop=True)
)

print(df_matrix_con_paperid_0)

# 1 where a category is marked ('x' or 'X'), 0 otherwise. Building the indicator with
# isin() yields a clean integer matrix instead of relying on chained replace() calls.
df_matrix = df_matrix_con_paperid_0[emotion_columns].isin(['x', 'X']).astype(int)

# Entry (i, j) counts the rows in which categories i and j are both marked.
indicator = df_matrix.to_numpy()
co_occurrence = indicator.T @ indicator
# A category is not connected to itself; zero the diagonal on our own array rather
# than writing through DataFrame.values, which is not guaranteed to be writable.
np.fill_diagonal(co_occurrence, 0)
adj_matrix = pd.DataFrame(co_occurrence, index=emotion_columns, columns=emotion_columns)

In [None]:
adj_matrix

In [None]:
# Graph of emotion categories: one node per category, one weighted edge per pair
# that co-occurs at least once.
# NOTE(review): Gd was not created in any earlier cell of this notebook; it is built
# here from adj_matrix, which stores edge weights under the 'weight' attribute. Confirm
# this matches the graph originally intended.
Gd = nx.from_pandas_adjacency(adj_matrix)
weights = nx.get_edge_attributes(Gd,'weight').values()

fig, ax = plt.subplots(figsize=(30, 30))

pos = nx.circular_layout(Gd)

# Edge thickness is proportional to the number of co-occurrences.
nx.draw(Gd, pos,
        edgecolors="black",
        node_color='white',
        width=[i/1.5 for i in weights],
        node_size=2000,
        linewidths=3,
        alpha=1,
        font_size=25,
        font_weight="bold",
        arrows=False,
        edge_cmap=plt.colormaps['copper'])

boxes = dict(facecolor='white', alpha=1)

# Labels are drawn at shifted positions.
# nudge is not defined in this notebook; the project helpers are imported as `fn`.
pos_nodes = fn.nudge(pos, 0, 0.1)
nx.draw_networkx_labels(Gd, pos=pos_nodes, labels=None, font_size=30, font_color='k',
                        font_family='serif', font_weight='normal', alpha=None, bbox=boxes, horizontalalignment='center',
                        verticalalignment='center', ax=None, clip_on=True)

# Forward slashes keep the output path valid on every operating system.
plt.savefig('./Emmanuel/figures/NUEVOS. Conexiones entre dimensiones emocionales - Colores y grosor segun numero de relaciones.png')

In [None]:
# relaciones is not defined in this notebook; the project helpers are imported as `fn`.
fn.relaciones(weights)

In [None]:
df_self_report

In [None]:
# Normalise the questionnaire-usage labels by whole-value replacement.
# Series.replace with a dict only touches cells that equal a key exactly, whereas
# .str.replace('x', ...) / .str.replace('-', ...) rewrite every 'x' or '-' found
# inside longer labels as well.
questionnaire_labels = {
    'x': "Yes",
    '-': "No",
    "Relies on  other's questionnaire": "Relies on other's questionnaire",
    "Relies on other´s questionaire": "Relies on other's questionnaire",
}
df_self_report['use_questionnaite'] = df_self_report['use_questionnaite'].replace(questionnaire_labels)

In [None]:
# One row per (paper, questionnaire-usage label); drop_duplicates behaves identically
# on every pandas version, unlike groupby().nth(0) followed by reset_index().
used_questionnaries = (
    df_self_report
    .drop_duplicates(subset=['paper_id', "use_questionnaite"])
    .reset_index(drop=True)
)
used_questionnaries["use_questionnaite"].value_counts()

In [None]:
used_questionnaries["use_questionnaite"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
# Flag columns, one per affective questionnaire.
questionnaire_columns = ['affective_questionaire_SAM','affective_questionaire_PSS',
                         "affective_questionaire_PANAS", "affective_questionaire_DES", "affective_questionaire_affective_grid"]

# First row of every distinct (paper, questionnaire flags) combination; drop_duplicates
# behaves identically on every pandas version, unlike groupby().nth(0) + reset_index().
questionnaries = (
    df_self_report
    .drop_duplicates(subset=['paper_id'] + questionnaire_columns)
    .reset_index(drop=True)
)
# multi_reversing is not defined in this notebook; the project helpers are imported as `fn`.
questionnaries = fn.multi_reversing(questionnaries, 'paper_id', questionnaries[questionnaire_columns])
questionnaries['variable'].value_counts()

In [None]:
questionnaries['variable'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

#### relax_bore_stress

In [None]:
# Citation plus the three emotion flags of interest.
df_self_report_compressed = df_self_report[["apa_citation", "Boredom", "Relaxation", "Stress"]]
# Keep the rows where at least one of the three flags is marked with 'x' or 'X'.
any_flag_marked = (
    df_self_report_compressed[["Boredom", "Relaxation", "Stress"]]
    .isin(['x', 'X'])
    .any(axis=1)
)
relax_bore_stress = df_self_report_compressed[any_flag_marked]

In [None]:
relax_bore_stress

### 5. Emotion elicitation techniques

In [None]:
# The elicitation-techniques sheet is loaded as df_eet; expose it, with missing
# values marked as '-', under the longer name used throughout this section.
df_emotion_elicitation_techniques = df_eet.fillna('-')

In [None]:
df_technniques_no_dup = df_emotion_elicitation_techniques.drop_duplicates(subset="paper_id")

In [None]:
# display() is used so that every intermediate table is actually rendered:
# a notebook cell only shows its last bare expression.
from IPython.display import display


def _first_row_per_group(frame, keys):
    """Keep the first row of every distinct combination of ``keys``, with a fresh 0..n-1 index."""
    return frame.drop_duplicates(subset=keys).reset_index(drop=True)


def _percentages(column):
    """Relative frequency of each value in ``column``, formatted like '12.3%'."""
    return column.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# Flag-column groups, declared once and reused for de-duplication and reshaping.
task_columns = ['task_type_active','task_type_pasive']
modality_columns = ['is_multimodal','modality_visual','modality_auditory', 'modality_somatosensory']
visual_columns = ['visual_pictures', 'visual_videos', 'visual_words', 'visual_other']
auditory_columns = ['auditory_miusic', 'auditory_other']
technique_columns = ['technique_clasif_driving',
                     'technique_clasif_Imagination techniques /memory recall',
                     'technique_clasif_Social interactions',
                     'technique_clasif_Virtual Reality', 'technique_clasif_Meditation',
                     'technique_clasif_Reading', 'technique_clasif_Ux',
                     'technique_clasif_TEM clips (Tactile Enhanced Multimedia)',
                     'technique_clasif_Videogame', 'technique_clasif_Puzzle']
other_technique_columns = ['technique_clasif_multiple_techniques'] + technique_columns
all_technique_columns = visual_columns + auditory_columns + technique_columns

# --- Technique names -------------------------------------------------------
# NOTE(review): df_technniques_no_dup already holds one row per paper_id, so this
# second de-duplication cannot remove anything; confirm whether the full table
# was meant to be used here.
df_technniques_no_dup = _first_row_per_group(df_technniques_no_dup, ['paper_id',"technique_name"])
display(df_technniques_no_dup["technique_name"].value_counts())
display(_percentages(df_technniques_no_dup["technique_name"]))
display(df_emotion_elicitation_techniques.columns)
display(df_emotion_elicitation_techniques["technique_name"].value_counts())
display(_percentages(df_emotion_elicitation_techniques["technique_name"]))
techniques_defined = df_emotion_elicitation_techniques[df_emotion_elicitation_techniques["technique_name"] != "-"]
display(_percentages(techniques_defined["technique_name"]))
display(_percentages(df_emotion_elicitation_techniques["is_multimodal"]))

# --- Task type (active / passive) -------------------------------------------
type_task = _first_row_per_group(df_emotion_elicitation_techniques, ['paper_id'] + task_columns)
# Number of rows with neither task type marked.
display(len(type_task[(type_task['task_type_active'] == '-') &
            (type_task['task_type_pasive'] == '-')]))
# multi_reversing is not defined in this notebook; the project helpers are imported as `fn`.
task_type = fn.multi_reversing(type_task, 'model_id', type_task[task_columns])
display(task_type['variable'].value_counts())
display(_percentages(task_type['variable']))

# --- Stimulus modality -------------------------------------------------------
freq_modality = _first_row_per_group(df_emotion_elicitation_techniques, ['paper_id'] + modality_columns)
df_modality = fn.multi_reversing(freq_modality, 'model_id', freq_modality[modality_columns])
display(df_modality['variable'].value_counts())
display(_percentages(df_modality['variable']))
number_modality = sum(df_modality['variable'].value_counts())
print (f'Se hallaron {number_modality} instancias en total entre multimodal, visual, auditory y somatosensory.')

# --- Visual stimuli ----------------------------------------------------------
visual_modality = _first_row_per_group(df_emotion_elicitation_techniques, ['paper_id'] + visual_columns)
df_visual_modality = fn.multi_reversing(visual_modality, 'model_id', visual_modality[visual_columns])
display(df_visual_modality['variable'].value_counts())
display(_percentages(df_visual_modality['variable']))

# --- Auditory stimuli --------------------------------------------------------
auditory_modality = _first_row_per_group(df_emotion_elicitation_techniques, ['paper_id'] + auditory_columns)
df_auditory_modality = fn.multi_reversing(auditory_modality, 'model_id', auditory_modality[auditory_columns])
display(df_auditory_modality['variable'].value_counts())
display(_percentages(df_auditory_modality['variable']))

# --- Other techniques (including the "multiple techniques" flag) -------------
other_techniques = _first_row_per_group(df_emotion_elicitation_techniques, ['paper_id'] + other_technique_columns)
df_other_techniques = fn.multi_reversing(other_techniques, 'model_id', other_techniques[other_technique_columns])
display(df_other_techniques['variable'].value_counts())

# --- All techniques together (visual + auditory + other, without the "multiple" flag)
all_techniques = _first_row_per_group(df_emotion_elicitation_techniques, ['paper_id'] + all_technique_columns)
df_all_techniques = fn.multi_reversing(all_techniques, 'model_id', all_techniques[all_technique_columns])
display(df_all_techniques['variable'].value_counts())
display(_percentages(df_all_techniques['variable']))
len(df_all_techniques['variable'])

### 6. EDA

In [None]:
df_eda = df_eda.fillna('-')
eda_devices = df_eda[df_eda['eda_device_specification']!= '-']  #me quedo solo con las rows que usaron dispositivos

#### ¿En cuantas ocasiones no aclara el dispositivo de EDA utilizado?

In [None]:
# One row per (paper, EDA device specification); drop_duplicates behaves identically
# on every pandas version, unlike groupby().nth(0) followed by reset_index().
aver = (
    df_eda
    .drop_duplicates(subset=['paper_id', 'eda_device_specification'])
    .reset_index(drop=True)
)
# Number of those rows where the device is not specified ('-').
len(aver[aver['eda_device_specification'] == '-'])

In [None]:
len(aver["eda_device_specification"])

In [None]:
aver["eda_device_specification"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
aver["eda_device_specification"].value_counts()
plt.figure(figsize = (15,5))
g = sns.countplot(x="eda_device_specification", data=aver, order = getattr(aver, "eda_device_specification").value_counts().index)
plt.xticks(rotation=90)

#### ¿Cuantos eda devices HOMEMADE hay?

In [None]:
df_eda['eda_device_is_homemade'].value_counts()

In [None]:
df_eda['eda_device_is_homemade'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
# One row per (paper, EDA device specification); drop_duplicates behaves identically
# on every pandas version, unlike groupby().nth(0) followed by reset_index().
eda_devices = (
    eda_devices
    .drop_duplicates(subset=['paper_id', 'eda_device_specification'])
    .reset_index(drop=True)
)

In [None]:
eda_devices['eda_device_specification'].value_counts()

In [None]:
# REDUCIMOS LAS CATEGORÍAS AGRUPANDOLAS
main_categories = ['BIOPAC', 'Empatica','GSR2','Sociograph', 'Variport', 'Shimmer', 'BioRadio 150', 'Affectiva-QSensors5','ProComp Infinity','PowerLab', 'Grove']

def reduce_categories(list_categories, threshold=45):
    '''
    Collapse similar device names in ``eda_devices`` onto canonical categories.

    For every category, in the order given, each value of the
    'eda_device_specification' column whose ``fuzz.ratio`` similarity to the
    category is above ``threshold`` is replaced by the category itself
    (e.g. 'BIOPAC 150' becomes 'BIOPAC').

    Parameters
    ----------
    list_categories : iterable of str
        Canonical category names. They are applied sequentially, so a value
        rewritten by one category can still be matched by a later one.
    threshold : int, optional
        Similarity score (0-100) a value must exceed to be replaced.
        Defaults to 45, the value previously hard-coded.

    Returns
    -------
    None. The global ``eda_devices`` frame is modified in place.
    '''
    column = 'eda_device_specification'
    for category in list_categories:
        # Score every distinct value once, on a snapshot taken before any
        # replacement for this category, instead of rewriting the column while
        # iterating over it row by row.
        similar_values = [
            value
            for value in eda_devices[column].unique()
            if value != category and fuzz.ratio(category, value) > threshold
        ]
        if similar_values:
            eda_devices.loc[:, column] = eda_devices[column].replace(
                dict.fromkeys(similar_values, category)
            )

# Collapse part of the device names with reduce_categories
reduce_categories(main_categories)

# Some names are still not unified after that step, so the remaining ones are mapped by hand here;
# this also resolves inconsistencies caused by typos, line breaks or abbreviations.
# NOTE(review): 'Affectiva-QSensors5' is one of main_categories and is renamed again by this mapping.
mapping_eda = {
    'MP150 Biopac': 'BIOPAC', 'MP35 Biopac' : 'BIOPAC','MP150':'BIOPAC','Biopac\r\nMP36' : 'BIOPAC',
    'Biosemi activeTwo' : 'Biosemi ActiveTwo', 'Biosemi ActiveTwo ':'Biosemi ActiveTwo',
    'PowerLab (manufactured\r\nby ADInstruments)': 'PowerLab',
    'Affectiva-QSensors5': 'Q Sensor by Afectiva',
    'Shimmer3 GSR+ Unit sensor':'Shimmer',
    'sensors produced by Thought Technology' : 'Thought Technology',
    'Grove\r\n(a standalone LM324 quadruple operational amplifier based on EDA sensor kit)':'Grove',
    'Grove GSR sensor produced by Seeed':'Grove',
    'Gen II integrated wearable device from Analog Devices, Inc': 'Gen II Analog Devices',
    'e-Health Sensor\nPlatform V2.0' : 'e-Health Sensor Platform V2.0',
    '(BITalino (r)evolution Plugged\r\nKit BT':'(BITalino (r)evolution Plugged Kit BT ',
    'Nexus-10' : 'NEXUS', 'Nexus 4 Biofeedback system3':'NEXUS', # NOT SURE whether these really are the same device, or even whether NEXUS is the brand
    'Nexus-32' : 'NEXUS'         }


# Apply the manual mapping to the device column (exact-value replacement).
eda_devices.loc[:,'eda_device_specification'] = eda_devices.loc[:,'eda_device_specification'].replace(mapping_eda)

In [None]:
eda_devices['eda_device_specification'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [None]:
#In percentage
#Plot
eda_devices['eda_device_specification'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

¿Cuántos dispositivos de EDA diferentes se utilizaron?

In [None]:
n_dispositivos_eda = eda_devices['eda_device_specification'].nunique()
print (f'Se utilizaron {n_dispositivos_eda} dispositivos de EDA diferentes')

¿En cuántas instancias de nuestro interés se especificó el dispositivo de EDA utilizado?

In [None]:
number_eda = sum(eda_devices['eda_device_specification'].value_counts())
print (f' Tenemos {number_eda} de instancias donde se aclaró el dispositivo de EDA utilizado.')

In [None]:
#Plot
# Horizontal count plot of EDA devices, most frequent first.
# (The stray `from turtle import width` was dropped: the name was never used and the
# import needs tkinter, which is not available on every machine.)
plt.figure(figsize = (23,16))
sns.set_context('paper')
sns.countplot(y = 'eda_device_specification',
            data = eda_devices,
            order = eda_devices['eda_device_specification'].value_counts().index)
plt.ylabel('Device',
            fontsize = 24,
            fontweight = 'bold')
plt.xlabel('')
plt.yticks(fontsize = 25)
plt.xticks(ticks = range(1,19), fontsize = 23)
plt.show()

In [None]:
# Devices listed here are collapsed into a single 'Others (<1)' bucket.
# NOTE(review): the original note on this cell gave the cut-off as "a frequency of 4 or less",
# while the bucket label reads '<1' — confirm which threshold is intended.
rare_devices = [
    'Mindfield eSense',
    'BioNeuro multichannel biofeedback instrument',
    'Thought Technology',
    'e-Health Sensor Platform V2.0',
    'Bodymedia',
    'LabVIEW',
    'Biosignalplux',
    'Gen II Analog Devices',
    'Microsoft Band 2',
    'RespiBAN Professional',
    '(BITalino (r)evolution Plugged Kit BT ',
]
mapping_others = dict.fromkeys(rare_devices, 'Others (<1)')

# Work on a copy so eda_devices keeps the un-bucketed labels.
devices_with_others = eda_devices.copy()
devices_with_others.loc[:,'eda_device_specification'] = (
    devices_with_others.loc[:,'eda_device_specification'].replace(mapping_others)
)
devices_with_others['eda_device_specification'].value_counts()

In [None]:
# Explicit bar order for the plot below; the 'Others (<1)' bucket is placed last.
plot_order = ['Biosemi ActiveTwo','BIOPAC', 'Shimmer', 'Empatica','BioRadio 150', 'PowerLab', 'Q Sensor by Afectiva' , 'Grove','NEXUS','ProComp Infinity', 'GSR2','Sociograph', 'Commercial bluetooth sensor', 'Variport', 'Others (<1)']

# Horizontal count plot of the bucketed device labels, drawn in plot_order.
plt.figure(figsize = (28,14))
sns.set_context('paper')
sns.countplot(y = 'eda_device_specification',
            data = devices_with_others,
            order = plot_order,
            palette = "bone")
plt.ylabel('Device', fontsize = 23,fontweight = 'bold')
plt.xlabel('Quantity', fontsize = 23)
plt.yticks(fontsize = 24)
# Integer ticks only: the x axis holds counts.
plt.xticks(ticks = range(1,19), fontsize = 22)
plt.show()

In [None]:
# Percentage share of each device label once rare devices are bucketed.
(
    devices_with_others["eda_device_specification"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Merge the two spellings used for the non-dominant side into one label.
df_eda['location_hemibody'] = df_eda['location_hemibody'].replace('non-dominant', 'not dominant')

In [None]:
# First row of every (paper_id, location_hemibody) group, flattened back to plain columns.
hemibody = (
    df_eda
    .groupby(['paper_id', 'location_hemibody'])
    .nth(0)
    .reset_index()
)
hemibody['location_hemibody'].value_counts()

In [None]:
# Total number of counted hemibody entries.
hemibody['location_hemibody'].value_counts().sum()

In [None]:
# Percentage share of each hemibody label (the '-' placeholder included).
(
    hemibody['location_hemibody']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Drop the '-' placeholder rows, then recompute the percentage shares.
reported_mask = hemibody['location_hemibody'].ne("-")
hemibody_only_reported = hemibody[reported_mask]
(
    hemibody_only_reported['location_hemibody']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Grouping keys: paper id plus every sensor-placement flag column.
sensor_site_columns = [
    'is_hands', 'wrist', 'chest', 'left_lobe_temporalis',
    'finger_thumb', 'finger_index', 'finger_mIddle', 'finger_ring', 'finger_little',
    'phalange_proximal', 'phalange_medial', 'phalange_distal',
]
# First row of each distinct placement combination per paper.
sensors = df_eda.groupby(['paper_id'] + sensor_site_columns).nth(0).reset_index()

In [None]:
# Same reduction as above, keyed only on the four body-site columns.
sensors_location = (
    df_eda
    .groupby(['paper_id', 'is_hands', 'wrist', 'chest', 'left_lobe_temporalis'])
    .nth(0)
    .reset_index()
)

PAPERS WITH NO DATA RELATED TO SENSORS LOCATION

In [None]:
# Rows where all four body-site columns hold the '-' placeholder.
body_site_columns = ['is_hands', 'wrist', 'chest', 'left_lobe_temporalis']
no_location_mask = sensors[body_site_columns].eq('-').all(axis=1)
len(sensors[no_location_mask])

In [None]:
# Reshape the four body-site columns with multi_reversing and count by 'variable'.
# NOTE(review): multi_reversing is called unqualified, while the notebook's import cell only shows
# `from scripts import functions as fn` — confirm the name is in scope on a fresh kernel.
general_place = multi_reversing(
    sensors,
    'model_id',
    sensors[['is_hands','wrist', 'chest', 'left_lobe_temporalis']],
)
general_place['variable'].value_counts()

In [None]:
# Percentage share of each body site.
(
    general_place['variable']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Reshape the five finger columns with multi_reversing and count by 'variable'.
finger_sensor = multi_reversing(
    sensors,
    'model_id',
    sensors[['finger_thumb', 'finger_index', 'finger_mIddle', 'finger_ring', 'finger_little']],
)
finger_sensor['variable'].value_counts()

In [None]:
# Percentage share of each finger.
(
    finger_sensor['variable']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Reshape the three phalange columns with multi_reversing and count by 'variable'.
location_phalanges = multi_reversing(
    sensors,
    'model_id',
    sensors[['phalange_proximal', 'phalange_medial','phalange_distal']],
)
location_phalanges['variable'].value_counts()

In [None]:
finger_sensor['variable'].value_counts()

In [None]:
# Swap the raw column names for short display labels (applied in this order).
finger_labels = {
    'finger_mIddle': 'Middle',
    'finger_index': 'Index',
    'finger_ring': 'Ring',
    'finger_thumb': 'Thumb',
    'finger_little': 'Little',
}
for raw_name, short_name in finger_labels.items():
    finger_sensor['variable'] = finger_sensor['variable'].str.replace(raw_name, short_name)

In [None]:
# 2x2 grid holding three panels: hemibody side, body site, and finger.
fig, axes = plt.subplots(2, 2)

fig.set_figheight(7)
fig.set_figwidth(10)

# Order bars by frequency of the hemibody labels themselves. The device-name list
# `plot_order` must not be used here: none of its entries is a hemibody category.
ax1 = sns.countplot(x = 'location_hemibody',
    data = hemibody,
    order = hemibody['location_hemibody'].value_counts().index,
    ax=axes[0,0])

ax2 = sns.countplot(x = 'variable', data = general_place,
    ax=axes[0,1])

ax3 = sns.countplot(x = 'variable', data = finger_sensor,
    ax=axes[1,0])

# Only three panels are needed; remove the unused bottom-right axes.
fig.delaxes(axes[1,1])

ax1.set_xlabel("Hemibody location")
ax1.set_ylabel("Count")
ax2.set_xlabel("Location of electrodes in the body")
ax2.set_ylabel("Count")
ax3.set_xlabel("Location of electrodes in the hand")
ax3.set_ylabel("Count")

plt.show()

### 7. Statistical learning models

In [None]:
# Replace missing values with the '-' placeholder.
# NOTE(review): the data-loading cell at the top of the notebook stores this sheet as `df_slm`;
# confirm that `df_statistical_learning_models` is defined before this cell on a fresh kernel.
df_statistical_learning_models = df_statistical_learning_models.fillna('-')

#### 7.1. Affective models

In [None]:
# Keep only rows whose affective model is 'categorical' or 'dimensional'. Duplicates are NOT dropped
# across the board (a paper may use several models); instead one row is kept per (paper, affective model).
df_statistical_learning_models=df_statistical_learning_models[df_statistical_learning_models['affective_model'].isin(['categorical', 'dimensional'])]
df_statistical_learning_models_0 = df_statistical_learning_models.groupby(['paper_id','affective_model']).nth(0)
df_statistical_learning_models_0.reset_index(inplace=True)

# Plot: papers per year, split by affective model.
# Each year appears exactly once; a repeated entry would draw that year twice on the axis.
category_order = list(range(2010, 2021))
g= sns.countplot(x='year', 
    data= df_statistical_learning_models_0, 
    hue='affective_model', 
    order=category_order)
g.set(xlabel = 'Año', ylabel = 'Cantidad de papers')
plt.savefig('.\\Emmanuel\\figures\\NUEVOS. Cantidad de papers por año (2010-2020) segun tipo de modelo emocional.png')

In [None]:
# Cast the year column to integers and show the resulting frame.
df_statistical_learning_models_0 = df_statistical_learning_models_0.astype({"year": int})
df_statistical_learning_models_0

#### ¿Cuál es la evolución temporal (2010-2020) del uso de los modelos de regresión y categoriales empleados?
Cantidad de modelos de regresión o categoriales por año

2. Hacer un plot de la progresión de modelos de regresión vs. clasificación de 2010 a 2020. Este plot debería ser igual al plot que muestra la progresión de modelos dimensionales vs. categoriales. La idea con este plot sería analizar si la progresión de papers basados en modelos dimensionales se acompaña con modelos estadísticos de regresión (que es lo que se esperaría dado el tipo de variable, pero asumimos que no sucede).

In [None]:
def label_model(row):
    """Return the model-type label for one row.

    The row is indexed by key. 'is_classifier' is checked first, so a row
    flagged as both classifier and regressor is labelled 'classifier'.
    A flag counts as set only when it equals the string "x".

    Returns 'classifier', 'regressor', or 'Other' when neither flag is set.
    """
    flag_to_label = (
        ('is_classifier', 'classifier'),
        ('is_regressor', 'regressor'),
    )
    for flag_column, label in flag_to_label:
        if row[flag_column] == "x":
            return label
    return 'Other'

In [None]:
# Label every row as classifier / regressor / Other.
df_statistical_learning_models['model'] = df_statistical_learning_models.apply(label_model, axis=1)

In [None]:
df_statistical_learning_models['model'].value_counts()

In [None]:
# Keep only the columns needed for the per-year model plot.
df_models = df_statistical_learning_models.loc[:, ["apa_citation", 'model', "year", "model_id"]]

In [None]:
# Plot: models per year, split by model type.
# Each year appears exactly once; a repeated entry would draw that year twice on the axis.
category_order = list(range(2010, 2021))
g= sns.countplot(x='year', 
    data= df_models, 
    hue='model', 
    order=category_order)
g.set(xlabel = 'Año', ylabel = 'Cantidad de modelos')
plt.savefig('.\\Emmanuel\\figures\\NUEVOS. Cantidad de modelos de regresion y clasificacion por año (2010 - 2020).png')

#### ¿Cual es la frecuencia del uso de modelos algoritmicos de regresion y clasificacion?
3. Gráficos de frecuencia de los modelos algorítmicos, según modelos de regresión y clasificación

- Interpretaciones: Los algoritmos clasificadores son por mucho los mas usados, ademas de ser los que mas variedad representan. Que implica esto? Es lo mismo aplicar algoritmos clasificadores o regresores?
- Nota: ordenar los valores de los gráficos y unirlos en uno, buscar graficar los mas usados (primeros 5 o 10) 

In [None]:
df_statistical_learning_models = df_statistical_learning_models.fillna('-')

# Regressors: columns at positions 43..56 (presumably the regre_* flag columns — verify against the sheet).
regressor_columns = df_statistical_learning_models.iloc[:,43:57]
df_algoritmos_regre = multi_reversing(df_statistical_learning_models, 'model_id', regressor_columns)
# Strip the 'regre_' prefix so only the algorithm name remains.
df_algoritmos_regre['variable'] = df_algoritmos_regre['variable'].str.replace('regre_','')

titulos = [' ', 'Algoritmo', 'Cantidad de modelos']
bar_plot('variable',df_algoritmos_regre,titulos)

In [None]:
# Classifiers: columns at positions 8..39 (presumably the class_* flag columns — verify against the sheet).
classifier_columns = df_statistical_learning_models.iloc[:,8:40]
df_algoritmos_class = multi_reversing(df_statistical_learning_models, 'model_id', classifier_columns)
# Strip the 'class_' prefix so only the algorithm name remains.
df_algoritmos_class['variable'] = df_algoritmos_class['variable'].str.replace('class_','')

# Unique classification algorithm names.
algoritmos_de_clasificacion = df_algoritmos_class["variable"].unique()

titulos = [' ', 'Algoritmo', 'Cantidad de modelos']
bar_plot('variable',df_algoritmos_class,titulos)

In [None]:
# Take columns 1..56, then discard the ones at these positions within that slice.
candidate_columns = df_statistical_learning_models.iloc[:,1:57]
columns_to_skip = candidate_columns.columns[[1,2,3,4,5,6,39,40,41]]
df_all_models = candidate_columns.drop(columns=columns_to_skip)

# The first remaining column is left out of the reshaped set; the rest are passed to multi_reversing.
df_all_models = multi_reversing(df_all_models, 'model_id', df_all_models.iloc[:,1:])
# Strip both prefixes so classifier and regressor algorithms share plain names.
for prefix in ('class_', 'regre_'):
    df_all_models['variable'] = df_all_models['variable'].str.replace(prefix,'')

titulos = [' ', 'Algoritmo', 'Cantidad de modelos']
bar_plot('variable',df_all_models,titulos)

In [None]:
# Ten most frequent algorithms.
vc = df_all_models['variable'].value_counts()
vc = vc.iloc[:10]
# Name the two output columns explicitly rather than renaming whatever value_counts produced:
# older pandas yields ('index', 'variable'), pandas >= 2.0 yields ('variable', 'count').
df_dv = vc.rename_axis('algoritmo').reset_index(name='cantidad')

titulos = [' ', 'Algoritmo', 'Cantidad de modelos']
g = sns.barplot(data=df_dv, x='algoritmo', y='cantidad', palette="PuBuGn")
g.set(title = titulos[0], xlabel = titulos[1], ylabel = titulos[2])
# Algorithm names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.savefig('.\\Emmanuel\\figures\\NUEVOS. Los 10 algoritmos mas usados.png')

### 8. Algorithms and performances

#### Proporción de trabajos por año (2010 - 2020) segun tipo de modelo emocional y tipo de algoritmo

In [None]:
# First row of every (paper_id, model) group.
df_models = (
    df_statistical_learning_models[["paper_id","apa_citation",'model', "year", "model_id"]]
    .groupby(["paper_id",'model'])
    .nth(0)
    .reset_index()
)

In [None]:
"""df_statistical_learning_models['affective_model'] = df_statistical_learning_models['affective_model'].str.replace('categorical','categorial')"""

In [None]:
# First row of every (paper_id, affective_model) group, with year cast to int.
models = (
    df_statistical_learning_models[["paper_id", "year", "affective_model", "model_id"]]
    .groupby(["paper_id",'affective_model'])
    .nth(0)
    .reset_index()
    .astype({"year": int})
)

In [None]:
models["affective_model"].value_counts()

In [None]:
# Per-year proportions of each affective model (each row sums to 1).
models_crosstab = pd.crosstab(
    index=models['year'],
    columns=models['affective_model'],
    normalize='index',
)

In [None]:
"""df_models['model'] = df_models['model'].str.replace('classifier','clasificación')
df_models['model'] = df_models['model'].str.replace('regressor','regresión')"""

In [None]:
# First row of every (paper_id, model) group, with year cast to int.
n_models = (
    df_models
    .groupby(["paper_id",'model'])
    .nth(0)
    .reset_index()
    .astype({"year": int})
)

In [None]:
# Per-year proportions of each model type (each row sums to 1).
n_models_crosstab = pd.crosstab(
    index=n_models['year'],
    columns=n_models['model'],
    normalize='index',
)

In [None]:
# Two stacked-bar panels: per-year proportions by affective model (left) and by model type (right).
fig, axes = plt.subplots(1, 2)

fig.set_figheight(7)
fig.set_figwidth(17)

ax1 = models_crosstab.plot(kind='bar', 
    stacked=True,
    rot=0,
    ax=axes[0])

ax2 = n_models_crosstab.plot(kind='bar', 
    stacked=True,
    rot=0,
    ax=axes[1])

# Proportions live in [0, 1]; fix the y range so both panels are directly comparable.
ax1.set_ylim([0, 1])
ax2.set_ylim([0, 1])

# Tick labels are left to matplotlib: re-assigning the current labels would pin fixed
# text onto automatically located ticks.

ax1.set_xlabel("Year")
ax1.set_ylabel("Proportion of articles")
ax2.set_xlabel("Year")
ax2.set_ylabel("Proportion of articles")

ax1.legend(title='Affective model', loc='upper left')
ax2.legend(title='Type of algorithm', loc='upper left')

plt.show()

#### Interpretation

In [None]:
# Re-read the statistical-learning sheet from disk. Raw string: the Windows-style path contains
# backslashes that must not be parsed as escape sequences.
data_interpretation_model = pd.read_csv(r'.\data\Tabla Normalizada - Statistical Learning model.csv')
data_interpretation_model= data_interpretation_model.fillna('-')
# Keep only rows where an interpretation is reported (anything other than the '-' placeholder).
model_interpretion = data_interpretation_model[data_interpretation_model['model_interpretation'] !='-']
# First row of every (paper_id, model_interpretation) group.
model_interpretation = model_interpretion.groupby(['paper_id', 'model_interpretation']).nth(0)

In [None]:
# Flatten the index and keep a single row per paper.
model_interpretation = (
    model_interpretation
    .reset_index()
    .drop_duplicates(subset = ['paper_id'])
)
model_interpretation

In [None]:
print(f' En {len(model_interpretation)} papers se realizan interpretaciones emocionales de los modelos')

In [None]:
# lista papers id
model_interpretation_list = model_interpretation["paper_id"].to_list()
model_interpretation_list = [int(a) for a in model_interpretation_list]
model_interpretation_list

In [None]:
# Metadata rows of the papers in model_interpretation_list, one row per paper.
in_interpretation_list = df_metadata['paper_id'].isin(model_interpretation_list)
df_metadata_filtered = df_metadata[in_interpretation_list].drop_duplicates("paper_id")
df_metadata_filtered[["paper_id", "apa_citation", "year", "source_title"]]

In [None]:
# Distinct journals among the filtered papers, as a plain list.
list_journal_int = df_metadata_filtered["source_title"].unique().tolist()
list_journal_int

In [None]:
# titulos = [title, category-axis label, count-axis label]
titulos = [' ', 'Journal', 'Cantidad']

var_x = "source_title"
df = df_metadata_filtered

# Horizontal bars: journals run along the y axis and counts along the x axis.
g = sns.countplot(y=var_x, data=df, order = getattr(df, var_x).value_counts().index)
# Because the plot is horizontal, the count label goes on x and the category label on y.
g.set(title = titulos[0], xlabel = titulos[2], ylabel = titulos[1])
plt.xticks(rotation=90)
    
plt.tight_layout()
plt.savefig(f'.\\Emmanuel\\figures\\NUEVOS. {titulos[0]}.jpg', dpi=1000)
plt.show()