# Construção do DataFrame

## Carregamento dos dados

In [None]:
!pip install bibtexparser

Collecting bibtexparser
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bibtexparser
  Building wheel for bibtexparser (setup.py) ... [?25l[?25hdone
  Created wheel for bibtexparser: filename=bibtexparser-1.4.3-py3-none-any.whl size=43549 sha256=e87a2a11cdeeb9659220b2d97930db0f9bad5a7150c077ed2d785db8ffa2a7db
  Stored in directory: /root/.cache/pip/wheels/16/fb/76/306387739cf9d53b1c39b0c8aadbbb17dc05f256756d8fd915
Successfully built bibtexparser
Installing collected packages: bibtexparser
Successfully installed bibtexparser-1.4.3


In [None]:
import pandas as pd
import bibtexparser
import re

In [None]:
# Location of the exported BibTeX file with the included studies.
# NOTE(review): absolute path, presumably the Colab upload folder — adjust when running elsewhere.
bib = "/content/included.bib"

In [None]:
# Load the .bib file: open it as UTF-8 text and let bibtexparser parse it
# into a database object that is kept in `bib_database`.
with open(bib, encoding='utf-8') as bibfile:
    bib_database = bibtexparser.load(bibfile)

In [None]:
# Get the parsed entries as dictionaries (one per BibTeX record)
entries = bib_database.entries

In [None]:
# Build the raw table from the entry dictionaries: one row per entry,
# one column per field name found in them.
df = pd.DataFrame(entries)

In [None]:
# Peek at the raw `note` text of the first entry to see the layout the
# extraction helpers have to parse.
df.loc[0, 'note']

'Times Cited in Web of Science Core Collection:  0     Total Times Cited:  0     Cited Reference Count:  19 | RAYYAN-INCLUSION: {"Gabriel"=>"Included"} | RAYYAN-LABELS: Insights para tomada de decisão | USER-NOTES: {"Gabriel"=>["SIM", "SIM"]}'

## Funções para extração e tratamento dos dados

In [None]:
def extract_labels(text):
    """Return the Rayyan label stored in a BibTeX ``note`` string.

    The label is the text that follows ``RAYYAN-LABELS:`` up to the next
    ``|`` separator (or the end of the string), with surrounding whitespace
    removed.

    Parameters
    ----------
    text : str or missing value
        Content of the ``note`` field. Anything that is not a string
        (``None``, ``NaN`` for entries without a note, ...) is treated as
        "no label".

    Returns
    -------
    str or None
        The label text, or ``None`` when the marker is absent or ``text``
        is not a string.
    """
    # Entries lacking a `note` field become NaN in the DataFrame; passing
    # those to re.search would raise TypeError and abort the whole .apply().
    if not isinstance(text, str):
        return None
    match = re.search(r'RAYYAN-LABELS\s*:\s*(.*?)(\s*\||$)', text)
    return match.group(1).strip() if match else None

In [None]:
def extract_qdata(text):
    """Return the first answer listed in the ``USER-NOTES`` part of a note.

    The note is expected to contain a fragment shaped like
    ``USER-NOTES: {"Reviewer"=>["SIM", "NÃO"]}``. The text between the square
    brackets is split on commas and the first item is returned with
    surrounding whitespace and double quotes removed.

    Parameters
    ----------
    text : str or missing value
        Content of the ``note`` field. Non-string values (``None``, ``NaN``)
        are treated as "no answer".

    Returns
    -------
    str or None
        The first answer, or ``None`` when the fragment is absent or
        ``text`` is not a string.
    """
    # Missing notes arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None
    match = re.search(r'USER-NOTES\s*:\s*{.*?\[([^\]]+)\]}', text)
    if match is None:
        return None
    # Only the first comma-separated item is needed.
    first_item = match.group(1).split(',', 1)[0]
    return first_item.strip().strip('"')

In [None]:
def extract_2qdata(text):
    """Return the second answer listed in the ``USER-NOTES`` part of a note.

    The note is expected to contain a fragment shaped like
    ``USER-NOTES: {"Reviewer"=>["SIM", "NÃO"]}``. The text between the square
    brackets is split on commas; each item is stripped of whitespace and
    double quotes, and the second item is returned.

    Parameters
    ----------
    text : str or missing value
        Content of the ``note`` field. Non-string values (``None``, ``NaN``)
        are treated as "no answer".

    Returns
    -------
    str or None
        The second answer, or ``None`` when the fragment is absent, holds
        fewer than two items, or ``text`` is not a string.
    """
    # Missing notes arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None
    match = re.search(r'USER-NOTES\s*:\s*{.*?\[([^\]]+)\]}', text)
    if match is None:
        return None
    answers = [item.strip().strip('"') for item in match.group(1).split(',')]
    # A single-item list has no second answer to report.
    return answers[1] if len(answers) > 1 else None

In [None]:
def substitute_labels(df_column):
    """Translate the Portuguese review labels and answers to English.

    The column is first cast to pandas' ``string`` dtype; values that equal
    one of the known Portuguese labels/answers are swapped for their English
    counterpart, every other value is kept as it is. A new Series is
    returned.
    """
    translations = {
        'Suporte à aprendizagem e personalização': 'Learning support and personalization',
        'Insights para tomada de decisão': 'Data-driven insights for decision-making',
        'Promoção do engajamento e motivação': 'Promotion of engagement and motivation',
        'Acessibilidade e usabilidade': 'Accessibility and usability improvements',
        'SIM': 'YES',
        'NÃO': 'NO',
    }
    as_text = df_column.astype('string')
    return as_text.replace(translations)

## Tratamento dos dados

In [None]:
# Start the review table from the title and year columns. The explicit
# .copy() makes `data` an independent frame, so the column assignments in
# the following cells no longer raise SettingWithCopyWarning.
data = df[['title', 'year']].copy()

In [None]:
# Year to number: pd.to_numeric already returns a new Series, so no
# intermediate copy of the column is needed.
data['year'] = pd.to_numeric(data['year'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = pd.to_numeric(data['year'].copy())


In [None]:
# Pull the RAYYAN-LABELS value out of every note; .apply builds a new
# Series, so the source column is left untouched.
data['category'] = df['note'].apply(extract_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category'] = df['note'].copy().apply(extract_labels)


In [None]:
# Replace the Portuguese category labels with their English wording.
data['category'] = substitute_labels(data['category'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category'] = substitute_labels(data['category'].copy())


In [None]:
# First USER-NOTES answer of each note, translated (SIM/NÃO -> YES/NO).
first_answers = df['note'].apply(extract_qdata)
data['presents-qdata'] = substitute_labels(first_answers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['presents-qdata'] = substitute_labels(df['note'].copy().apply(extract_qdata))


In [None]:
# Second USER-NOTES answer of each note, translated (SIM/NÃO -> YES/NO).
second_answers = df['note'].apply(extract_2qdata)
data['applied'] = substitute_labels(second_answers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['applied'] = substitute_labels(df['note'].copy().apply(extract_2qdata))


In [None]:
# Display the assembled review table: title, year, category and the two
# answers taken from USER-NOTES.
data

Unnamed: 0,title,year,category,presents-qdata,applied
0,The analysis of emotions for academic performa...,2022,Data-driven insights for decision-making,YES,YES
1,ENHANCING SOCIO-PEDAGOGICAL ENGAGEMENT IN MOOD...,2024,Promotion of engagement and motivation,NO,YES
2,Generative AI for Customizable Learning Experi...,2024,Learning support and personalization,YES,YES
3,Participation in Online Courses and Interactio...,2019,Promotion of engagement and motivation,YES,YES
4,Unveiling Uncertainty: Supporting Learners Thr...,2023,Data-driven insights for decision-making,NO,YES
5,An intelligent platform with automatic assessm...,2019,Promotion of engagement and motivation,YES,YES
6,Understanding Collaborative Learning Processes...,2025,Data-driven insights for decision-making,YES,YES
7,The NAJEH Effect: How ChatGPT is Shaping the F...,2024,Learning support and personalization,NO,YES
8,Doc2Vec based Question and Answer Search System,2021,Learning support and personalization,NO,YES
9,AI in education: Evaluating the impact of mood...,2025,Learning support and personalization,YES,YES


## Ordenação e exportação do DataFrame

In [None]:
# Column that orders the exported table. 'presents-qdata' and 'title' are the
# alternative orderings previously kept here as commented-out lines.
SORT_COLUMN = 'year'

# kind='stable' keeps rows that share the same value in their current
# relative order, so the exported table is reproducible between runs.
data = data.sort_values(by=SORT_COLUMN, ascending=True, kind='stable', ignore_index=True)

In [None]:
# Export the table to an Excel workbook, in a sheet named 'Resultados',
# without writing the row index as a column.
data.to_excel('tabela-de-revisao.xlsx', index=False, sheet_name='Resultados')

# Criação dos Gráficos

In [None]:
import plotly.graph_objects as go
import plotly.colors as pc

In [None]:
# Plot from a copy so that nothing done for the charts alters `data`.
fig_data = data.copy()

In [None]:
# Each flow is a two-column table (source, target) describing one hop.

# title → year (built here, but not included in the combined table below)
fluxo1 = fig_data[['title', 'year']].rename(
    columns={'title': 'source', 'year': 'target'}
)

# year → category
fluxo2 = fig_data[['year', 'category']].rename(
    columns={'year': 'source', 'category': 'target'}
)

# category → presents-qdata
fluxo3 = fig_data[['category', 'presents-qdata']].rename(
    columns={'category': 'source', 'presents-qdata': 'target'}
)

# Stack the year → category and category → presents-qdata hops into one table
sankey_df = pd.concat([fluxo2, fluxo3], ignore_index=True)

In [None]:
# Distinct node names, in order of first appearance when the table is read
# row by row (source, then target).
endpoints = sankey_df[['source', 'target']].to_numpy().ravel()
labels = pd.unique(endpoints)
label_map = dict(zip(labels, range(len(labels))))

# Translate both link endpoints into their integer node ids.
for side in ('source', 'target'):
    sankey_df[f'{side}_id'] = sankey_df[side].map(label_map)

In [None]:
# Inspect the node-name → node-id mapping used for the link endpoints.
label_map

{2022: 0,
 'Data-driven insights for decision-making': 1,
 2024: 2,
 'Promotion of engagement and motivation': 3,
 'Learning support and personalization': 4,
 2019: 5,
 2023: 6,
 2025: 7,
 2021: 8,
 'YES': 9,
 'NO': 10}

In [None]:
# Count the flows: identical rows collapse into a single link and the number
# of occurrences is stored in a 'value' column.
link_counts = sankey_df.value_counts()
sankey_df = link_counts.reset_index(name='value')

In [None]:
# Node colors, derived from `labels` instead of hard-coded list positions so
# the list always has exactly one color per node, whatever the node order or
# count turns out to be.
palette = pc.qualitative.Plotly.copy()
# Slot 1 is swapped for 'coral' — presumably so that no regular node shares
# the red reserved for the 'NO' node (NOTE(review): confirm intent).
palette[1] = 'coral'

# The answer nodes always get fixed colors, wherever they sit in `labels`.
answer_colors = {'YES': '#636EFA', 'NO': '#EF553B'}

# Every other node takes the palette color at its own position, cycling if
# there are more nodes than palette entries.
colors = [
    answer_colors.get(label, palette[position % len(palette)])
    for position, label in enumerate(labels)
]

In [None]:
# Inspect the node color list and its length; the length should match the
# number of entries in `labels`, since `colors` is used as the per-node color.
colors, len(colors)

(['#636EFA',
  'coral',
  '#00CC96',
  '#AB63FA',
  '#FFA15A',
  '#19D3F3',
  '#FF6692',
  '#B6E880',
  '#FF97FF',
  '#636EFA',
  '#EF553B'],
 11)

In [None]:
# Build the Sankey chart: nodes come from `labels`/`colors`, links from the
# id and value columns of `sankey_df`.
sankey_trace = go.Sankey(
    node={
        'pad': 20,
        'thickness': 25,
        'label': labels,
        'color': colors,
    },
    link={
        'source': sankey_df['source_id'],
        'target': sankey_df['target_id'],
        'value': sankey_df['value'],
    },
)
fig = go.Figure(data=[sankey_trace])

In [None]:
# Canvas size and typography of the final figure. The title
# (title_text="Distribuição de estudos") is currently disabled.
layout_font = {'size': 24, 'style': "normal", 'color': 'black'}
fig.update_layout(height=800, width=1400, font=layout_font)
