# Post-processing papers after selection by hand

In this file:
1. We load the papers from the initial pool
2. We load the papers that have been selected by hand
3. We merge both pools of papers
4. We prepare the merged table for classification and save it as a new Excel file, in which every paper can be classified as: tool creation / no tool creation, for AI / not for AI

***

**Loading libraries:**

In [1]:
import pandas as pd
import os
import json
import re
from pprint import pprint
from collections import Counter
import xlsxwriter

## 1. Load initial pool articles

In [2]:
_path_ini_pool = os.path.join('initial_pool_articles.csv')

# CSV column -> name used in the rest of the notebook.
# The keys double as the list of columns to read from the file.
column_names = {
    'Key': 'key',
    'Item Type': 'item_type',
    'Publication Year': 'date',
    'Author': 'authors',
    'Title': 'title',
    'Publication Title': 'pub_info',
    'ISBN': 'isbn',
    'ISSN': 'issn',
    'DOI': 'doi',
    'Url': 'link',
    'Abstract Note': 'abstract',
}
cols = list(column_names)

# read_csv already returns a DataFrame; all renames are applied in one pass.
df_ini = pd.read_csv(_path_ini_pool, sep=';', usecols=cols).rename(columns=column_names)

# Matching key: the lower-cased title reduced to its runs of ASCII letters,
# joined by single spaces. The vectorised .str accessors leave a missing title
# as NaN instead of raising, which the former re.findall loop did.
df_ini['title_lower'] = df_ini['title'].str.lower().str.findall("[A-Za-z]+").str.join(" ")

print("Number of papers in initial pool: ", df_ini.shape[0])

Number of papers in initial pool:  25


## 2. Load selected papers

In [3]:
_path_selected = os.path.join('../4_selection_by_hand/selection_by_hand_2023_09_13.xlsx')

# Columns read from the hand-selection workbook, in the order used downstream.
# ('comments_second_reviewer' is not read.)
usecols = ['id', 'title', 'link', 'date', 'pub_info', 'data_source', 'keep', 'don_t_know',
           'discard', 'comments_first_reviewer', 'assessor', 'keep_don_t_know', 'keep_correction', 'keep_title']
# Re-select with the same list so the column order is the one above.
df_selected = pd.read_excel(_path_selected, usecols=usecols)[usecols]

# If keep_correction is not empty then it replaces the current value of keep_title.
has_correction = df_selected['keep_correction'].notna()
df_selected.loc[has_correction, 'keep_title'] = df_selected.loc[has_correction, 'keep_correction']

# Lower-case, letters-only titles, computed once for every paper in the workbook.
# A missing title stays NaN instead of raising.
titles_normalised = df_selected['title'].str.lower().str.findall("[A-Za-z]+").str.join(" ")
all_t_list = titles_normalised.tolist()
print("Total number of papers: ", len(all_t_list))

# Keep only the papers whose (possibly corrected) keep_title flag is 1.
is_kept = df_selected['keep_title'] == 1
t_list = titles_normalised[is_kept].tolist()
df_selected = df_selected[is_kept].reset_index(drop=True)
print("Number of selected papers: ", len(t_list))

Total number of papers:  2607
Number of selected papers:  146


## 3. Merge selected papers and initial pool papers

Sanity check: count how many titles from the initial pool already appear among the hand-selected papers.

In [4]:
# Count the initial-pool titles that are / are not among the selected titles.
titles_ini = df_ini['title_lower'].values
already_in = sum(1 for title in titles_ini if title in t_list)
not_in = len(titles_ini) - already_in
print("already in: ", already_in)
print("not in: ", not_in)

already in:  13
not in:  12


Sanity check: count how many titles from the initial pool appear anywhere in the full list of papers (the result of the automatic selection), whether or not they were kept by hand.

In [5]:
# Flag each initial-pool title by whether it occurs in the full title list,
# then tally the two outcomes.
titles_ini = df_ini['title_lower'].values
found_flags = [title in all_t_list for title in titles_ini]
already_in = found_flags.count(True)
not_in = found_flags.count(False)
print("already in: ", already_in)
print("not in: ", not_in)

already in:  13
not in:  12


TODO: check whether the initial titles are in the initial search results (before the automatic selection).

In [6]:
# TODO

Adding titles not already in the selected papers:

In [7]:
# First id given to the papers added from the initial pool.
# NOTE(review): hard-coded; presumably chosen to lie above every id in the
# hand-selection workbook — confirm before reusing with another workbook.
FIRST_INI_ID = 2661

selected_titles = set(t_list)

# Work on a derived frame and leave df_ini untouched, so that re-running this
# cell does not prepend the authors to pub_info a second time.
df_ini_tagged = df_ini.assign(
    # pub_info now holds the authors followed by the publication title
    # (missing values count as empty strings):
    pub_info=df_ini['authors'].fillna("") + " " + df_ini['pub_info'].fillna(""),
    data_source='ini',
    # 1 if the title is not already among the selected titles, 0 otherwise:
    keep_title=[0.0 if title in selected_titles else 1.0 for title in df_ini['title_lower']],
)

df_ini_keep = (
    df_ini_tagged.loc[df_ini_tagged['keep_title'] == 1,
                      ['title', 'link', 'date', 'pub_info', 'data_source', 'keep_title']]
    .reset_index(drop=True)
    # consecutive ids starting at FIRST_INI_ID, appended as the last column
    .assign(id=lambda kept: list(range(FIRST_INI_ID, FIRST_INI_ID + len(kept))))
)

print("Number of papers from initial pool we add: ", df_ini_keep.shape[0])
df_ini_keep

Number of papers from initial pool we add:  12


Unnamed: 0,title,link,date,pub_info,data_source,keep_title,id
0,Energy Usage Reports: Environmental awareness ...,http://arxiv.org/abs/1911.08354,2019,"Lottick, Kadan; Susai, Silvia; Friedler, Sorel...",ini,1.0,2661
1,Energy of Computing on Multicore CPUs: Predict...,http://arxiv.org/abs/1907.02805,2021,"Shahid, Arsalan; Fahad, Muhammad; Manumachu, R...",ini,1.0,2662
2,"Energy Predictive Models of Computing: Theory,...",,2021,"Shahid, Arsalan; Fahad, Muhammad; Manumachu, R...",ini,1.0,2663
3,Energy and Policy Considerations for Deep Lear...,https://aclanthology.org/P19-1355,2019,"Strubell, Emma; Ganesh, Ananya; McCallum, Andr...",ini,1.0,2664
4,LIKWID: A Lightweight Performance-Oriented Too...,https://doi.org/10.1109/ICPPW.2010.38,2010,"Treibig, Jan; Hager, Georg; Wellein, Gerhard P...",ini,1.0,2665
5,A first look into the carbon footprint of fede...,https://www.semanticscholar.org/paper/A-first-...,2020,"Qiu, Xinchi; Parcollet, Titouan; Beutel, Danie...",ini,1.0,2666
6,PMT: Power Measurement Toolkit,,2022,"Corda, Stefano; Veenboer, Bram; Tolley, Emma 2...",ini,1.0,2667
7,CUMULATOR — a tool to quantify and report the ...,,2020,"Trébaol, Tristan",ini,1.0,2668
8,Efficient Execution of Convolutional Neural Ne...,,2020,"Rodrigues, Crefeda",ini,1.0,2669
9,A Comparative Study of Methods for Measurement...,https://www.mdpi.com/1996-1073/12/11/2204,2019,"Fahad, Muhammad; Shahid, Arsalan; Manumachu, R...",ini,1.0,2670


## 4. Prepare dataframe for classification phase

In [8]:
# Stack the hand-selected papers and the papers added from the initial pool.
df_all = pd.concat([df_selected, df_ini_keep], axis=0, ignore_index=True)

# Empty columns to be filled in during the classification phase:
blank_columns = [
    'tool_creation',   # has created the tool or not
    'for_AI',          # applied to AI or not
    'type',            # type of tool/method: meter, on-chip, utilization, other
    'from_other_id',   # if not created, is there an id of other paper's tool USED
]
for column in blank_columns:
    df_all[column] = ""
print('Final number of selected papers: ', df_all.shape[0])

Final number of selected papers:  158


## 5. Save as an Excel file for classification

In [9]:
def create_excel(file_name, df_save):
    """Write ``df_save`` to a single-sheet Excel file with coloured columns.

    Row 0 holds the column names; each following row holds one row of
    ``df_save``. Every data cell of a column whose name is a key of
    ``color_mapping`` gets that background colour (header cells stay
    unformatted). Values that are not equal to themselves (NaN) are written
    as empty strings.

    Parameters
    ----------
    file_name : str
        Path of the ``.xlsx`` file to create.
    df_save : pandas.DataFrame
        Table to write.
    """
    # Background colour per column name.
    color_mapping = {
        'keep': 'green',
        'don_t_know': 'orange',
        'discard': 'red',
        'tool_creation': 'orange',
        'for_AI': 'purple',
        'type': 'green',
    }

    headers = list(df_save.columns)

    # The context manager closes (and thereby saves) the workbook even if a
    # write raises part-way through.
    with xlsxwriter.Workbook(file_name) as workbook:
        worksheet = workbook.add_worksheet()

        # One format object per coloured column, created once rather than
        # once per cell; None means "no format" for the other columns.
        column_formats = [
            workbook.add_format({'bg_color': color_mapping[header]})
            if header in color_mapping else None
            for header in headers
        ]

        # Write the headers
        for col_num, header in enumerate(headers):
            worksheet.write(0, col_num, header)

        # Write the data, each cell exactly once, with its column's format
        for row_num, row_data in enumerate(df_save.values, start=1):
            for col_num, cell_data in enumerate(row_data):
                # NaN is the only value not equal to itself
                cell_data = cell_data if cell_data == cell_data else ""
                worksheet.write(row_num, col_num, cell_data, column_formats[col_num])

# Columns exported for the classification phase, in sheet order:
classification_columns = [
    'id', 'title', 'link', 'date', 'pub_info', 'data_source',
    'keep', 'don_t_know', 'discard',
    'comments_first_reviewer', 'assessor',
    'keep_don_t_know', 'keep_correction',
    'tool_creation', 'for_AI', 'type', 'from_other_id',
]

# Creating the excel file:
create_excel('test.xlsx', df_all[classification_columns])