# Selection phase 1: semi-automatic based on identified "excluding words"

In this notebook, we identify excluding words, i.e.,  words and pairs of words (among the words contained in all titles) that automatically make a title containing one of them off-topic.

We look for excluding words using the list of titles from the Google Scholar data source in the fisrt section of this notebook. Then, we use these exclusing words to select titles in the other two data sources: IEEE in Section 2, and ACM in Section 3.

***

**Importing libraries:**

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter

import os

from IPython.display import display, HTML, clear_output
from datetime import datetime
import json

**Usefull functions:**

In [2]:
def apply_excluding_words(words, titles_df):
    """ To exclude any title containing any elements of the list words"""

    df = titles_df.copy(deep=True)
    for i, row in df.iterrows():
        if df.loc[i,'keep_title'] == 1:
            title_no_punct = " ".join(re.findall("[A-Za-z]+",df.loc[i,'title']))
            # check if any of the 'words' is in the current title
            res = any(ele.lower() in title_no_punct.lower().split() for ele in words)
            if res:        
                df.loc[i, 'keep_title']=0
    return(df)

def apply_excluding_2_words(twoWords_to_exclude, titles_df):
    """ To exclude any title containing any elements of the list twoWords_to_exclude"""

    df = titles_df.copy(deep=True)
    _words = [" "+elem+" " for elem in twoWords_to_exclude]
    for i, row in df.iterrows():
        if df.loc[i,'keep_title'] == 1:
            title_no_punct = " ".join(re.findall("[A-Za-z]+",df.loc[i,'title']))
            title_no_punct = " "+title_no_punct+" "
            # check if any of the 'words' is in the current title
            res = any(ele.lower() in title_no_punct.lower() for ele in _words)
            if res:        
                df.loc[i, 'keep_title']=0 
    return(df)

def check_proportions(words, titles_df_new, titles_df_old):
    """ To check that the semi-automatic selection happened as expected"""

    preserved = titles_df_new[titles_df_new['keep_title'] == 1]['title']
    removed = titles_df_new[titles_df_new['keep_title'] == 0]['title']
    old_removed = titles_df_old[titles_df_old['keep_title'] == 0]['title']
    print('Number of words to exclude: ', len(words))
    print("Number articles removed: ", len(removed))
    print("Number articles newly removed: ", len(removed) - len(old_removed))
    print("Number articles preserved: ", len(preserved))
    print("Check: ", len(removed)+len(preserved), '==', len(titles_df_new['title']))

def make_2words_csv(titles_df):
    """Create csv file with the list of pairs of words contained in all the titles"""
    
    #source:
    #https://stackoverflow.com/questions/18952894/word-frequency-count-based-on-two-words-using-python#:~:text=%3E%3E%3E%20from%20collections%20import%20Counter%20%3E%3E%3E%20import%20re,makes%27%3A%202%2C%20%27makes%20me%27%3A%202%2C%20%27I%20love%27%3A%202%7D
    
    preserved = titles_df[titles_df['keep_title'] == 1]['title']
    sentence = " ".join(preserved).lower()
    words = re.findall(r'\w+', sentence)
    
    two_words = [' '.join(ws) for ws in zip(words, words[1:])]

    no_list = ['in', 'of', 'and', 'to', 'for', 'a', 'on', 'the', 'how', 'by', 'an', \
               'at', 'as', 'with', 'using', 'based', 'its', 'towards']
    two_words_preserved = []
    two_words_removed = []
    for elem in two_words:
        res = any(e in elem.split() for e in no_list)
        if res:
            two_words_removed.append(elem)
        else:
            two_words_preserved.append(elem)

    wordscount = {w:f for w, f in Counter(two_words_preserved).most_common() if f > 1}
    wordscount_df = pd.DataFrame(wordscount.items(), columns=['TwoWords', 'InstancesNumber'])
    str_current_datetime = datetime.now().strftime("%Y%m%d-%H%M%S")
    file_name = "twoWords_"+str_current_datetime+".csv"
    wordscount_df.to_csv(file_name)
    
def prepare_titles_df(df, my_columns, title_column):
    titles_df = df[my_columns].rename(columns={title_column: 'original_title'})

    # add columns with lower case titles with letters only
    t_list = []
    for title in titles_df['original_title'].str.lower():
        t_list.append(" ".join(re.findall("[A-Za-z]+",title)))
    titles_df = pd.concat([titles_df.copy(deep=True), pd.DataFrame(t_list, columns=["title"])], axis=1)

    # new column 'keep_title'
    keep_title_df = pd.DataFrame([1 for i in range(titles_df.shape[0])], columns=['keep_title'])
    titles_df = pd.concat([titles_df.copy(deep=True), keep_title_df], axis=1)
    
    return(titles_df)

## 1. Scholar

### 1.1. Loading the titles

In [3]:
_path_scholar_titles="../1_initial_search/saved_results_scholar/scholar_results.csv"
df = pd.DataFrame(pd.read_csv(_path_scholar_titles))

my_columns = ['title', 'result_id', 'publication_info_summary', 'link']
titles_df = prepare_titles_df(df, my_columns, 'title')

# just counting the titles
titles = [title for title in titles_df['title'].tolist()]
titles_nb = len(titles)
print("Number of titles: ", titles_nb)

Number of titles:  5822


### 1.2. Observation titles duplicates

We add the column 'count' to the dataframe title_df. This columns contains the number of times the title appears in the results.

Here, we just observe what the duplicates look like. Duplicates will be remove later on.

In [4]:
titles_dict_3 = dict(Counter(titles))
title_counts = []
for i, row in titles_df.iterrows():
    title_counts.append(titles_dict_3[row['title']])

count_df = pd.DataFrame(title_counts, columns=['count'])
titles_df = pd.concat([titles_df.copy(deep=True), count_df], axis=1)

dupl_titles_df = titles_df[titles_df['count'] > 1]
dupl_titles_set = set(dupl_titles_df['title'].to_list())


for title in dupl_titles_set:
    title_df = titles_df[titles_df['title'] == title]

    # UNCOMMENT HERE to check duplicates:
    # print("------------------------------------------")
    # print(title)
    # print(title_df[['result_id', 'publication_info_summary']])

In [5]:
for i, row in titles_df.iterrows():
    if row['count'] > 1:
        # All occurences of the same title are stored in df_tmp
        df_tmp = titles_df.loc[titles_df['title'] == row['title']]

        # UNCOMMENT HERE to check duplicates:
        # display(df_tmp[['original_title', 'link', 'publication_info_summary']])

In [6]:
# About duplicates
titles_dict_2 = dict(Counter(titles))
titles_dict_sorted_2 = dict(sorted(titles_dict_2.items(), key=lambda item: item[1], reverse = True))
one = False

# UNCOMMENT HERE to check duplicates:
# for key in titles_dict_sorted_2.keys():
#     if titles_dict_sorted_2[key] > 1:
#         print(titles_dict_sorted_2[key], key)
# print(titles_dict_sorted_2)

### 1.3. Extract and count titles' words

Below, we concatenate all titles, in lower case and without punctuation.

Then, we create a dictionnary of all words contained in this concatenation, with the associated number of occurences of the words.

In [7]:
titles_list = df['title'].tolist()
concat_titles = " ".join(titles_list).lower()
remove_punctutation = re.findall("[A-Za-z]+",concat_titles)
titles_dict = dict(Counter(remove_punctutation))
titles_dict_sorted = dict(sorted(titles_dict.items(), key=lambda item: item[1], reverse = True))
words_df = pd.DataFrame(titles_dict_sorted.items(), columns=['Word', 'InstancesNumber'])

# UNCOMMENT HERE to save:
# words_df.to_csv("file_name.csv") # to save

**The following step is done manually:** 

Then, manually we modify the csv file saved in (the last commented line of) the above cell. 

More precisely, we add a column in which we enter 1 if the word is an excluding word, 999 if we are not sure, an nothing otherwise.

### 1.4. Load csv with identified excluding words to apply the selection of titles

In [8]:
_path_scholar_titles="./0_selected_excluding_words/all_words_2.csv"

df = pd.DataFrame(pd.read_csv(_path_scholar_titles, sep=';', usecols = [1, 2, 3]))

# Take the list of selected words
df_select = df[df["Select"] == 1][["Word", "InstancesNumber"]]
words_to_exclude = df_select["Word"].to_list()
# print(words_to_exclude)
print('Number of words to exclude: ', len(words_to_exclude))

Number of words to exclude:  449


### 1.5. Apply the selection based on these excluding words

In [9]:
words_1 = words_to_exclude
titles_df_1 = apply_excluding_words(words_1, titles_df)
check_proportions(words_1, titles_df_1, titles_df) # to check the results of this selection step

Number of words to exclude:  449
Number articles removed:  2545
Number articles newly removed:  2545
Number articles preserved:  3277
Check:  5822 == 5822


In the next 2 sections, we look for new excluding words in another manner. 
First, in Section 1.6, we check what 'children words' of the identified excluding words (in ``word_1``) are contained in the titles.
Secondly, in Section 1.7

### 1.6. Check what are the words in the current title that have one of the identified 'excluding words' as root

Here, we save in the dictionary ``d`` the 'children words' (of excluding words) present in the titles.

In this dictionary, keys are the identified excluding words and values are corresponding lists of 'children words'.

In [10]:
new_words_parent_list = ['plants', 'grid', 'thermal', 'battery', 'water', 'health', \
'industrial', 'wave', 'particle', 'grids', 'home', 'risk', 'protein', 'iran', \
'reactor', 'waste', 'turbine', 'market', 'converter', 'spectral', 'urbanization', 'suburban', \
'failure', 'outage', 'molecular', 'space', 'fluid', 'magnetic', 'province', 'chemical', \
'indoor', 'alloy', 'room', 'india', 'satellite', 'food', 'machinery', 'forest', 'electrons', \
'spectra', 'wood', 'mechanical', 'vibration', 'reservoir', 'kinetic', 'hydro', 'diesel', 'motor', \
'microgrid', 'accident', 'railway', 'pump', 'robot', 'immune', 'pollutant', 'drug', 'house', \
'waves', 'business', 'shower', \
]

d = {}

cpt = 0

# Loop over all titles:
for i, row in titles_df.iterrows():
# for title in titles:
    
    title_no_punct = " ".join(re.findall("[A-Za-z]+",titles_df.loc[i,'title']))
    
    for ele in words_1:
        if cpt == 0:
            d[ele] = []
        title_words = title_no_punct.lower().split()
        for word in title_words:
            if ele.lower() in word.lower():
                if word.lower() not in d[ele] and word.lower() != ele.lower():
                    d[ele].append(word.lower())
        ### UNCOMMENT BELOW to make observations on the children of the excluding words:
        ### Here, we exclude from this observation ecluding words whose children have been selected in the Section 1.7
        ### in order to make observation about other possible children to select.
        # if ele not in ['ion', 'us', 'ant', 'cement', 'solid', 'sea', 'oil', 'mine', 'dam', 'city']+new_words_parent_list:
        #     if ele in title_no_punct.lower() and ele not in title_no_punct.lower().split():
        #         print('----------------------')
        #         print(ele)
        #         print(title_no_punct.lower())
    
    cpt+=1

We check which keys in d have a lot of "children", and choose whether or not to add these children or part of the list of words to be excluded.

To add the children of an excluding word, we add `` d['your excluding word']`` to the list ``new_word`` in Section 1.7 below.

In [11]:
for key in d.keys():
    print(key, d[key])

plant ['plants', 'implantable', 'powerplants']
nuclear ['internuclear']
pv ['pvt', 'spv', 'bipv', 'pvusa']
plants ['powerplants']
grid ['microgrid', 'grids', 'microgrids', 'supergrids']
applications []
thermal ['geothermal', 'hydrothermal']
gas ['fpgas', 'gastric', 'biogas', 'gasbxas', 'gases', 'gasification', 'gasoline', 'gasverbrauchsprognose', 'pegasus']
battery ['batteryless']
heat ['heated', 'wheat', 'heating', 'heatwave', 'heatwaves', 'reheaters', 'heaters', 'superheater', 'superheated']
water ['wastewater', 'groundwater', 'seawater', 'underwater', 'freshwater']
health ['healthcare']
fog []
air ['aircraft', 'pairwise', 'dairy', 'wheelchair', 'flair', 'fairness', 'clairvoyant', 'nucleaires', 'repair', 'impairment', 'airport', 'airfoil', 'airflow', 'paired', 'pairing', 'fair']
residential []
industrial ['industrialization']
fuel ['fuels', 'biofuels', 'fueled']
laser ['hilaserion', 'lasers']
coal ['charcoal']
heating []
economic ['socioeconomic', 'macroeconomic', 'economics']
househ

### 1.7. New exclusions based on the children words

In [12]:
new_words = d['plants'] + d['grid'] + d['thermal'] + d['battery'] + d['water'] + d['health'] + \
             d['industrial'] + d['wave'] + d['particle'] + d['grids'] + d['home'] + d['risk'] + d['protein'] + d['iran'] + \
             d['reactor'] + d['waste'] + d['turbine'] + d['market'] + d['converter'] + d['spectral'] + ['urbanization', 'suburban'] + \
             d['failure'] + d['outage'] + d['molecular'] + d['space'] + d['fluid'] + d['magnetic'] + d['province'] + d['chemical'] + \
             d['indoor'] + d['alloy'] + d['room'] + d['india'] + d['satellite'] + d['food'] + d['machinery'] + d['forest'] + ['electrons'] + \
             d['spectra'] + d['wood'] + d['mechanical'] + d['vibration'] + d['reservoir'] + d['kinetic'] + d['hydro'] + d['diesel'] + \
             d['motor'] + d['microgrid'] + d['accident'] + d['railway'] + d['pump'] + d['robot'] + d['immune'] + d['pollutant'] + d['drug'] + \
             d['house'] + d['waves'] + d['business'] + d['shower']

print('Number of new words to exclude (1): ', len(new_words))

# We use the above words to create new_words_parent_list

Number of new words to exclude (1):  127


In [13]:
new_words_2 = ['householders', 'household', 'households', 'houses', 'warehouses'] + d['steel'] + ['fuels', 'biofuels'] + \
['carbonyls', 'carcinoma', 'cardiovascular', 'carrier', 'cardiopulmonary', 'myocardial', 'hydrocarbon', 'cars', \
'healthcare', 'adenocarcinoma', 'cardiac', 'carcass', 'cargo', 'cardiomyopathy'] + d['molecule'] + d['africa'] + \
d['economic'] + d['membrane'] + ['gases', 'gasification', 'gasoline', 'gastric', 'biogas'] + d['drone'] + \
['mills'] + ['corneal'] + d['uav'] + ['wildfire', 'firefly'] + ['urbanization', 'suburban'] + d['orbit'] + ['converters'] + \
d['neutron'] + d['hydroelectric'] + ['atomization', 'atomistic', 'atomic', 'atoms'] + d['hospital'] + ['electrons'] + \
d['boiler'] + d['vessel'] + d['sulfur'] + d['oxygen'] + d['nuclear'] + d['chamber'] + d['coal'] + d['pv']

print('Number of new words to exclude (2): ', len(new_words_2))

# new_words_parent_list_2 = ...

Number of new words to exclude (2):  67


In [14]:
# update of the exclusion procedure:

words_2 = new_words + new_words_2
titles_df_2 = apply_excluding_words(words_2, titles_df_1)
check_proportions(words_2, titles_df_2, titles_df_1)

check = 0
for i, row in titles_df_2.iterrows():
    check += int(titles_df_2.loc[i, 'keep_title'] == titles_df_1.loc[i, 'keep_title'])
print(check)

Number of words to exclude:  194
Number articles removed:  2638
Number articles newly removed:  93
Number articles preserved:  3184
Check:  5822 == 5822
5729


### 1.8. New exclusions based on analysis of remaining titles

Here, we manually look for more excluding words.

In [15]:
words_3 = ['state estimation', 'power line', 'distribution line', 'transmission line', 'anomaly detection',\
              'fraud detection', 'fault detection', 'fault prediction']
titles_df_3 = apply_excluding_2_words(words_3, titles_df_2)
check_proportions(words_3, titles_df_3, titles_df_2)

Number of words to exclude:  8
Number articles removed:  2762
Number articles newly removed:  124
Number articles preserved:  3060
Check:  5822 == 5822


In [16]:
words_4 = ['shear walls', 'aircraft', 'metal forming processes', 'energy system', 'energy systems', \
                'energy harvesting', 'renewable energy', 'energy storage', 'energy expanditure' \
                'free energy estimation', 'hybrid energy']
titles_df_4 = apply_excluding_2_words(words_4, titles_df_3)
check_proportions(words_4, titles_df_4, titles_df_3)

Number of words to exclude:  10
Number articles removed:  2828
Number articles newly removed:  66
Number articles preserved:  2994
Check:  5822 == 5822


In [17]:
# UNCOMMENT HERE to create a csv file of pair of words from the remaining titles:
# make_2words_csv(titles_df_3)

In [18]:
_path=".\\0_selected_excluding_words\\twoWords_20230705-115941.csv"
df = pd.DataFrame(pd.read_csv(_path, sep=';', usecols = [1, 2, 3]))

# Take the list of selected words
df_select = df[df["Select"] == 1][["TwoWords", "InstancesNumber"]]
words_5 = df_select["TwoWords"].to_list() + ["power distribution system", "power distribution network"]

titles_df_5 = apply_excluding_2_words(words_5, titles_df_4)
check_proportions(words_5, titles_df_5, titles_df_4)

Number of words to exclude:  27
Number articles removed:  3233
Number articles newly removed:  405
Number articles preserved:  2589
Check:  5822 == 5822


In [19]:
# Uncomment HERE to create a csv file of pair of words from the remaining titles:
# make_2words_csv(titles_df_5)

In [20]:
preserved = titles_df_5[titles_df_5['keep_title'] == 1]['title']

# Uncomment HERE to print the preserved titles:
# for title in preserved:
#     print(title)

### 1.9. Saving the final results

In [21]:
str_current_datetime = datetime.now().strftime("%Y%m%d-%H%M%S")
file_name = "scholar_keep_"+str_current_datetime+".csv"
# Uncomment HERE to save the results:
# titles_df_5[titles_df_5['keep_title'] == 1].to_csv(file_name) # to save the results

**Remaining titles:**

In [22]:
remaining_titles = titles_df_5[titles_df_5['keep_title'] == 1]['original_title']
nb_remaining_titles = len(remaining_titles)
print("Number of remaining titles:", nb_remaining_titles)
print(remaining_titles)

Number of remaining titles: 2589
0       Estimation of energy consumption in machine le...
1       GPGPU performance and power estimation using m...
2       Mlee: Method level energy estimationâ€”a machine...
3       How to measure energy consumption in machine l...
5       Machine Learning Based Power Estimation for CM...
                              ...                        
5809    Power consumption prediction model and method ...
5813    Scheduling Algorithms for Federated Learning w...
5815    Resource Optimization and Device Scheduling fo...
5816    Balanced energy consumption based on historica...
5820    Energy-efficient in-situ monitoring using on-d...
Name: original_title, Length: 2589, dtype: object


**Test:**

Here we try removing all excluding words and pair of words at once for google scholar. We observe that we obtain the same resulst as before (with the step by step process).

In [23]:
words = words_1 + words_2 + words_3 + words_4 + words_5
titles_df_test = apply_excluding_2_words(words, titles_df)
check_proportions(words, titles_df_test, titles_df)

Number of words to exclude:  688
Number articles removed:  3233
Number articles newly removed:  3233
Number articles preserved:  2589
Check:  5822 == 5822


## 2. IEEE

### 2.1. Applying the excluding words

In [24]:
# create smaller df as for scholar - keep DOI as ID
_path_ieee="../1_initial_search/saved_results_other_datasources/ieee/export2023.06.12-11.30.07.csv"
df = pd.DataFrame(pd.read_csv(_path_ieee))

my_columns = ['Document Title', 'ISBNs', 'DOI', 'Publication Title', 'PDF Link', 'Abstract', \
              'Authors', 'Author Keywords', 'Publication Year']
titles_df_ieee = prepare_titles_df(df, my_columns, 'Document Title')

# Apply excluding words:
words = words_1 + words_2 + words_3 + words_4 + words_5
titles_df_ieee_selected = apply_excluding_2_words(words, titles_df_ieee)
check_proportions(words, titles_df_ieee_selected, titles_df_ieee)

Number of words to exclude:  688
Number articles removed:  176
Number articles newly removed:  176
Number articles preserved:  421
Check:  597 == 597


### 2.2. Importing the results into Zotero (reference managing software)

We use DOIs and ISBNs to import IEEE results into zotero. DOIs and ISNBs should be imported separately. In the DOIs list one of the articles appears two times with two different DOIs.

In [25]:
# import into zotero first_pool folder:
ieee_df = titles_df_ieee_selected[titles_df_ieee_selected['keep_title']==1]

str_current_datetime = datetime.now().strftime("%Y%m%d-%H%M%S")
file_name = "ieee_df_"+str_current_datetime+".csv"
# Uncomment HERE to save the results:
# ieee_df.to_csv(file_name) 

ISBNs = ieee_df[ieee_df['DOI'].isnull().values]['ISBNs'].to_list()
DOIs = ieee_df[ieee_df['DOI'].isnull().values == False]['DOI'].tolist()

print("x ---------------------------- x")
print(", ".join(ISBNs))
print("x ---------------------------- x")
print(", ".join(DOIs))
print("x ---------------------------- x")
print(len(DOIs))
print(len(ISBNs))
print(len(DOIs + ISBNs))

#look for duplicates with different DOIs:
test = ieee_df['title'].tolist()
test_dict = dict(Counter(test))
for key in test_dict.keys():
    if test_dict[key]>1:
        print(key)

x ---------------------------- x
978-3-8396-0439-7, 978-1-905824-26-7, 978-1-4503-4186-8, 978-1-4244-6747-1, 978-989-758-234-9, 978-1-4244-4165-5, 978-1-889335-42-1, 978-1-4503-5732-6, 978-3-901882-46-3, 978-89-88678-55-8
x ---------------------------- x
10.1109/EFEA56675.2022.10063818, 10.1109/NBiS.2016.88, 10.1109/ICAAIC53929.2022.9793147, 10.1109/ICELIE.2006.347212, 10.1109/WAINA.2015.127, 10.1109/BWCCA.2015.79, 10.1109/IECON.2006.348098, 10.1109/ICECIE52348.2021.9664681, 10.1109/ITOEC53115.2022.9734517, 10.1109/VLHCC.2014.6883045, 10.1109/CSO.2009.451, 10.1109/HONET.2016.7753443, 10.1109/MySEC.2015.7475216, 10.1109/ELTECH.2019.8839589, 10.1109/ICPADS.2016.0101, 10.1109/ICOECS50468.2020.9278506, 10.1109/ICE.2017.8279878, 10.1109/ICCASIT55263.2022.9986683, 10.1109/ICEIC51217.2021.9369725, 10.1109/ISCID.2018.10169, 10.1109/ICCNC.2019.8685588, 10.1109/UralCon54942.2022.9906691, 10.1109/ICACTE55855.2022.9943608, 10.1109/CISIS.2015.18, 10.1109/ICACCE.2015.148, 10.1109/ACSSC.2017.8335698,

## 3. ACM

In [26]:
_path_acm="../1_initial_search/saved_results_other_datasources/acm/first_pool_acm.csv"
df = pd.DataFrame(pd.read_csv(_path_acm, encoding = 'unicode_escape', sep = ';'))

my_columns = ['Title', 'ISBN', 'ISSN', 'DOI', 'URLs', 'Proceedings title', 'Abstract', 'Authors', \
              'Journal', 'Publication year', 'Date published', 'URLs']
titles_df_acm = prepare_titles_df(df, my_columns, 'Title')

# Apply excluding words:
words = words_1 + words_2 + words_3 + words_4 + words_5
titles_df_acm_selected = apply_excluding_2_words(words, titles_df_acm)
check_proportions(words, titles_df_acm_selected, titles_df_acm)

str_current_datetime = datetime.now().strftime("%Y%m%d-%H%M%S")
file_name = "acm_df_"+str_current_datetime+".csv"
acm_df = titles_df_acm_selected[titles_df_acm_selected['keep_title']==1]
# Uncomment HERE to save the results:
# acm_df.to_csv(file_name) 

acm_df_removed = titles_df_acm_selected[titles_df_acm_selected['keep_title']==0]
acm_df_removed.to_csv('acm_removed.csv') 

Number of words to exclude:  688
Number articles removed:  56
Number articles newly removed:  56
Number articles preserved:  137
Check:  193 == 193


In [27]:
### Not all element have an identifier (doi, isbn, issn). So we rather work with bib file to import in zotero.

# acm_df = titles_df_acm_selected[titles_df_acm_selected['keep_title']==1]
# DOIs = acm_df[acm_df['DOI'].isnull().values == False]['DOI'].to_list()
# acm_df_no_doi = acm_df[acm_df['DOI'].isnull().values]
# ISBNs = acm_df_no_doi[acm_df_no_doi['ISBN'].isnull().values == False]['ISBN'].tolist()
# acm_df_no_doi_or_isbn = acm_df_no_doi[acm_df_no_doi['ISBN'].isnull().values]
# # ISSNs = acm_df_no_doi_or_isbn[acm_df_no_doi_or_isbn['ISSN'].isnull().values == False]

# # print(ISSNs)

# print("x ---------------------------- x")
# print(", ".join(ISBNs))
# print("x ---------------------------- x")
# print(", ".join(DOIs))
# print("x ---------------------------- x")
# print(len(DOIs))
# print(len(ISBNs))
# print("check: ", acm_df.shape[0], " == ", len(DOIs + ISBNs))
# # print(len(ISSNs))
# # print(len(DOIs + ISBNs + ISSNs))

**Remark:** We notice that Zotero has created some duplicates. Compared with the bib file directly downloaded from ACM, the csv file generated by zootero contains more results.

In [28]:
with open("../1_initial_search/saved_results_other_datasources/acm/first_pool_acm.bib", "r") as f:
    text = f.read()
bib_list = text.split('\n@')
bib_reconstr = '\n@'.join(bib_list)
bib_dict = {}
for elem in bib_list:
    if elem[0]!='@':
        elem = '@'+elem
    title_match = re.search(r'title = {([^}]*)}', elem)
    title = title_match.group(1)
    bib_dict[title] = elem

# print(bib_dict)

print(len(bib_dict))

bib_list_keep = []
for key in bib_dict.keys():
    row = titles_df_acm_selected[titles_df_acm_selected['original_title'] == key]
#     print(key)
    if row['keep_title'].values[0] == 1:
        bib_list_keep.append(bib_dict[key])

print(len(bib_list_keep))

bib_reconstr_keep = '\n'.join(bib_list_keep)
    
# Uncomment HERE to save:
# with open("...\\1_initial_search\\saved_results_other_datasources\\acm\\test.bib", "w") as f:
#     f.write(bib_reconstr_keep)

185
132
