This notebook searches for the ISSN related to each DOI in the COCI dataset. Since loading the entire dataset (30 GB unzipped) into memory is impossible, I download each zipped folder (since this happens remotely, the process takes very little time), unzip it into Colab temporary storage and work with the individual .csv files. 

In [None]:
%%capture
!pip install crossrefapi
from crossref.restful import Works

In [1]:
import os
import csv
import pandas as pd
import glob
import zipfile
import json
import time

To create a sample of the dataset, I chose to work only with works that were published during 2020. To do so, I download the dumps released after 2020-01-01 and filter them by date using Pandas. Finally, I create as many .csv files as there are original ones. 

In [None]:
%%capture 
# 2020 dumps: fetch five zipped archives from figshare with wget.
# The unzip cell that follows expects each archive at /content/<file id>,
# i.e. saved under the numeric id at the end of its URL.
!wget https://figshare.com/ndownloader/files/28988055
!wget https://figshare.com/ndownloader/files/25713470
!wget https://figshare.com/ndownloader/files/24594644
!wget https://figshare.com/ndownloader/files/23681441
!wget https://figshare.com/ndownloader/files/22661558

In [None]:
%%capture 
# 2020 dumps: extract every downloaded archive into the shared folder
# /content/opencitationunzipped, then delete the archive right away so the
# zipped and unzipped copies do not both stay on disk.
!unzip /content/28988055 -d /content/opencitationunzipped #unzip the downloaded folder
!rm /content/28988055
!unzip /content/25713470 -d /content/opencitationunzipped #unzip the downloaded folder
!rm /content/25713470
!unzip /content/24594644 -d /content/opencitationunzipped #unzip the downloaded folder
!rm /content/24594644
!unzip /content/23681441 -d /content/opencitationunzipped #unzip the downloaded folder
!rm /content/23681441
!unzip /content/22661558 -d /content/opencitationunzipped #unzip the downloaded folder
!rm /content/22661558

In [2]:
%%capture
# 2021 dumps: fetch four zipped archives from figshare with wget.
# NOTE(review): file 28988055 is also downloaded by the 2020 cell, where it is
# removed again after extraction - confirm it is meant to belong to both sets.
!wget https://figshare.com/ndownloader/files/28988055
!wget https://figshare.com/ndownloader/files/28988502
!wget https://figshare.com/ndownloader/files/30645060
!wget https://figshare.com/ndownloader/files/31555355

In [None]:
%%capture
# 2021 dumps: extract every downloaded archive into /content/opencitationunzipped
# (the same folder used for the 2020 dumps) and delete the archive afterwards.
!unzip /content/28988502 -d /content/opencitationunzipped
!rm /content/28988502 
!unzip /content/30645060 -d /content/opencitationunzipped
!rm /content/30645060 
!unzip /content/31555355 -d /content/opencitationunzipped
!rm /content/31555355 
!unzip /content/28988055 -d /content/opencitationunzipped
!rm /content/28988055

In [None]:
#cell for elaborating only 2021 citations
# For every unzipped COCI .csv, keep only the rows whose 'creation' date falls
# in 2021 and write them, under the same file name, to the filtered folder.
# NOTE: the variable and the output folder keep the "2020" suffix used by the
# rest of the notebook even though the filter below selects 2021.
os.makedirs('/content/opencitationunzipped2020/', exist_ok=True) #to_csv does not create a missing folder
list_csv = [el for el in os.listdir('/content/opencitationunzipped/') if '.csv' in el]
for el in list_csv:
  print(el)
  df = pd.read_csv('/content/opencitationunzipped/'+el, parse_dates=['creation'])
  df['creation']= pd.to_datetime(df['creation']) #make sure the column is datetime even if read_csv left it as text
  #half-open interval: also keeps rows stamped later than midnight on 2021-12-31
  df_2020 = df[(df['creation'] >= '2021-01-01') & (df['creation'] < '2022-01-01')] #filter by date
  #no reset_index needed: the index is not written (index=False)
  df_2020.to_csv('/content/opencitationunzipped2020/'+ el, index=False)

In [None]:
# Spot-check: reload one of the filtered files and display it.
df_2020 = pd.read_csv(
    '/content/opencitationunzipped2020/2020-06-13T18:18:05_1.csv'
)
df_2020

Unnamed: 0,oci,citing,cited,creation,timespan,journal_sc,author_sc
0,0200202020001361818141237020000070809050114370...,10.22201/iiec.20078951e.2020.201.69395,10.1016/s0014-2921(01)00115-5,2020-03-10,P18Y10M,no,no
1,0200202020001361818141237020000070809050114370...,10.22201/iiec.20078951e.2020.201.69395,10.1111/1475-4991.00060,2020-03-10,P17Y6M,no,no
2,0200202020001361818141237020000070809050114370...,10.22201/iiec.20078951e.2020.201.69395,10.1093/wber/lht004,2020-03-10,P7Y0M19D,no,no
3,0200202020001361818141237020000070809050114370...,10.22201/iiec.20078951e.2020.201.69395,10.1016/j.econedurev.2012.05.002,2020-03-10,P7Y5M,no,no
4,0200202020001361818141237020000070809050114370...,10.22201/iiec.20078951e.2020.201.69395,10.1080/10627197.2015.1093926,2020-03-10,P4Y5M8D,no,no
...,...,...,...,...,...,...,...
5782788,0200303090036181922280201000903020805-02004010...,10.3390/ijms21093285,10.4103/jcar.jcar_17_18,2020-05-06,P2Y,no,no
5782789,0200303090036181922280201000903020805-02001080...,10.3390/ijms21093285,10.18632/oncotarget.13798,2020-05-06,P3Y5M1D,no,no
5782790,0200303090036181922280201000903020805-02001000...,10.3390/ijms21093285,10.1038/nrd2926,2020-05-06,P10Y9M,no,no
5782791,0200303090036181922280201000903020805-02001000...,10.3390/ijms21093285,10.1073/pnas.0408864102,2020-05-06,P15Y3M24D,no,no


This function loads the cleaned Crossref dataset into a Pandas DataFrame and, for each COCI .csv file, searches for the ISSN of the DOIs. It then creates a .json file that records which ISSNs have been mentioned by each citing DOI (i.e. the journals mentioned by a DOI) and how many times that has happened. 
With the Colab RAM limit you can only process a single .csv; with Amazon SageMaker you might be able to process at least 5 .csv files (12 vs 16 GB of RAM).

In [None]:
def get_issn_crossref(coci_files,
                      crossref_path='/content/drive/MyDrive/Colab_Notebooks/opencitations/total_crossref_pulito_final.csv',
                      output_json='all_2020.json',
                      citing_not_found_csv='citing_not_found.csv',
                      cited_not_found_csv='cited_not_found.csv'):
  """Count, for every citing DOI, how many times it cites each ISSN.

  The Crossref table (columns 'doi' and 'issn') is loaded once and indexed by
  DOI. Every COCI file is then streamed row by row; column 1 is read as the
  citing DOI and column 2 as the cited DOI. A citation is counted only when
  both DOIs are present in the Crossref table.

  Args:
    coci_files: iterable of paths to COCI .csv files (first row is a header).
    crossref_path: path of the Crossref doi -> issn .csv.
    output_json: file receiving a JSON list of
      {'issn': <citing issn>, 'has_cited_n_times': {<cited issn>: count}}.
    citing_not_found_csv: file receiving the citing DOIs missing from Crossref.
    cited_not_found_csv: file receiving the cited DOIs missing from Crossref.

  Returns:
    None. Results are written to the three output files and a short summary is
    printed.

  NOTE: a later cell of this notebook defines another function with the same
  name; whichever cell ran last is the one that gets called.
  """
  df_cross = pd.read_csv(crossref_path, engine = 'c',
                         dtype={"doi": 'string', "issn": "string"}) #engine c takes 1m 45s and loads 9GB RAM + 3m 34s and reaches 10.4 GB RAM
  df_cross.set_index('doi', inplace = True)#set the index at the DOI. This FUNDAMENTAL to make the process reasonably fast
  # NOTE(review): .at returns a Series when the DOI index holds duplicates and
  # pd.NA when the issn cell is empty - presumably the cleaned file has neither.
  memory_dict = {}
  set_not_found_citing = set()
  set_not_found_cited = set()
  for coci in coci_files:
    print(coci)
    with open(coci, 'r', newline='') as csv_file: #read line by line the OC dataset and get citing and cited
      csv_reader = csv.reader(csv_file, delimiter=',')
      next(csv_reader) #skip the header
      for row in csv_reader:
        citing = row[1] #1 if using normal csv instead of 2021
        cited = row[2] #2 if using normal csv instead of 2021
        entry = memory_dict.get(citing)
        if entry is None: #citing DOI not resolved yet
          if citing in set_not_found_citing:
            continue
          try:
            issn_citing = df_cross.at[citing, 'issn'] #first search for the citing issn
          except KeyError:
            set_not_found_citing.add(citing)
            continue
        if cited in set_not_found_cited: #already known to be missing: skip the lookup
          continue
        try:
          issn_cited = df_cross.at[cited, 'issn'] #then search for cited issn
        except KeyError:
          #recorded for new and already-known citing DOIs alike, so the
          #not-found report is complete
          set_not_found_cited.add(cited)
          continue
        if entry is None:
          memory_dict[citing] = {'issn': issn_citing,
                                 'has_cited_n_times': {issn_cited: 1}}
        else:
          counts = entry['has_cited_n_times']
          counts[issn_cited] = counts.get(issn_cited, 0) + 1
  with open(output_json, 'w') as fp:
    json.dump(list(memory_dict.values()), fp) #dump only the values (a list of dicts): dropping the DOI keys reduces the output size

  #counters to check if everything works right
  print('lenght of dict: ', len(memory_dict.keys()))
  print('Number set not found citing: ', len(set_not_found_citing))
  print('Number set not found cited: ', len(set_not_found_cited))

  #one DOI per row; the with-blocks close the files on exit
  with open(citing_not_found_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for line in set_not_found_citing:
      writer.writerow([line])
  with open(cited_not_found_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for line in set_not_found_cited:
      writer.writerow([line])


In [None]:
# Run the ISSN matching over every filtered COCI .csv file.
get_issn_crossref([
    os.path.join('/content/opencitationunzipped2020/', name)
    for name in os.listdir('/content/opencitationunzipped2020/')
    if '.csv' in name
])

/content/opencitationunzipped2020/2021-01-27T23:11:15_8.csv
/content/opencitationunzipped2020/2020-04-25T04:48:36_5.csv
/content/opencitationunzipped2020/2020-06-13T18:18:05_1.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_4.csv
/content/opencitationunzipped2020/2020-04-25T04:48:36_1.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_6.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_5.csv
/content/opencitationunzipped2020/2020-06-13T18:18:05_2.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_1.csv
/content/opencitationunzipped2020/2020-11-22T17:48:01_1.csv
/content/opencitationunzipped2020/2020-04-25T04:48:36_4.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_9.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_2.csv
/content/opencitationunzipped2020/2020-11-22T17:48:01_2.csv
/content/opencitationunzipped2020/2021-01-27T23:11:15_3.csv
/content/opencitationunzipped2020/2020-04-25T04:48:36_2.csv
/content/opencitationunzipped2020/2020-1

In [None]:
# Copy the JSON written by get_issn_crossref to the results folder on Drive.
!cp /content/all_2020.json /content/drive/MyDrive/Colab_Notebooks/opencitations/results

In [None]:
def get_issn_crossref(coci_files, works=None,
                      crossref_path='/content/drive/MyDrive/Colab_Notebooks/opencitations/total_crossref_pulito_final.csv',
                      output_json='23681441_2020_4.json',
                      citing_delay=0.1, cited_delay=0.2):
  """Count, for every citing DOI, how many times it cites each ISSN, falling
  back to the Crossref REST API for DOIs missing from the local table.

  Every DOI is resolved the same way: first in the local Crossref table
  (columns 'doi' and 'issn', indexed by DOI), then through works.doi(doi).
  API answers are cached, so each DOI is requested at most once.

  Fixes compared with the previous implementation:
    * a cited DOI resolved through the API no longer replaces the existing
      entry of the citing DOI (which wiped its counts); it is added to it;
    * the API returns 'ISSN' as a list, which cannot be a dict key, so every
      API-resolved cited DOI used to end up as "not found"; the list is now
      turned into a string;
    * only Exception is caught around the API call, so Ctrl-C still works;
    * the "not found" totals now count distinct DOIs.

  Args:
    coci_files: iterable of paths to COCI .csv files (first row is a header;
      column 1 is read as the citing DOI, column 2 as the cited DOI).
    works: object exposing doi(doi) -> dict or None. Defaults to Works(), the
      client imported in the first cell of the notebook.
    crossref_path: path of the Crossref doi -> issn .csv.
    output_json: file receiving a JSON list of
      {'issn': <citing issn>, 'has_cited_n_times': {<cited issn>: count}}.
    citing_delay: seconds to wait after an API request for a citing DOI.
    cited_delay: seconds to wait after an API request for a cited DOI.

  Returns:
    None. The result is written to output_json and a summary is printed.
  """
  if works is None:
    works = Works()
  df_cross = pd.read_csv(crossref_path, engine = 'c',
                         dtype={"doi": 'string', "issn": "string"}) #engine c takes 1m 45s and loads 9GB RAM + 3m 34s and reaches 10.4 GB RAM
  df_cross.set_index('doi', inplace = True)#set the index at the DOI. This FUNDAMENTAL to make the process reasonably fast
  memory_dict = {}
  set_not_found_citing = set()
  set_not_found_cited = set()
  api_cache = {} #doi -> issn string (or None) already obtained from the API
  n_lines = 0

  def resolve_issn(doi, delay):
    """Return the ISSN string of doi (local table, then API) or None."""
    if doi in api_cache:
      return api_cache[doi]
    try:
      return df_cross.at[doi, 'issn']
    except KeyError:
      pass
    try:
      record = works.doi(doi)
    except Exception: #best effort: a failed request is handled like an unknown DOI
      record = None
    if delay > 0:
      time.sleep(delay) #be polite with the API after every request
    issn = record.get('ISSN') if isinstance(record, dict) else None
    if isinstance(issn, (list, tuple)):
      # NOTE(review): the sample outputs of this notebook show keys such as
      # "'0143-4160'", so every ISSN is wrapped in single quotes here; confirm
      # how the cleaned CSV encodes journals that have several ISSNs.
      issn = ', '.join("'{}'".format(item) for item in issn)
    api_cache[doi] = issn if issn else None
    return api_cache[doi]

  for coci in coci_files:
    print(coci)
    with open(coci, 'r', newline='') as csv_file: #read line by line the OC dataset and get citing and cited
      csv_reader = csv.reader(csv_file, delimiter=',')
      next(csv_reader) #skip the header
      for row in csv_reader:
        n_lines += 1
        citing = row[1] #1 if using normal csv instead of 2021
        cited = row[2] #2 if using normal csv instead of 2021
        entry = memory_dict.get(citing)
        if entry is None: #citing DOI has no entry yet
          if citing in set_not_found_citing:
            continue
          issn_citing = resolve_issn(citing, citing_delay)
          if issn_citing is None:
            set_not_found_citing.add(citing)
            continue
        if cited in set_not_found_cited: #known to be unresolvable: no new lookup
          continue
        issn_cited = resolve_issn(cited, cited_delay)
        if issn_cited is None:
          set_not_found_cited.add(cited)
          continue
        if entry is None:
          memory_dict[citing] = {'issn': issn_citing,
                                 'has_cited_n_times': {issn_cited: 1}}
        else:
          counts = entry['has_cited_n_times']
          counts[issn_cited] = counts.get(issn_cited, 0) + 1
  with open(output_json, 'w') as fp:
    json.dump(list(memory_dict.values()), fp) #dump only the values (a list of dicts): dropping the DOI keys reduces the output size

  #counters to check if everything works right
  print('lenght of dict: ', len(memory_dict.keys()))
  print('total citing not found: ', len(set_not_found_citing))
  print('total cited not found: ', len(set_not_found_cited))
  print('Number of lines iterated: ', n_lines)

In [None]:
# Run the ISSN matching (with API fallback) over every filtered COCI .csv file.
get_issn_crossref([
    os.path.join('/content/opencitationunzipped2020/', name)
    for name in os.listdir('/content/opencitationunzipped2020/')
    if '.csv' in name
])

/content/opencitationunzipped2020/2021-01-27T23:11:15_8.csv


In [None]:
# Load one result file and list the citing DOIs with more than 1000 counted
# citations: total citations, number of distinct ISSNs, and the per-ISSN counts.
with open('/content/drive/MyDrive/Colab_Notebooks/opencitations/output_2020-04-25T04_48_36_1.json', 'r') as fp:
  memory_dict = json.load(fp)
for key, values in memory_dict.items():
  unici = len(values['has_cited_n_times'])
  tot = sum(values['has_cited_n_times'].values())
  if tot <= 1000:
    continue
  print(key, tot, unici, values['has_cited_n_times'])
      

10.1080/10408444.2019.1692191 1246 378 {"'0143-4160'": 1, "'1096-0929'": 2, "'0167-4781'": 3, "'0167-4838'": 10, "'1756-2651'": 7, "'0002-7863'": 13, "'1087-0156'": 2, "'0027-8424'": 32, "'0264-6021'": 72, "'1460-2075'": 2, "'0009-2797'": 17, "'1462-2416'": 2, "'0163-7258'": 2, "'0006-2960'": 60, "'1570-9639'": 4, "'0006-2952'": 26, "'1932-6203'": 8, "'0003-2697'": 12, "'0003-9861'": 34, "'0893-228X'": 26, "'0929-8665'": 2, "'0968-0896'": 22, "'2192-8304'": 1, "'0387-7604'": 1, "'0014-5793'": 18, "'0021-9258'": 64, "'0006-2936'": 4, "'0046-8177'": 3, "'1750-1326'": 1, "'0076-6879'": 20, "'2157-9024'": 1, "'0365-6233'": 2, "'0041-008X'": 12, "'0090-9556'": 13, "'0022-2623'": 23, "'0362-1642'": 3, "'1570-1808'": 2, "'1054-3589'": 3, "'0360-2532'": 16, "'0049-8254'": 9, "'0888-7543'": 6, "'0020-711X'": 2, "'1949-2553'": 4, "'0223-5234'": 7, "'0306-4492'": 1, "'1434-6621'": 1, "'0014-2956'": 18, "'0193-1857'": 2, "'0098-2997'": 2, "'0306-3283'": 13, "'0091-6765'": 2, "'0005-2795'": 1, "'03

In [None]:
# Load one result file, print how many citing DOIs it holds, then show the
# per-ISSN counts of every entry whose own ISSN is '0028-0836'.
with open('/content/drive/MyDrive/Colab_Notebooks/opencitations/output_2020-04-25T04_48_36_1.json', 'r') as fp:
  memory_dict = json.load(fp)
print(len(memory_dict.keys()))
for key, values in memory_dict.items():
  if values['issn'] != "'0028-0836'":
    continue
  print(values['issn'], values['has_cited_n_times'])
