This notebook search the related ISSN of each DOI in the COCI dataset. Since loading the entire dataset (30 GB unzipped) into memory is impossible, I will download each zipped folder (since it works remotely the process takes very little time), unzip it in Colab temporary storage and work this the individual .csv files. 

In [1]:
import sqlite3
import csv
import pandas as pd
import glob
import zipfile
import json
import os
import time
from alive_progress import alive_bar

This function loads the cleaned Crossref dataset into a Pandas DataFrame and for each COCI .csv file it searches the ISSN of the DOIs. Then it creates a .json that records which ISSN has been mentioned by each citing DOI (i.e. the journals mentioned by a DOI) and how many times that has happened. 
With Colab RAM limit you can only run one single .csv, with Amazon SageMaker you might be able to process at least 5 .csv (12 vs 16 GB of ram).

In [4]:
def get_issn_crossref(coci_files): #usando il database del crossref pulito e indicizzato
  db_path = '/content/drive/MyDrive/Colab_Notebooks/opencitations/crossref_pulito_indexed.db'
  connection = sqlite3.connect(db_path)
  cursor = connection.cursor()
  memory_dict = {}
  set_not_found_citing = set()
  set_not_found_cited = set()
  for coci in coci_files:
    print(coci)
    with open(coci, 'r', encoding="utf8") as csv_file: #read line by line the OC dataset and get citing and cited
      csv_reader = csv.reader(csv_file, delimiter=',')
      row_count = sum(1 for row in csv_reader)  #count the number of rows for the progress bar
      csv_file.seek(0) #reset file and interator
      csv_reader = csv.reader(csv_file, delimiter=',')
      next(csv_reader)
      with alive_bar(row_count,force_tty=True) as bar:
        for row in csv_reader:
          citing = row[1] 
          cited = row[2] 
          if citing in memory_dict.keys(): #check if citing has been already searched
            rows = cursor.execute(
                  "SELECT doi, issn FROM articles WHERE doi = ?",
                          (cited,),).fetchall()  
            if len(rows) != 0:
              issn_cited = rows[0][1]
              issn_cited = issn_cited.split(', ')[0].strip("'").replace("-", "") #we are getting only the e-issn instead of the printed one
              if issn_cited in memory_dict[citing]['has_cited_n_times']:
                memory_dict[citing]['has_cited_n_times'][issn_cited.strip("''")] += 1
              else:
                memory_dict[citing]['has_cited_n_times'][issn_cited.strip("''")] = 1
            else:
              continue
          elif citing not in set_not_found_citing:
            rows = cursor.execute(
                  "SELECT doi, issn FROM articles WHERE doi = ?",
                          (citing,),).fetchall() 
            if len(rows) != 0: 
              issn_citing = rows[0][1]
              issn_citing = issn_citing.split(', ')[0].strip("'").replace("-", "")
              if cited not in set_not_found_cited:
                rows = cursor.execute(
                  "SELECT doi, issn FROM articles WHERE doi = ?",
                          (cited,),).fetchall()  
                if len(rows) != 0:
                  issn_cited = rows[0][1]
                  issn_cited = issn_cited.split(', ')[0].strip("'").replace("-", "")
                  memory_dict[citing] = {} 
                  memory_dict[citing]['issn'] = issn_citing.strip("''")
                  memory_dict[citing]['has_cited_n_times'] = {}
                  memory_dict[citing]['has_cited_n_times'][issn_cited.strip("''")] = 1
                else:
                  set_not_found_cited.add(cited)
            else:
              set_not_found_citing.add(citing)
          bar()
  with open('prova_db.json', 'w') as fp:
    json.dump(list(memory_dict.values()), fp) #transform the dict in a list of dicts to reduce the output size 

#counters to check if everything works right
  print('lenght of dict: ', len(memory_dict.keys()))
  print('Number set not found citing: ', len(set_not_found_citing))
  print('Number set not found cited: ', len(set_not_found_cited))

  with open('citing_not_found.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for line in set_not_found_citing:
      writer.writerow([line])
  csvfile.close()
  with open('cited_not_found.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    for line in set_not_found_cited:
      writer.writerow([line])
  csvfile.close()

  connection.close()

In [None]:
get_issn_crossref([r'E:/opencitation/6741422/2020-11-22T17_48_01_1-3/'+el for el in os.listdir('E:/opencitation/6741422/2020-11-22T17_48_01_1-3/') if '.csv' in el])

/content/opencitationsunzipped/2021-07-06T163745_1_3.csv
|███████████████████████████████████████▊⚠︎ (!) 9924912/10000001 [99%] in 1:44:59.7 (1575.47/s)                          
/content/opencitationsunzipped/2021-07-06T163745_0_5.csv
|███████████████████████████████████████▊⚠︎ (!) 9106152/9174360 [99%] in 1:38:57.3 (1533.72/s)                           
/content/opencitationsunzipped/2021-07-06T163745_1_2.csv
|███████████████████████████████████████▉⚠︎ (!) 9960590/10000001 [100%] in 1:40:35.0 (1650.48/s)                         
/content/opencitationsunzipped/2021-07-06T163745_2_3.csv
|███████████████████████████████████     | ▅▇▇ 8751673/10000001 [88%] in 1:41:41 (1434.5/s, eta: 14:30) 

In [None]:
with open('/content/prova_db.json', 'r') as fp:
  memory_dict = json.load(fp)
  for el in memory_dict:
    print(el)
    break


{'issn': '10242422', 'has_cited_n_times': {'00221155': 2, '09608524': 9, '00218561': 3, '12332356': 1, '1369703X': 1, '14641801': 1, '01410229': 1, '02786915': 1, '01460749': 1, '15178382': 1, '07349750': 2, '13645072': 1, '00448486': 1, '09239820': 2, '09242244': 1, '01681605': 1, '10504648': 1, '00221147': 1, '00063444': 1, '03088146': 1, '01757598': 1, '02732289': 1, '13595113': 1, '13811177': 1, '1319562X': 1, '00223573': 1, '16180240': 1, '20701667': 1, '13369075': 1, '18772641': 1, '10286276': 1, '2193567X': 1, '21615063': 1, '22126708': 1, '03781135': 1, '23269162': 1, '00401706': 1}}


In [None]:
get_issn_crossref([r'/content/opencitationsunzipped/'+el for el in os.listdir('/content/opencitationsunzipped/') if '.csv' in el]) #experiment with multiple csvs (2 gb of zipped files)

/content/opencitationsunzipped/2021-11-15T031921_3_1.csv
|███████████████████████████████████████▊⚠︎ (!) 9588020/9640790 [99%] in 1:02:06.6 (2572.85/s)                           
/content/opencitationsunzipped/2021-11-15T031921_2_1.csv
|███████████████████████████████████████▋⚠︎ (!) 9558581/9636140 [99%] in 37:38.0 (4233.20/s)                             
/content/opencitationsunzipped/2021-11-15T031921_1_1.csv
|███████████████████████████████████████▋⚠︎ (!) 9561919/9644388 [99%] in 37:17.8 (4272.98/s)                             
/content/opencitationsunzipped/2020-08-20T18:12:28_2.csv
|███████████████████████████████████████▏⚠︎ (!) 1675676/1711749 [98%] in 13:44.7 (2031.94/s)                             
/content/opencitationsunzipped/2020-08-20T18:12:28_1.csv
|███████████████████████████████████████⚠︎| (!) 9749799/10000001 [97%] in 1:09:48.0 (2328.04/s)                          
/content/opencitationsunzipped/2021-11-15T031921_4_1.csv
|███████████████████████████████████████▋⚠︎ (!) 