In [None]:
# dissertation for review generation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import json

def merge_files_in_folder(folder_path):
    merged_data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                        merged_data.append(data)
                except ValueError as e:
                    print(f"Error reading {file_path}: {e}")
    return merged_data

# Paths to the base folders
base_path = '/content/drive/MyDrive/Dissertation_review/dataset'  # Adjust this to the base directory containing iclr folders
folders = ['ICLR_2017', 'ICLR_2018', 'ICLR_2019', 'ICLR_2020', 'NIPS_2016', 'NIPS_2017', 'NIPS_2018', 'NIPS_2019']
subfolder_suffixes = ['content', 'paper', 'review']

# Dictionary to hold merged data for each subfolder
merged_data = {suffix: [] for suffix in subfolder_suffixes}

# Merge files in each subfolder across all conference/year folders
for folder in folders:
    for suffix in subfolder_suffixes:
        subfolder = f"{folder}_{suffix}"
        folder_path = os.path.join(base_path, folder, subfolder)
        merged_data[suffix].extend(merge_files_in_folder(folder_path))


In [None]:
# Save merged data to separate JSON files
output_path = '/content/drive/MyDrive/Dissertation_review/merged'  # Output directory to save merged files
os.makedirs(output_path, exist_ok=True)

for suffix in subfolder_suffixes:
    with open(os.path.join(output_path, f'merged_{suffix}.json'), 'w', encoding='utf-8') as f:
        json.dump(merged_data[suffix], f, ensure_ascii=True, indent=4)  # Changed ensure_ascii to True

print("Data merged and saved successfully into separate files.")


Data merged and saved successfully into separate files.


In [1]:
import pandas as pd
df_paper= pd.read_json("/content/drive/MyDrive/Dissertation_review/merged/merged_paper.json")

In [None]:
df_paper.head()

Unnamed: 0,id,conference,decision,url,hasContent,hasReview,title,authors
0,ICLR_2017_240,ICLR,Invite to Workshop Track,http://openreview.net/pdf/d77d5653197e356d0b65...,False,True,Development of JavaScript-based deep learning ...,"[Masatoshi Hidaka, Ken Miura, Tatsuya Harada]"
1,ICLR_2017_318,ICLR,Reject,http://openreview.net/pdf/fbfa9f4044a6f033361d...,False,True,Counterpoint by Convolution,"[Cheng-Zhi Anna Huang, Tim Cooijmans, Adam Rob..."
2,ICLR_2017_381,ICLR,Reject,http://openreview.net/pdf/63b92f7ce8708e9df35d...,True,True,Multi-task learning with deep model based rein...,[Asier Mujika]
3,ICLR_2017_211,ICLR,Invite to Workshop Track,http://openreview.net/pdf/5c54afd2ea867c72b8ae...,True,True,Nonparametrically Learning Activation Function...,"[Carson Eisenach, Zhaoran Wang, Han Liu]"
4,ICLR_2017_200,ICLR,Invite to Workshop Track,http://openreview.net/pdf/e2d348d16ad96d19425e...,True,True,Discovering objects and their relations from e...,"[David Raposo, Adam Santoro, David Barrett, Ra..."


In [None]:
df_paper.describe()

Unnamed: 0,id,conference,decision,url,hasContent,hasReview,title,authors
count,8938,8938,8938,8938,8938,8938,8919.0,8938
unique,8877,2,7,8877,2,2,8824.0,8596
top,ICLR_2019_174,ICLR,Accept,http://openreview.net/pdf/02befcff74b4bc496cdf...,true,true,,"[Ahmed M. Alaa, Mihaela van der Schaar]"
freq,2,5226,3712,2,8907,8840,6.0,6


In [None]:
df_paper.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8938 entries, 0 to 8937
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          8938 non-null   object
 1   conference  8938 non-null   object
 2   decision    8938 non-null   object
 3   url         8938 non-null   object
 4   hasContent  8938 non-null   object
 5   hasReview   8938 non-null   object
 6   title       8919 non-null   object
 7   authors     8938 non-null   object
dtypes: object(8)
memory usage: 558.8+ KB


In [2]:
df_review=pd.read_json("/content/drive/MyDrive/Dissertation_review/merged/merged_review.json")
df_review.head()

Unnamed: 0,id,reviews,metaReview
0,ICLR_2017_65,[{'review': 'The authors propose NVI for LDA v...,
1,ICLR_2017_315,[{'review': 'This paper presents an improved f...,
2,ICLR_2017_219,[{'review': 'This work focuses on conditional ...,
3,ICLR_2017_397,[{'review': 'The proposed approach consists in...,
4,ICLR_2017_171,"[{'review': 'After the discussion below, I loo...",


In [None]:
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8829 entries, 0 to 8828
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          8829 non-null   object
 1   reviews     8829 non-null   object
 2   metaReview  5952 non-null   object
dtypes: object(3)
memory usage: 207.1+ KB


In [None]:
df_review['metaReview']

0                                                     NaN
1                                                     NaN
2                                                     NaN
3                                                     NaN
4                                                     NaN
                              ...                        
8824    The paper proposes a method for stopping unnec...
8825    This paper proposes two algorithms for recover...
8826    After discussion and the author response, the ...
8827    The paper presents HAAR - a hierarchical reinf...
8828    After discussions among the reviewers, they al...
Name: metaReview, Length: 8829, dtype: object

In [None]:
df_review['metaReview'].iloc[8826]

'After discussion and the author response, the reviewers landed on a consensus that the paper would be a good addition to the NeurIPS program. The author feedback provides a more thorough analysis of the experiments, which should be included in the final version.'

In [None]:
df_review['reviews'].iloc[8826]

[{'review': 'Originality: As far as I know, this is the first paper to study the theory of online Markov decoding rigorously.  Quality: The results our sound.  Clarity: The paper is well-written overall. However the lower bounds are in a form that makes it a bit hard to compare them with the upper bounds.  Significance: Markov decoding is an important scientific tool. Designing fast algorithms for this problem is imperative in my humble opinion.  --- Edit --- Lowered score following comments from other reviewers.'},

In [3]:
# merging review and paper on ID
df_paper_review=df_paper.merge(df_review,on='id')
df_paper_review.head()

Unnamed: 0,id,conference,decision,url,hasContent,hasReview,title,authors,reviews,metaReview
0,ICLR_2017_240,ICLR,Invite to Workshop Track,http://openreview.net/pdf/d77d5653197e356d0b65...,False,True,Development of JavaScript-based deep learning ...,"[Masatoshi Hidaka, Ken Miura, Tatsuya Harada]",[{'review': 'Validity: The presented work seem...,
1,ICLR_2017_318,ICLR,Reject,http://openreview.net/pdf/fbfa9f4044a6f033361d...,False,True,Counterpoint by Convolution,"[Cheng-Zhi Anna Huang, Tim Cooijmans, Adam Rob...","[{'review': 'This paper proposed COCONET, whic...",
2,ICLR_2017_381,ICLR,Reject,http://openreview.net/pdf/63b92f7ce8708e9df35d...,True,True,Multi-task learning with deep model based rein...,[Asier Mujika],[{'review': 'The term strategy is a bit ambigu...,
3,ICLR_2017_211,ICLR,Invite to Workshop Track,http://openreview.net/pdf/5c54afd2ea867c72b8ae...,True,True,Nonparametrically Learning Activation Function...,"[Carson Eisenach, Zhaoran Wang, Han Liu]",[{'review': 'This paper provides a principled ...,
4,ICLR_2017_200,ICLR,Invite to Workshop Track,http://openreview.net/pdf/e2d348d16ad96d19425e...,True,True,Discovering objects and their relations from e...,"[David Raposo, Adam Santoro, David Barrett, Ra...",[{'review': '+ Understanding relations between...,


In [None]:
import os
import json
import re

def clean_json_string(json_string):
    # Remove any unpaired surrogates
    cleaned_string = re.sub(r'[\ud800-\udbff](?![\udc00-\udfff])|(?<![\ud800-\udbff])[\udc00-\udfff]', '', json_string)
    return cleaned_string

def merge_files_in_folder(folder_path):
    merged_data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        file_content = f.read()
                        # Clean the file content to remove problematic characters
                        cleaned_content = clean_json_string(file_content)
                        try:
                            data = json.loads(cleaned_content)
                            if isinstance(data, list):
                                merged_data.extend(data)
                            else:
                                merged_data.append(data)
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON from file {file_path}: {e}")
                except UnicodeDecodeError as e:
                    print(f"Error reading {file_path}: {e}")
    return merged_data

# Paths to the base folders
base_path = '/content/drive/MyDrive/Dissertation_review/dataset'
folders = ['ICLR_2017', 'ICLR_2018','ICLR_2019','ICLR_2020', 'NIPS_2016', 'NIPS_2017', 'NIPS_2018', 'NIPS_2019']
subfolder_suffix = 'content'

# List to hold merged data for the content subfolder
merged_content_data = []

# Merge files in the content subfolder across all conference/year folders
for folder in folders:
    subfolder = f"{folder}_{subfolder_suffix}"
    folder_path = os.path.join(base_path, folder, subfolder)
    merged_content_data.extend(merge_files_in_folder(folder_path))

# Save merged data to a JSON file
output_path = '/content/drive/MyDrive/Dissertation_review/merged'  # Output directory to save merged files
os.makedirs(output_path, exist_ok=True)

# Custom JSON encoder to handle problematic characters
class CustomJSONEncoder(json.JSONEncoder):
    def encode(self, obj):
        json_str = super().encode(obj)
        # Replace problematic characters with a safe placeholder
        safe_json_str = re.sub(r'[\ud800-\udfff]', '\ufffd', json_str)
        return safe_json_str

with open(os.path.join(output_path, 'merged_content.json'), 'w', encoding='utf-8', errors='ignore') as f:
    json.dump(merged_content_data, f, ensure_ascii=False, indent=4, cls=CustomJSONEncoder)

print("Content data merged and saved successfully.")


Content data merged and saved successfully.


In [4]:
df_content=pd.read_json("/content/drive/MyDrive/Dissertation_review/merged/merged_content.json")
df_content.head()

Unnamed: 0,name,metadata,id
0,ICLR_2017_373.pdf,"{'source': 'CRF', 'title': None, 'authors': ['...",ICLR_2017_373
1,ICLR_2017_160.pdf,"{'source': 'CRF', 'title': 'TRAINING DEEP NEUR...",ICLR_2017_160
2,ICLR_2017_345.pdf,"{'source': 'CRF', 'title': 'CONTEXT-CONDITIONA...",ICLR_2017_345
3,ICLR_2017_260.pdf,"{'source': 'CRF', 'title': None, 'authors': ['...",ICLR_2017_260
4,ICLR_2017_108.pdf,"{'source': 'CRF', 'title': None, 'authors': ['...",ICLR_2017_108


In [None]:
df_content['metadata'].iloc[1]

{'source': 'CRF',
 'title': 'TRAINING DEEP NEURAL-NETWORKS USING A NOISE ADAPTATION LAYER',
 'authors': ['Jacob Goldberger', 'Ehud Ben-Reuven'],
 'emails': [],
 'sections': [{'heading': '1 INTRODUCTION',
   'text': 'The presence of class label noise inherent to training samples has been reported to deteriorate the performance of even the best classifiers in a broad range of classification problems (Nettleton et al. (2010), Pechenizkiy et al. (2006), Zhu & Wu (2004)). Noisy labels also tend to be more harmful than noisy attributes (Zhu & Wu (2004)). Noisy data are usually related to the data collection process. Typically, the labels used to train a classifier are assumed to be unambiguous and accurate. However, this assumption often does not hold since labels that are provided by human judgments are subjective. Many of the largest image datasets have been extracted from social networks. These images are labeled by non-expert users and building a consistent model based on a precisely lab

In [5]:
df_content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8854 entries, 0 to 8853
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      8854 non-null   object
 1   metadata  8854 non-null   object
 2   id        8853 non-null   object
dtypes: object(3)
memory usage: 207.6+ KB


In [6]:
# merging df_paper_review with df_content on ID
df_paper_review_content=df_paper_review.merge(df_content,on='id')
df_paper_review_content.head()

Unnamed: 0,id,conference,decision,url,hasContent,hasReview,title,authors,reviews,metaReview,name,metadata
0,ICLR_2017_381,ICLR,Reject,http://openreview.net/pdf/63b92f7ce8708e9df35d...,True,True,Multi-task learning with deep model based rein...,[Asier Mujika],[{'review': 'The term strategy is a bit ambigu...,,ICLR_2017_381.pdf,"{'source': 'CRF', 'title': 'MULTI-TASK LEARNIN..."
1,ICLR_2017_211,ICLR,Invite to Workshop Track,http://openreview.net/pdf/5c54afd2ea867c72b8ae...,True,True,Nonparametrically Learning Activation Function...,"[Carson Eisenach, Zhaoran Wang, Han Liu]",[{'review': 'This paper provides a principled ...,,ICLR_2017_211.pdf,"{'source': 'CRF', 'title': 'NONPARAMETRICALLY ..."
2,ICLR_2017_200,ICLR,Invite to Workshop Track,http://openreview.net/pdf/e2d348d16ad96d19425e...,True,True,Discovering objects and their relations from e...,"[David Raposo, Adam Santoro, David Barrett, Ra...",[{'review': '+ Understanding relations between...,,ICLR_2017_200.pdf,"{'source': 'CRF', 'title': 'DISCOVERING OBJECT..."
3,ICLR_2017_106,ICLR,Accept (Poster),http://openreview.net/pdf/5ab63afda67c68cd39a6...,True,True,"Attend, Adapt and Transfer: Attentive Deep Arc...","[Janarthanan Rajendran, Aravind Lakshminarayan...",[{'review': 'In this paper a well known soft m...,,ICLR_2017_106.pdf,"{'source': 'CRF', 'title': 'TRANSFER FROM MULT..."
4,ICLR_2017_64,ICLR,Accept (Poster),http://openreview.net/pdf/2cb01001f2f89ca11252...,True,True,Training Compressed Fully-Connected Networks w...,"[Shengjie Wang, Haoran Cai, Jeff Bilmes, Willi...",[{'review': 'The method proposes to compress t...,,ICLR_2017_64.pdf,"{'source': 'CRF', 'title': None, 'authors': ['..."


In [8]:
df_paper_review_content.to_json('/content/drive/MyDrive/Dissertation_review/merged/final_data-review.json')

In [None]:
df_paper_review_content.reindex(columns=['id',	'conference',	'decision', 'url',	'hasContent',	'hasReview',	'title',	'authors', 'name',	'metadata', 'reviews',	'metaReview',])

Unnamed: 0,id,conference,decision,url,hasContent,hasReview,title,authors,name,metadata,reviews,metaReview
0,ICLR_2017_381,ICLR,Reject,http://openreview.net/pdf/63b92f7ce8708e9df35d...,true,true,Multi-task learning with deep model based rein...,[Asier Mujika],ICLR_2017_381.pdf,"{'source': 'CRF', 'title': 'MULTI-TASK LEARNIN...",[{'review': 'The term strategy is a bit ambigu...,
1,ICLR_2017_211,ICLR,Invite to Workshop Track,http://openreview.net/pdf/5c54afd2ea867c72b8ae...,true,true,Nonparametrically Learning Activation Function...,"[Carson Eisenach, Zhaoran Wang, Han Liu]",ICLR_2017_211.pdf,"{'source': 'CRF', 'title': 'NONPARAMETRICALLY ...",[{'review': 'This paper provides a principled ...,
2,ICLR_2017_200,ICLR,Invite to Workshop Track,http://openreview.net/pdf/e2d348d16ad96d19425e...,true,true,Discovering objects and their relations from e...,"[David Raposo, Adam Santoro, David Barrett, Ra...",ICLR_2017_200.pdf,"{'source': 'CRF', 'title': 'DISCOVERING OBJECT...",[{'review': '+ Understanding relations between...,
3,ICLR_2017_106,ICLR,Accept (Poster),http://openreview.net/pdf/5ab63afda67c68cd39a6...,true,true,"Attend, Adapt and Transfer: Attentive Deep Arc...","[Janarthanan Rajendran, Aravind Lakshminarayan...",ICLR_2017_106.pdf,"{'source': 'CRF', 'title': 'TRANSFER FROM MULT...",[{'review': 'In this paper a well known soft m...,
4,ICLR_2017_64,ICLR,Accept (Poster),http://openreview.net/pdf/2cb01001f2f89ca11252...,true,true,Training Compressed Fully-Connected Networks w...,"[Shengjie Wang, Haoran Cai, Jeff Bilmes, Willi...",ICLR_2017_64.pdf,"{'source': 'CRF', 'title': None, 'authors': ['...",[{'review': 'The method proposes to compress t...,
...,...,...,...,...,...,...,...,...,...,...,...,...
8609,NIPS_2019_1016,NIPS,Accept,http://papers.nips.cc/paper/9310-privacy-prese...,true,true,Privacy-Preserving Q-Learning with Functional ...,"[Baoxiang Wang, Nidhi Hegde]",NIPS_2019_1016.pdf,"{'source': 'CRF', 'title': 'Privacy-preserving...",[{'review': 'The authors have significantly im...,This paper proposes a differentially private Q...
8610,NIPS_2019_181,NIPS,Accept,http://papers.nips.cc/paper/8476-numerically-a...,true,true,Numerically Accurate Hyperbolic Embeddings Usi...,"[Tao Yu, Christopher M. De Sa]",NIPS_2019_181.pdf,"{'source': 'META', 'title': 'Numerically Accur...",[{'review': 'This paper provides a novel and i...,There is broad agreement on the elegance and c...
8611,NIPS_2019_1328,NIPS,Accept,http://papers.nips.cc/paper/9622-algorithmic-g...,true,true,Algorithmic Guarantees for Inverse Imaging wit...,"[Gauri Jagatap, Chinmay Hegde]",NIPS_2019_1328.pdf,"{'source': 'META', 'title': 'Algorithmic Guara...",[{'review': 'This paper presents compressed se...,This paper describes new theory associated wit...
8612,NIPS_2019_51,NIPS,Accept,http://papers.nips.cc/paper/8346-cpm-nets-cros...,true,true,CPM-Nets: Cross Partial Multi-View Networks,"[Changqing Zhang, Zongbo Han, yajie cui, Huazh...",NIPS_2019_51.pdf,"{'source': 'CRF', 'title': 'CPM-Nets: Cross Pa...",[{'review': 'The authors picked a relevant pro...,The reviewers all agree that the proposed meth...


In [None]:
df_paper_review_content.to_csv('final_data-review.csv', index=False)