## **This notebook processes the Cora dataset to prepare it for analysis**

In [1]:
import pandas as pd
import numpy as np
import json
from processing_Cora import ProcessCora

In [2]:
# Create the ProcessCora helper (imported from processing_Cora). Every later
# cell calls its methods and reads its attributes through this one instance.
Cora_Processing = ProcessCora()

In [3]:
# Collect the paper IDs from the original Cora data. As the cell output shows,
# get_info_ids also prints its own node/edge count checks.
paper_ids = Cora_Processing.get_info_ids(Cora_Processing.ORIGINAL_CORA)

# Preview the five smallest IDs.
ordered_ids = sorted(paper_ids)
print(ordered_ids[:5])

Number of unique IDs equal to number of Nodes in Cora: True
Number of citations equal to number of Edges in Cora: True
[35, 40, 114, 117, 128]


In [4]:
# Different files, named after URLs, contain information about each paper (e.g., Title, Abstract).
# Build the paper-id -> filenames lookup and preview its first two entries; the
# preview prints single-key dicts of the form {paper_id: [filename, ...]}.
ids_to_filenames = Cora_Processing.get_filenames(Cora_Processing.FILENAMES_PATH)
print(ids_to_filenames[:2])

[{2: ['http:##dimacs.rutgers.edu#techps#1994#94-07.ps']}, {16: ['http:##www.cs.wisc.edu#~fischer#ftp#pub#tech-reports#ncstrl.uwmadison#CS-TR-90-907#CS-TR-90-907.ps.Z']}]


In [5]:
# Get only Cora papers.
# NOTE(review): presumably this keeps the ids_to_filenames entries whose id is
# in paper_ids -- confirm in processing_Cora. The call prints its own
# consistency checks before the two-entry preview below.
cora_ids = Cora_Processing.get_original_Cora(paper_ids, ids_to_filenames)
print(cora_ids[:2])

Equal number of ids and nodes -> True
Same ids as paper Ids -> True
[{35: ['http:##www.cis.ohio-state.edu#lair#TechReports#93-pa-ep93.ps', 'http:##www.cs.purdue.edu#coast#archive#clife#GA#docs#tcga91-2.ps.gz']}, {40: ['http:##www.bioele.nuee.nagoya-u.ac.jp#wec#papers#files#lee.ps.gz']}]


In [6]:
# Fix the URLs so they can be associated with the info files
# (replaces ':' with '_' in most cases), then preview two entries.
fixed_protocols = Cora_Processing.update_protocol(cora_ids)
print(fixed_protocols[:2])

# Update the dataset after looking for papers whose info files are missing.
# The call prints how many papers have info, how many are missing it, and the total.
new_dataset = Cora_Processing.check_missing_data(fixed_protocols)

[{35: ['http_##www.cis.ohio-state.edu#lair#TechReports#93-pa-ep93.ps', 'http_##www.cs.purdue.edu#coast#archive#clife#GA#docs#tcga91-2.ps.gz']}, {40: ['http_##www.bioele.nuee.nagoya-u.ac.jp#wec#papers#files#lee.ps.gz']}]
Papers with info: 2707
Papers missing info: 1
Total papers: 2708


In [7]:
# Extract the per-paper fields and assemble them into DataFrames.
extractions = Cora_Processing.ALL_EXTRACTIONS
paper_info = Cora_Processing.extract_paper_info2(new_dataset, extractions)  # ID, Title, Authors and Abstract

# Preview the first five extracted papers (a slice replaces the manual counter/break).
for paper_id, info in list(paper_info.items())[:5]:
    print(paper_id, info)

# IDs and URLs to DataFrame
ids_files_df = Cora_Processing.list_to_df(new_dataset)
print(f"\nNumber of records: {len(ids_files_df)}")
first_5_ids_files = ids_files_df.head(5).to_dict(orient='records')
for info in first_5_ids_files:
    print(info)

# Paper info to DataFrame
df_info = Cora_Processing.dict_to_df(paper_info)
print(f"\nNumber of records {len(df_info)} -> Few papers did not have info about their Title or Abstract so they are useless")
first_5_papers_info = df_info.head(5).to_dict(orient='records')
for info in first_5_papers_info:
    print(info)

# Merge both DataFrames:
merged_df = Cora_Processing.merge_dfs(ids_files_df, df_info)
print(f"\nMerged DataFrame; number of records: {len(merged_df)}")
first_5_merged = merged_df.head(5).to_dict(orient='records')
for row in first_5_merged:
    print(row)

# Sanity check: kept as the last expression so any value check_NA returns is
# still displayed by the notebook.
print()
Cora_Processing.check_NA(merged_df)

35 {'Title': 'Evolutionary Module Acquisition (1993) Coevolving high-level representations, Artificial Life (1989) Genetic Algorithms in Search, Optimization,', 'Author': 'Angeline and Pollack Angeline, P. and Pollack, J. III, Fogel, L., Owens, A., and Walsh, M. Goldberg, D. Davis editor, Morgan Kaufman. Holland, J. Jefferson, D., R. Collins, C. Cooper, M. Dyer, M. Flowers, R. Korf, C. Taylor, and A. Wang. edited by C. Langton, C. Taylor, J. Farmer and S.', 'Abstract': 'Angeline, P., Saunders, G. and Pollack, J. (1993) An evolutionary algorithm that constructs recurrent neural networks, LAIR Technical Report #93-PA-GNARLY, Submitted to IEEE Transactions on Neural Networks Special Issue on Evolutionary Programming.'}
40 {'Title': 'Dynamic Control of Genetic Algorithms using Fuzzy Logic Techniques', 'Author': 'Michael A. LEE Hideyuki TAKAGI', 'Abstract': 'This paper proposes using fuzzy logic techniques to dynamically control parameter settings of genetic algorithms (GAs). We describe th

In [8]:
# Look up each paper's category and show the first five (paper_id -> topic).
labels_df = Cora_Processing.get_labels(Cora_Processing.CLASS_PATH, merged_df)
first_5_labels = labels_df.head(5).to_dict(orient='records')
for label_record in first_5_labels:
    print(label_record['paper_id'], "->", label_record['topic'])

# Load the Bag of Words (BoW) representation and display its first rows.
BoW_df = Cora_Processing.get_Bow(Cora_Processing.BOW_CORA)
BoW_df.head()

35 -> Genetic_Algorithms
40 -> Genetic_Algorithms
114 -> Reinforcement_Learning
117 -> Reinforcement_Learning
128 -> Reinforcement_Learning


Unnamed: 0,paper_id,BoW,topic2
0,31336,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Neural_Networks
1,1061127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",Rule_Learning
2,1106406,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
3,13195,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
4,37879,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Probabilistic_Methods


In [9]:
# Merge for the finalized DataFrame: combine the labelled paper info (labels_df)
# with the Bag-of-Words table (BoW_df), then display the first ten rows.
final_df = Cora_Processing.merge_info_bow(labels_df, BoW_df)
final_df.head(10)

Unnamed: 0,paper_id,filenames,Title,Author,Abstract,BoW,topic2
0,35,[http_##www.cis.ohio-state.edu#lair#TechReport...,Evolutionary Module Acquisition (1993) Coevolv...,"Angeline and Pollack Angeline, P. and Pollack,...","Angeline, P., Saunders, G. and Pollack, J. (19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Genetic_Algorithms
1,40,[http_##www.bioele.nuee.nagoya-u.ac.jp#wec#pap...,Dynamic Control of Genetic Algorithms using Fu...,Michael A. LEE Hideyuki TAKAGI,This paper proposes using fuzzy logic techniqu...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Genetic_Algorithms
2,114,[ftp_##ftp.cs.colorado.edu#users#baveja#Papers...,Learning to Act using Real-Time Dynamic Progra...,Andrew G. Barto Steven J. Bradtke Satinder P. ...,"fl The authors thank Rich Yee, Vijay Gullapall...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
3,117,[http_##www.cs.duke.edu#~mlittman#docs#gmdp.ps],Generalized Markov Decision Processes: Dynamic...,Csaba Szepesvari Michael L. Littman,The problem of maximizing the expected total d...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
4,128,[ftp_##ftp.cs.colorado.edu#users#baveja#Papers...,Reinforcement Learning Algorithms for Average-...,Satinder P. Singh,Reinforcement learning (RL) has become a centr...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
5,130,[http_##www.cs.orst.edu#~tadepall#research#pap...,Scaling Up Average Reward Reinforcement Learni...,Prasad Tadepalli and DoKyeong Ok,Almost all the work in Average-reward Reinforc...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
6,164,[ftp_##theory.lcs.mit.edu#pub#people#oded#grs....,Learning polynomials with queries: The highly ...,ODED GOLDREICH RONITT RUBINFELD MADHU SUDAN,Given a function f mapping n-variate inputs fr...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Theory
7,288,[http_##www.cs.cmu.edu#afs#cs.cmu.edu#user#awd...,Memory Based Stochastic Optimization for Valid...,Artur Dubrawski and Jeff Schneider,This paper focuses on the optimization of hype...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning
8,424,[ftp_##ftp.cs.orst.edu#pub#tgd#papers#mlj-nge....,An Experimental Comparison of the Nearest-Neig...,DIETRICH WETTSCHERECK THOMAS G. DIETTERICH Edi...,Algorithms based on Nested Generalized Exempla...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Rule_Learning
9,434,[http_##www.cs.cmu.edu#afs#cs.cmu.edu#Web#Peop...,Learning Analytically and Inductively,Tom M. Mitchell Sebastian B. Thrun,Learning is a fundamental component of intelli...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reinforcement_Learning


In [10]:
# Store the finalized DataFrame as a .csv
# NOTE(review): the output path is decided inside ProcessCora.to_csv and is not
# visible in this notebook -- confirm where the file is written.
Cora_Processing.to_csv(final_df)