In [1]:
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd

In [3]:
import networkx as nx

In [4]:
# new CORA dataset
# https://web.archive.org/web/20151007064508/http://linqs.cs.umd.edu/projects/projects/lbc/


In [5]:
new_folder = Path.home()/Path(r"Downloads\test")
old_folder = Path.home()/Path(r"Downloads\cora2")
if not new_folder.exists():
    new_folder = Path.home()/Path(r"Desktop\data\cora1")
    old_folder = Path.home()/Path(r"Desktop\data\cora-classify.tar\cora-classify\cora")

In [6]:
new_content = new_folder/"cora.content"
new_content.is_file()

True

In [7]:
old_classifications = old_folder/"classifications"
old_citations = old_folder/"citations"

In [8]:
df_new = pd.read_csv(new_content, delimiter="\t", header=None)
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Case_Based


In [9]:
df_new2 = df_new[[0, 1434]]
df_new2.columns = ["int_id", "category"]
df_new2.index = df_new2["int_id"]
df_new2 = df_new2.copy()
df_new2.drop("int_id", axis=1, inplace=True)
df_new2

Unnamed: 0_level_0,category
int_id,Unnamed: 1_level_1
31336,Neural_Networks
1061127,Rule_Learning
1106406,Reinforcement_Learning
13195,Reinforcement_Learning
37879,Probabilistic_Methods
...,...
1128975,Genetic_Algorithms
1128977,Genetic_Algorithms
1128978,Genetic_Algorithms
117328,Case_Based


In [10]:
df_new[1434].value_counts()

1434
Neural_Networks           818
Probabilistic_Methods     426
Genetic_Algorithms        418
Theory                    351
Case_Based                298
Reinforcement_Learning    217
Rule_Learning             180
Name: count, dtype: int64

In [11]:
old_classes = pd.read_csv(old_classifications, delimiter="\t", header=None)
old_classes.columns =["id", "category"]
old_classes.drop(old_classes.tail(1).index, inplace=True)

In [12]:
tmp = old_classes["category"].str.contains("Artificial_Intelligence")

In [13]:
df2 = old_classes[tmp]

In [14]:
df2["category"].value_counts()

category
/Artificial_Intelligence/Machine_Learning/Neural_Networks/           1473
/Artificial_Intelligence/Vision_and_Pattern_Recognition/             1327
/Artificial_Intelligence/Robotics/                                   1039
/Artificial_Intelligence/Agents/                                      827
/Artificial_Intelligence/Planning/                                    798
/Artificial_Intelligence/Machine_Learning/Probabilistic_Methods/      687
/Artificial_Intelligence/Machine_Learning/Genetic_Algorithms/         670
/Artificial_Intelligence/Games_and_Search/                            642
/Artificial_Intelligence/Machine_Learning/Theory/                     573
/Artificial_Intelligence/NLP/                                         562
/Artificial_Intelligence/Theorem_Proving/                             552
/Artificial_Intelligence/Machine_Learning/Case-Based/                 529
/Artificial_Intelligence/Speech/                                      513
/Artificial_Intelligence/Mach

In [15]:
def is_valid_id(id):
    if tpl.id in ["keywords", "http:##sal.cs.uiuc.edu#~ray#song97a.ps", ".include", ".exclude"]:
        return False
    if any(val in tpl.id for val in ["crhc.uiuc.edu", "tesla.csl.uiuc.edu", "ai.uiuc.edu"]):
        return False
    return True

In [16]:
for tpl in df2.itertuples():
    if not is_valid_id(tpl.id):
        continue
    potential_filename = tpl.id.replace(":", "_")
    if not (old_folder/"extractions"/potential_filename).is_file():
        print(tpl)
        print( potential_filename)
        break

In [17]:
all_files = list((old_folder/"extractions").glob('**/*'))

In [18]:
[file for file in all_files if "ai.uiuc.edu" in str(file)]

[]

In [19]:
other_properties_dict = {}
for tpl in df2.itertuples():
    if not is_valid_id(tpl.id):
        continue
    potential_filename = tpl.id.replace(":", "_")
    with open(old_folder/"extractions"/potential_filename) as file:
        other_properties_dict[tpl.id] = [line.rstrip() for line in file]
    

In [20]:
list(other_properties_dict.keys())[:10]

['http:##www.cc.gatech.edu#faculty#ashwin#papers#er-91-02.ps.Z',
 'http:##s2k-ftp.cs.berkeley.edu:8000#sequoia#tech-reports#s2k-92-9#s2k-92-09.ps.Z',
 'http:##www.cs.helsinki.fi#~oheinone#publications#Mining_in_the_Phrasal_Frontier_PKDD.ps.gz',
 'http:##www.cs.cmu.edu#afs#cs#user#alex#docs#idvl#bntuw98#sdr#bnslt98f.ps',
 'http:##www.cs.rice.edu#~andras#ECAI#kosk.ps',
 'http:##www.ccs.neu.edu#home#natasha#papers#AAAI-98.ps',
 'http:##www.cs.cornell.edu#home#cardie#papers#scott-a-exam.ps',
 'ftp:##ftp.cs.utexas.edu#pub#boyer#ics-reports#cmp33.ps.Z',
 'http:##www.iscs.nus.sg#~plong#papers#cgc.ps',
 'http:##ebbets.poly.edu#hstein#pubs#brof.ps']

In [21]:
records = []
for key, lines in other_properties_dict.items():
    d = {"id" : key, "Title:" : None, "Abstract:" : None}
    for term in ["Title:", "Abstract:"]:
        for line in lines:
            if line.startswith("Reference:") or line.startswith("Reference-contexts:"):
                continue
            if line.startswith(term):
                d[term]=line[len(term):]
    records.append(d)
        

In [22]:
df_abstract = pd.DataFrame.from_records(records)
old_classes
df_abstract

Unnamed: 0,id,Title:,Abstract:
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...
1,http:##s2k-ftp.cs.berkeley.edu:8000#sequoia#te...,Automatic Acquisition of Hyponyms from Large ...,We describe a method for the automatic acquis...
2,http:##www.cs.helsinki.fi#~oheinone#publicatio...,Mining in the Phrasal Frontier,Data mining methods have been applied to a wi...
3,http:##www.cs.cmu.edu#afs#cs#user#alex#docs#id...,EXPERIMENTS IN INFORMATION RETRIEVAL FROM SPO...,This paper describes the experiments performe...
4,http:##www.cs.rice.edu#~andras#ECAI#kosk.ps,Finite-state morphology and information retri...,A source of potential systematic errors in in...
...,...,...,...
11721,http:##zen.efs.mq.edu.au:80#~akozek#GAMBL.ps,A Rule of Thumb (not only) for Gamblers,Let prize X in a game be a random variable wi...
11722,http:##zen.efs.mq.edu.au:80#~akozek#NoLoEss.ps,A New Nonparametic Estimation Method: Local a...,In the paper we consider a new class (called ...
11723,http:##zen.efs.mq.edu.au:80#~akozek#mdkl.ps,On Minimum Distance Estimation using Kolmogor...,
11724,http:##zen.efs.mq.edu.au:80#~akozek#nwsl.ps,On a Universal Strong Law of Large Numbers fo...,


In [23]:
old_classes.index = old_classes.id
old_classes2 = old_classes.drop("id", axis=1)

## Joining in the classes

In [24]:
df_abstract = df_abstract.join(old_classes2, on="id")

## Joining in the int_ids

In [25]:
df_papers = pd.read_csv(old_folder/"papers", delimiter="\t", header=None)
df_papers.columns = ["int_id", "id", "author"]
df_papers.index = df_papers.id

In [26]:
df_papers

Unnamed: 0_level_0,int_id,id,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http:##dimacs.rutgers.edu#techps#1994#94-07.ps,2,http:##dimacs.rutgers.edu#techps#1994#94-07.ps,"[Gar] <author> M.R. Garey & D.S. Johnson, </au..."
http:##www.cs.wisc.edu#~fischer#ftp#pub#tech-reports#ncstrl.uwmadison#CS-TR-90-907#CS-TR-90-907.ps.Z,16,http:##www.cs.wisc.edu#~fischer#ftp#pub#tech-r...,"[DeWitt90] <author> D. DeWitt, P. Futtersack, ..."
ftp:##ftp.cs.purdue.edu#pub#hosking#papers#oopsla93.ps.gz,18,ftp:##ftp.cs.purdue.edu#pub#hosking#papers#oop...,"[Hoski93a] <author> A. Hosking, J. E. B. Moss,..."
ftp:##ftp.cs.umass.edu#pub#osl#papers#oopsla93.ps.Z,18,ftp:##ftp.cs.umass.edu#pub#osl#papers#oopsla93...,"[Hoski93a] <author> A. Hosking, J. E. B. Moss,..."
http:##cobar.cs.umass.edu#pubfiles#ds7.ps.gz,18,http:##cobar.cs.umass.edu#pubfiles#ds7.ps.gz,"[Hoski93a] <author> A. Hosking, J. E. B. Moss,..."
...,...,...,...
http:##www.cs.man.ac.uk#aig#staff#roger#pubs#darmstadt.ps.Z,1155664,http:##www.cs.man.ac.uk#aig#staff#roger#pubs#d...,
http:##www.cs.uni-bonn.de#III#lehre#seminare#Handlungsplanung#WS96#AIM-1293.ps.gz,1155665,http:##www.cs.uni-bonn.de#III#lehre#seminare#H...,
http:##www.daimi.aau.dk#~beta#Papers#Train#train-slides.ps.Z,1155666,http:##www.daimi.aau.dk#~beta#Papers#Train#tra...,
http:##www.eecis.udel.edu#~bao#psfile#chapter3.ps.gz,1155667,http:##www.eecis.udel.edu#~bao#psfile#chapter3...,


In [27]:
df_abstract = df_abstract.join(df_papers.drop("id",axis=1), on="id")


In [28]:
df_abstract

Unnamed: 0,id,Title:,Abstract:,category,int_id,author
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...,/Artificial_Intelligence/Machine_Learning/Case...,129558.0,"[Ram and Hunter, 1991] <author> A. Ram and L. ..."
1,http:##s2k-ftp.cs.berkeley.edu:8000#sequoia#te...,Automatic Acquisition of Hyponyms from Large ...,We describe a method for the automatic acquis...,/Artificial_Intelligence/NLP/,101929.0,"[33] <author> Hearst, M.A. </author> <year> (1..."
2,http:##www.cs.helsinki.fi#~oheinone#publicatio...,Mining in the Phrasal Frontier,Data mining methods have been applied to a wi...,/Artificial_Intelligence/Data_Mining/,262323.0,"[AHKV97] <author> Helena Ahonen, Oskari Heinon..."
3,http:##www.cs.cmu.edu#afs#cs#user#alex#docs#id...,EXPERIMENTS IN INFORMATION RETRIEVAL FROM SPO...,This paper describes the experiments performe...,/Artificial_Intelligence/Speech/,1103045.0,
4,http:##www.cs.rice.edu#~andras#ECAI#kosk.ps,Finite-state morphology and information retri...,A source of potential systematic errors in in...,/Artificial_Intelligence/NLP/,1112801.0,
...,...,...,...,...,...,...
11721,http:##zen.efs.mq.edu.au:80#~akozek#GAMBL.ps,A Rule of Thumb (not only) for Gamblers,Let prize X in a game be a random variable wi...,/Artificial_Intelligence/Machine_Learning/Theory/,1121513.0,
11722,http:##zen.efs.mq.edu.au:80#~akozek#NoLoEss.ps,A New Nonparametic Estimation Method: Local a...,In the paper we consider a new class (called ...,/Artificial_Intelligence/Machine_Learning/Prob...,119952.0,"<author> Kozek, A. S. </author> <year> (1992)...."
11723,http:##zen.efs.mq.edu.au:80#~akozek#mdkl.ps,On Minimum Distance Estimation using Kolmogor...,,/Artificial_Intelligence/Machine_Learning/Prob...,1103455.0,
11724,http:##zen.efs.mq.edu.au:80#~akozek#nwsl.ps,On a Universal Strong Law of Large Numbers fo...,,/Artificial_Intelligence/Machine_Learning/Prob...,1123754.0,


## Join citation counts

In [29]:
df_citations = pd.read_csv(old_folder/"citations", delimiter="\t", header=None)
df_citations.columns=["referring_int_id", "cited_int_id"]

In [30]:
df_citations

Unnamed: 0,referring_int_id,cited_int_id
0,172005,0
1,172005,1
2,172005,2
3,172005,3
4,172005,4
...,...,...
714261,1102288,1102284
714262,1102288,37258
714263,1102288,66922
714264,1102288,1102301


In [31]:
tmp = df_citations.referring_int_id.value_counts()
tmp.name = "cites_count"
tmp

referring_int_id
1145049    2813
178718     2060
1144833     495
28418       458
75736       458
           ... 
1111752       1
1145304       1
1021601       1
1119089       1
44537         1
Name: cites_count, Length: 35788, dtype: int64

In [32]:
df_abstract = df_abstract.join(tmp, on="int_id")

In [33]:
tmp2 = df_citations.cited_int_id.value_counts()
tmp2.name = "cited_count"
tmp2

cited_int_id
2          929
1652       598
35         455
4159       448
1384       432
          ... 
45306        1
528926       1
528922       1
528887       1
1102301      1
Name: cited_count, Length: 208223, dtype: int64

In [34]:
df_abstract = df_abstract.join(tmp2, on="int_id")

In [35]:
df_abstract

Unnamed: 0,id,Title:,Abstract:,category,int_id,author,cites_count,cited_count
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...,/Artificial_Intelligence/Machine_Learning/Case...,129558.0,"[Ram and Hunter, 1991] <author> A. Ram and L. ...",12.0,2.0
1,http:##s2k-ftp.cs.berkeley.edu:8000#sequoia#te...,Automatic Acquisition of Hyponyms from Large ...,We describe a method for the automatic acquis...,/Artificial_Intelligence/NLP/,101929.0,"[33] <author> Hearst, M.A. </author> <year> (1...",17.0,2.0
2,http:##www.cs.helsinki.fi#~oheinone#publicatio...,Mining in the Phrasal Frontier,Data mining methods have been applied to a wi...,/Artificial_Intelligence/Data_Mining/,262323.0,"[AHKV97] <author> Helena Ahonen, Oskari Heinon...",9.0,1.0
3,http:##www.cs.cmu.edu#afs#cs#user#alex#docs#id...,EXPERIMENTS IN INFORMATION RETRIEVAL FROM SPO...,This paper describes the experiments performe...,/Artificial_Intelligence/Speech/,1103045.0,,11.0,
4,http:##www.cs.rice.edu#~andras#ECAI#kosk.ps,Finite-state morphology and information retri...,A source of potential systematic errors in in...,/Artificial_Intelligence/NLP/,1112801.0,,9.0,
...,...,...,...,...,...,...,...,...
11721,http:##zen.efs.mq.edu.au:80#~akozek#GAMBL.ps,A Rule of Thumb (not only) for Gamblers,Let prize X in a game be a random variable wi...,/Artificial_Intelligence/Machine_Learning/Theory/,1121513.0,,6.0,
11722,http:##zen.efs.mq.edu.au:80#~akozek#NoLoEss.ps,A New Nonparametic Estimation Method: Local a...,In the paper we consider a new class (called ...,/Artificial_Intelligence/Machine_Learning/Prob...,119952.0,"<author> Kozek, A. S. </author> <year> (1992)....",14.0,1.0
11723,http:##zen.efs.mq.edu.au:80#~akozek#mdkl.ps,On Minimum Distance Estimation using Kolmogor...,,/Artificial_Intelligence/Machine_Learning/Prob...,1103455.0,,23.0,
11724,http:##zen.efs.mq.edu.au:80#~akozek#nwsl.ps,On a Universal Strong Law of Large Numbers fo...,,/Artificial_Intelligence/Machine_Learning/Prob...,1123754.0,,19.0,


In [36]:
df1 = df_abstract.dropna()

## Visualize citations

In [37]:
edges = [(tpl.referring_int_id, tpl.cited_int_id) for tpl in df_citations.itertuples()]

In [38]:
G = nx.DiGraph(edges)

In [39]:
df1 = df1[df1["category"].str.contains("Artificial_Intelligence/Machine_Learning")]

In [40]:
G_sub = G.subgraph(df1.int_id)

In [41]:
out_degrees = [(key, value) for key, value in G_sub.out_degree]
in_degrees = [(key, value) for key, value in G_sub.in_degree]

In [42]:
df_out_degree = pd.DataFrame.from_records(out_degrees, columns=["int_id", "out_degree"])

df_in_degree = pd.DataFrame.from_records(in_degrees, columns=["int_id", "in_degree"])
df_in_degree.index = df_in_degree.int_id
df_in_degree.drop("int_id", axis=1, inplace=True)
df_in_out_degree = df_out_degree.join(df_in_degree, on="int_id")
df_in_out_degree.index = df_in_out_degree.int_id

df_in_out_degree.drop("int_id", axis=1, inplace=True)
df_in_out_degree

Unnamed: 0_level_0,out_degree,in_degree
int_id,Unnamed: 1_level_1,Unnamed: 2_level_1
851968.0,2,0
368657.0,2,2
253971.0,0,0
8213.0,2,3
696342.0,3,0
...,...,...
368605.0,2,0
131042.0,2,3
262121.0,0,0
73712.0,0,1


In [43]:
if "author" in df_abstract.columns:
    df_abstract = df_abstract.drop("author", axis=1)

In [44]:
df1 = df_abstract.drop_duplicates(subset="int_id")
df1 = df1[df1["category"].str.contains("Artificial_Intelligence/Machine_Learning")]

In [45]:
df4 = df1.join(df_in_out_degree, on="int_id")

In [46]:
df4["help"] = np.logical_or(df4.in_degree>0, df4.out_degree>0 )

In [47]:
df3 = df4[~np.logical_or(df1["cites_count"].isnull(),  df1["cited_count"].isnull()) ]
df3

Unnamed: 0,id,Title:,Abstract:,category,int_id,cites_count,cited_count,out_degree,in_degree,help
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...,/Artificial_Intelligence/Machine_Learning/Case...,129558.0,12.0,2.0,3.0,0.0,True
8,http:##www.iscs.nus.sg#~plong#papers#cgc.ps,Composite Geometric Concepts and Polynomial P...,,/Artificial_Intelligence/Machine_Learning/Theory/,42221.0,25.0,8.0,,,False
9,http:##ebbets.poly.edu#hstein#pubs#brof.ps,Learning Boolean Read-Once Formulas over Gene...,A read-once formula is one in which each vari...,/Artificial_Intelligence/Machine_Learning/Theory/,95225.0,16.0,6.0,5.0,3.0,True
12,http:##www.aic.nrl.navy.mil#~spears#papers#sia...,A COMPRESSION ALGORITHM FOR PROBABILITY TRANS...,This paper describes a compression algorithm ...,/Artificial_Intelligence/Machine_Learning/Gene...,447224.0,11.0,1.0,3.0,1.0,True
13,ftp:##lumpi.informatik.uni-dortmund.de#pub#bio...,Complexity Compression and Evolution,Compression of information is an important co...,/Artificial_Intelligence/Machine_Learning/Gene...,94641.0,4.0,11.0,1.0,8.0,True
...,...,...,...,...,...,...,...,...,...,...
11709,http:##www.wjh.harvard.edu#~hasselmo#NIPShasse...,Cholinergic suppression of transmission may a...,Selective suppression of transmission at feed...,/Artificial_Intelligence/Machine_Learning/Neur...,1365.0,12.0,156.0,0.0,43.0,True
11710,http:##www.wpi.edu#~mhchen#normc1.ps,Estimating Ratios of Normalizing Constants fo...,"In Bayesian inference, a Bayes factor is defi...",/Artificial_Intelligence/Machine_Learning/Prob...,643003.0,32.0,1.0,1.0,0.0,True
11717,http:##wwwipd.ira.uka.de#~prechelt#Biblio#mppm...,A Parallel Programming Model for Irregular Dy...,A compiler for CuPit has been built for the M...,/Artificial_Intelligence/Machine_Learning/Neur...,60169.0,7.0,1.0,3.0,1.0,True
11718,http:##wwwipd.ira.uka.de#~prechelt#Biblio#neur...,Investigation of the CasCor Family of Learnin...,Six learning algorithms are investigated and ...,/Artificial_Intelligence/Machine_Learning/Neur...,986996.0,22.0,1.0,2.0,1.0,True


In [48]:
len(df3["int_id"].unique())

2409

In [49]:
df3[~df3["Abstract:"].isnull()]

Unnamed: 0,id,Title:,Abstract:,category,int_id,cites_count,cited_count,out_degree,in_degree,help
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...,/Artificial_Intelligence/Machine_Learning/Case...,129558.0,12.0,2.0,3.0,0.0,True
9,http:##ebbets.poly.edu#hstein#pubs#brof.ps,Learning Boolean Read-Once Formulas over Gene...,A read-once formula is one in which each vari...,/Artificial_Intelligence/Machine_Learning/Theory/,95225.0,16.0,6.0,5.0,3.0,True
12,http:##www.aic.nrl.navy.mil#~spears#papers#sia...,A COMPRESSION ALGORITHM FOR PROBABILITY TRANS...,This paper describes a compression algorithm ...,/Artificial_Intelligence/Machine_Learning/Gene...,447224.0,11.0,1.0,3.0,1.0,True
13,ftp:##lumpi.informatik.uni-dortmund.de#pub#bio...,Complexity Compression and Evolution,Compression of information is an important co...,/Artificial_Intelligence/Machine_Learning/Gene...,94641.0,4.0,11.0,1.0,8.0,True
18,http:##www.research.att.com#~yoav#papers#majp....,Boosting a weak learning algorithm by majorit...,We present an algorithm for improving the acc...,/Artificial_Intelligence/Machine_Learning/Theory/,6125.0,19.0,35.0,4.0,19.0,True
...,...,...,...,...,...,...,...,...,...,...
11709,http:##www.wjh.harvard.edu#~hasselmo#NIPShasse...,Cholinergic suppression of transmission may a...,Selective suppression of transmission at feed...,/Artificial_Intelligence/Machine_Learning/Neur...,1365.0,12.0,156.0,0.0,43.0,True
11710,http:##www.wpi.edu#~mhchen#normc1.ps,Estimating Ratios of Normalizing Constants fo...,"In Bayesian inference, a Bayes factor is defi...",/Artificial_Intelligence/Machine_Learning/Prob...,643003.0,32.0,1.0,1.0,0.0,True
11717,http:##wwwipd.ira.uka.de#~prechelt#Biblio#mppm...,A Parallel Programming Model for Irregular Dy...,A compiler for CuPit has been built for the M...,/Artificial_Intelligence/Machine_Learning/Neur...,60169.0,7.0,1.0,3.0,1.0,True
11718,http:##wwwipd.ira.uka.de#~prechelt#Biblio#neur...,Investigation of the CasCor Family of Learnin...,Six learning algorithms are investigated and ...,/Artificial_Intelligence/Machine_Learning/Neur...,986996.0,22.0,1.0,2.0,1.0,True


In [50]:
df1["category"].str[len("/Artificial_Intelligence/"):]

0                   Machine_Learning/Case-Based/
8                       Machine_Learning/Theory/
9                       Machine_Learning/Theory/
10                      Machine_Learning/Theory/
12          Machine_Learning/Genetic_Algorithms/
                          ...                   
11721                   Machine_Learning/Theory/
11722    Machine_Learning/Probabilistic_Methods/
11723    Machine_Learning/Probabilistic_Methods/
11724    Machine_Learning/Probabilistic_Methods/
11725    Machine_Learning/Probabilistic_Methods/
Name: category, Length: 4323, dtype: object

In [51]:
df1["category"].unique()

array(['/Artificial_Intelligence/Machine_Learning/Case-Based/',
       '/Artificial_Intelligence/Machine_Learning/Theory/',
       '/Artificial_Intelligence/Machine_Learning/Genetic_Algorithms/',
       '/Artificial_Intelligence/Machine_Learning/Probabilistic_Methods/',
       '/Artificial_Intelligence/Machine_Learning/Neural_Networks/',
       '/Artificial_Intelligence/Machine_Learning/Rule_Learning/',
       '/Artificial_Intelligence/Machine_Learning/Reinforcement_Learning/'],
      dtype=object)

In [52]:
df_to_iter = df4.join(df_new2, on="int_id", rsuffix="_new")
df_to_iter[~df_to_iter["Abstract:"].isnull()]

Unnamed: 0,id,Title:,Abstract:,category,int_id,cites_count,cited_count,out_degree,in_degree,help,category_new
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...,/Artificial_Intelligence/Machine_Learning/Case...,129558.0,12.0,2.0,3.0,0.0,True,Case_Based
9,http:##ebbets.poly.edu#hstein#pubs#brof.ps,Learning Boolean Read-Once Formulas over Gene...,A read-once formula is one in which each vari...,/Artificial_Intelligence/Machine_Learning/Theory/,95225.0,16.0,6.0,5.0,3.0,True,Theory
10,http:##www.cs.orst.edu#~tadepall#research#pape...,Learning from Examples and Membership Queries...,It is well known that prior knowledge or bias...,/Artificial_Intelligence/Machine_Learning/Theory/,1116454.0,44.0,,,,False,
12,http:##www.aic.nrl.navy.mil#~spears#papers#sia...,A COMPRESSION ALGORITHM FOR PROBABILITY TRANS...,This paper describes a compression algorithm ...,/Artificial_Intelligence/Machine_Learning/Gene...,447224.0,11.0,1.0,3.0,1.0,True,Genetic_Algorithms
13,ftp:##lumpi.informatik.uni-dortmund.de#pub#bio...,Complexity Compression and Evolution,Compression of information is an important co...,/Artificial_Intelligence/Machine_Learning/Gene...,94641.0,4.0,11.0,1.0,8.0,True,Genetic_Algorithms
...,...,...,...,...,...,...,...,...,...,...,...
11719,http:##wwwipd.ira.uka.de#~prechelt#Biblio#neur...,Some Notes on Neural Learning Algorithm Bench...,New neural learning algorithms are often benc...,/Artificial_Intelligence/Machine_Learning/Neur...,1109203.0,4.0,,,,False,
11720,http:##wwwsyseng.anu.edu.au#~jon#papers#acnn98...,TDLeaf(): Combining Temporal Difference learn...,"In this paper we present TDLeaf(), a variatio...",/Artificial_Intelligence/Machine_Learning/Rein...,1113182.0,9.0,,,,False,Reinforcement_Learning
11721,http:##zen.efs.mq.edu.au:80#~akozek#GAMBL.ps,A Rule of Thumb (not only) for Gamblers,Let prize X in a game be a random variable wi...,/Artificial_Intelligence/Machine_Learning/Theory/,1121513.0,6.0,,,,False,
11722,http:##zen.efs.mq.edu.au:80#~akozek#NoLoEss.ps,A New Nonparametic Estimation Method: Local a...,In the paper we consider a new class (called ...,/Artificial_Intelligence/Machine_Learning/Prob...,119952.0,14.0,1.0,0.0,1.0,True,


In [53]:
df_to_iter[~df_to_iter["Abstract:"].isnull()]["category_new"].count()

2430

In [54]:
df_to_iter[~df_to_iter["category_new"].isnull()]

Unnamed: 0,id,Title:,Abstract:,category,int_id,cites_count,cited_count,out_degree,in_degree,help,category_new
0,http:##www.cc.gatech.edu#faculty#ashwin#papers...,A Goal-Based Approach to Intelligent Informat...,Intelligent information retrieval (IIR) requi...,/Artificial_Intelligence/Machine_Learning/Case...,129558.0,12.0,2.0,3.0,0.0,True,Case_Based
8,http:##www.iscs.nus.sg#~plong#papers#cgc.ps,Composite Geometric Concepts and Polynomial P...,,/Artificial_Intelligence/Machine_Learning/Theory/,42221.0,25.0,8.0,,,False,Theory
9,http:##ebbets.poly.edu#hstein#pubs#brof.ps,Learning Boolean Read-Once Formulas over Gene...,A read-once formula is one in which each vari...,/Artificial_Intelligence/Machine_Learning/Theory/,95225.0,16.0,6.0,5.0,3.0,True,Theory
12,http:##www.aic.nrl.navy.mil#~spears#papers#sia...,A COMPRESSION ALGORITHM FOR PROBABILITY TRANS...,This paper describes a compression algorithm ...,/Artificial_Intelligence/Machine_Learning/Gene...,447224.0,11.0,1.0,3.0,1.0,True,Genetic_Algorithms
13,ftp:##lumpi.informatik.uni-dortmund.de#pub#bio...,Complexity Compression and Evolution,Compression of information is an important co...,/Artificial_Intelligence/Machine_Learning/Gene...,94641.0,4.0,11.0,1.0,8.0,True,Genetic_Algorithms
...,...,...,...,...,...,...,...,...,...,...,...
11706,http:##www.wi.leidenuniv.nl#~gusz#vdhauw.ps.gz,Evaluating and Improving Steady State Evoluti...,,/Artificial_Intelligence/Machine_Learning/Gene...,646900.0,68.0,4.0,,,False,Genetic_Algorithms
11709,http:##www.wjh.harvard.edu#~hasselmo#NIPShasse...,Cholinergic suppression of transmission may a...,Selective suppression of transmission at feed...,/Artificial_Intelligence/Machine_Learning/Neur...,1365.0,12.0,156.0,0.0,43.0,True,Neural_Networks
11710,http:##www.wpi.edu#~mhchen#normc1.ps,Estimating Ratios of Normalizing Constants fo...,"In Bayesian inference, a Bayes factor is defi...",/Artificial_Intelligence/Machine_Learning/Prob...,643003.0,32.0,1.0,1.0,0.0,True,Probabilistic_Methods
11717,http:##wwwipd.ira.uka.de#~prechelt#Biblio#mppm...,A Parallel Programming Model for Irregular Dy...,A compiler for CuPit has been built for the M...,/Artificial_Intelligence/Machine_Learning/Neur...,60169.0,7.0,1.0,3.0,1.0,True,Neural_Networks


# Try to reproduce the word vectors attached to the "new" CORA dataset

In [55]:
# function performin basic text preprocessing

import nltk
import string
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

my_stopwords = set(["this", "use","learning", "paper"])

def process_abstracts(df, column, alt_column):
    en_stopwords = set(stopwords.words('english')) | set(string.punctuation) | my_stopwords
    ids = df["int_id"]
    vals = df[column]
    alt_vals = df[alt_column]
    stemmer = PorterStemmer()
    all_tokens = {}
    for the_id, val, alt_val in zip(ids, vals, alt_vals):
        if val is None:
            val=alt_val            
        if val is None:
            all_tokens.append([])
            continue
        tokens = [token.lower() for token in word_tokenize(val) if token.lower() not in en_stopwords]
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        all_tokens[the_id] = [token for token in stemmed_tokens if token not in en_stopwords]
    return all_tokens
    

In [56]:
df_token = df_to_iter[~df_to_iter["category_new"].isnull()]

abstract_token_dict = process_abstracts(df_token, "Abstract:", "Title:")
#abstract_token_dict = {key: value for key, value in zip(df_token["int_id"], tokens)}

In [57]:
title_token_dict = process_abstracts(df_token, "Title:", "Abstract:")
#title_token_dict = {key: value for key, value in zip(df_token["int_id"], tokens)}

In [58]:
#tokens = [l1+l2 for l1, l2 in zip(abstract_tokens, title_tokens)]
token_dict = {key: abstract_token_dict[key]+title_token_dict[key] for key in abstract_token_dict.keys()}

### Term frequency in old CORA

In [59]:
from collections import Counter
from itertools import chain

In [60]:
C = Counter(chain.from_iterable(set(tl) for tl in token_dict.values() if tl is not None))

In [61]:
C.most_common(20)

[('algorithm', 1078),
 ('problem', 1020),
 ('result', 935),
 ('model', 852),
 ('present', 820),
 ('gener', 789),
 ('method', 769),
 ('system', 745),
 ('network', 733),
 ('show', 717),
 ('perform', 691),
 ('approach', 668),
 ('base', 594),
 ('function', 580),
 ('also', 575),
 ('comput', 570),
 ('describ', 567),
 ('set', 567),
 ('new', 563),
 ('neural', 511)]

In [62]:
term_frequency=C

In [63]:
df_to_iter[df_to_iter["int_id"]==31336]["Title:"][11469]

' The megaprior heuristic for discovering protein sequence patterns'

### Calculate document frequency in new CORA

In [64]:
df_new.drop([0,1434], axis=1).sum(axis=0)

1        16
2        33
3        70
4        72
5       165
       ... 
1429      3
1430     34
1431      6
1432     65
1433     12
Length: 1433, dtype: int64

In [65]:
document_frequency = df_new.drop([0, 1434], axis=1).sum(axis=0)

### Try to find the words that match the new Cora data

In [66]:
word_shared_by_all_entries = [None]*df_new.shape[1]
most_frequent_words_per_column = [Counter() for _ in range(df_new.shape[1])]
for _, row in df_new.iterrows():
    the_id = row[0]
    if token_dict[the_id] is None:
        continue
    the_tokens = set(token_dict[the_id])
    for column in np.nonzero(row)[0][1:]:
        if word_shared_by_all_entries[column] is None:
            word_shared_by_all_entries[column] = the_tokens
        else:
            word_shared_by_all_entries[column] &= the_tokens
            most_frequent_words_per_column[column].update(the_tokens)
top_most_frequent_words_per_column = [col.most_common(3) for col in most_frequent_words_per_column]

In [67]:
# The output below indicates that for many of the articles, there is not one word shared amoung all the 
#  papers that have a one in a single column
#  the output {'lifetim'} indicates that there was one word that was shared by all the papers 
#  with a 1 in the corresponding column

# for some columns it is not obvious which is the token that belongs to it.
word_shared_by_all_entries[:27]

[None,
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'lifetim'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'algorithm', 'better', 'differ', 'english', 'sever'},
 set()]

In [68]:
# what does the output mean?
# the output  [('facilit', 12), ('process', 7), ('show', 7)] means that amoung the papers that 
#  have a 1 in column two, there were 12 with the token facilit, 7 with process and 7 with show
top_most_frequent_words_per_column[:21]

[[],
 [('facilit', 14), ('process', 7), ('show', 7)],
 [('paradigm', 31), ('approach', 16), ('problem', 15)],
 [('need', 67), ('system', 30), ('present', 28)],
 [('extend', 71), ('algorithm', 33), ('result', 30)],
 [('reason', 161), ('problem', 88), ('system', 87)],
 [('correctli', 6), ('problem', 4), ('inform', 3)],
 [('encod', 27), ('algorithm', 17), ('problem', 15)],
 [('thank', 7), ('research', 5), ('help', 5)],
 [('numer', 31), ('method', 17), ('problem', 13)],
 [('lifetim', 5), ('space', 4), ('differ', 3)],
 [('parameter', 12), ('result', 8), ('gener', 7)],
 [('behav', 8), ('network', 7), ('neural', 6)],
 [('aim', 29), ('problem', 14), ('present', 13)],
 [('dimension', 19), ('high', 10), ('algorithm', 10)],
 [('outcom', 10), ('method', 5), ('result', 5)],
 [('boundari', 10), ('result', 5), ('algorithm', 5)],
 [('latent', 15), ('model', 14), ('data', 10)],
 [('taken', 9), ('perform', 6), ('well', 5)],
 [('semant', 15), ('show', 11), ('model', 7)],
 [('network', 558), ('neural', 37

### Compute corrected frequency counts

This better measures the surprise of observing such a count. It essetially discounts the very frequent tokens by their expected frequency.

In [69]:
corpus_size = len(df_to_iter)

In [70]:
import math
corr_frequencies = []
for the_counter, document_f in zip(most_frequent_words_per_column[1:], document_frequency):
    new_d = {}
    for token, frequency in the_counter.items():
        corr_frequency = frequency - math.floor(document_f/corpus_size * term_frequency[token])
        if corr_frequency > 1:
            new_d[token] = corr_frequency
    
    corr_frequencies.append(Counter(new_d))
top_corr_words_per_column = [col.most_common(3) for col in corr_frequencies]

In [71]:
top_corr_words_per_column[:20]

[[('facilit', 14), ('process', 6), ('show', 5)],
 [('paradigm', 31), ('approach', 11), ('learn', 8)],
 [('need', 64), ('system', 18), ('learn', 15)],
 [('extend', 68), ('abstract', 17), ('algorithm', 16)],
 [('reason', 151), ('case-bas', 74), ('case', 63)],
 [('correctli', 6), ('problem', 3), ('inform', 3)],
 [('encod', 27), ('genet', 13), ('algorithm', 10)],
 [('thank', 7), ('research', 5), ('help', 5)],
 [('numer', 31), ('method', 12), ('techniqu', 8)],
 [('lifetim', 5), ('space', 4), ('differ', 3)],
 [('parameter', 12), ('result', 6), ('gener', 5)],
 [('behav', 8), ('network', 6), ('neural', 5)],
 [('aim', 29), ('complex', 10), ('order', 9)],
 [('dimension', 19), ('high', 10), ('space', 7)],
 [('outcom', 10), ('method', 4), ('decis', 3)],
 [('boundari', 10), ('effici', 3), ('classif', 3)],
 [('latent', 15), ('model', 11), ('data', 9)],
 [('taken', 9), ('well', 5), ('perform', 5)],
 [('semant', 15), ('show', 9), ('reason', 6)],
 [('network', 464), ('neural', 306), ('train', 113)]]

### Try to reconstruct the words that were used in the CORA dataset

In [72]:
t1 = 0.7
t2 = 0.8

def simple_filter_for_frequency_counts(t1, t2, most_frequent_words_per_column, document_frequency):
    """
    trying to identify the potential words that belong to columns of the CORA dataset.

    we therefor employ two thresholds:
     - t1 asserts the relative frequency of the most frequent candidate for that column to the second most frequent candidate
     - t2 asserts the relative frequency of the most frequent to the overall frequency of that token
    """
    out = []
    for suggestion, df in zip(most_frequent_words_per_column, document_frequency):
        if len(suggestion) < 2:
            out.append(None)
            continue
        word1, count1 = suggestion[0]
        _, count2 = suggestion[1]
        if  t2 > count2 / count1 and count1 / df > t1:
            out.append(word1)
        else:
            out.append(None)
    return out

In [73]:
simple_filter_for_frequency_counts(t1, t2, top_most_frequent_words_per_column[1:], document_frequency)[:30]

['facilit',
 'paradigm',
 'need',
 'extend',
 'reason',
 'correctli',
 'encod',
 'thank',
 'numer',
 None,
 'parameter',
 None,
 'aim',
 'dimension',
 'outcom',
 'boundari',
 None,
 'taken',
 'semant',
 'network',
 None,
 'onlin',
 None,
 'matrix',
 None,
 'empir',
 'involv',
 'varieti',
 'industri',
 None]

In [74]:
word_guesses_corr = simple_filter_for_frequency_counts(t1, t2, top_corr_words_per_column, document_frequency)

word_guesses_corr[:30]

['facilit',
 'paradigm',
 'need',
 'extend',
 'reason',
 'correctli',
 'encod',
 'thank',
 'numer',
 None,
 'parameter',
 'behav',
 'aim',
 'dimension',
 'outcom',
 'boundari',
 'latent',
 'taken',
 'semant',
 'network',
 None,
 'onlin',
 None,
 'matrix',
 None,
 'empir',
 'involv',
 'varieti',
 'industri',
 None]

In [75]:
def frequency_of_unidentified_words(counts_per_column, estimated_token):
    for counts, token in zip(counts_per_column, estimated_token):
        if token is None:
            if len(counts)>0 and counts[0][1] > 10:
                print(counts[0][1])

In [76]:
frequency_of_unidentified_words(top_corr_words_per_column, word_guesses_corr)

17
11
19
17
15
12
17
17
13
28
13
20
20
16
32
30
12
100
354
14
63
14
13
18
14
11
37
12
221
11
53
101
247
20
12
27
13
30
38
38


In [77]:
df_new.drop([0,1434], axis=1).sum(axis=0)

1        16
2        33
3        70
4        72
5       165
       ... 
1429      3
1430     34
1431      6
1432     65
1433     12
Length: 1433, dtype: int64

In [None]:
list(file.items())

In [None]:
file['attr_names'][100:240]