In [1]:
#pip install bash_kernel

In [2]:
import re
import pandas as pd
import os
import glob
from datetime import datetime

## Data preprocessing for the citation rate prediction

Data can be found at https://snap.stanford.edu/data/cit-HepTh.html

It represents the citation network in the fields of high energy theoretical physics



Let`s download the data and unzip it

In [None]:
%%bash
cd ..
mkdir data
cd data
wget https://snap.stanford.edu/data/cit-HepTh.txt.gz
wget https://snap.stanford.edu/data/cit-HepTh-dates.txt.gz
wget https://snap.stanford.edu/data/cit-HepTh-abstracts.tar.gz
gzip -d cit-HepTh.txt.gz
gzip -d cit-HepTh-dates.txt.gz
gzip -d cit-HepTh-abstracts.tar.gz
tar -xf cit-HepTh-abstracts.tar

--2024-11-06 22:09:18--  https://snap.stanford.edu/data/cit-HepTh.txt.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1317497 (1,3M) [application/x-gzip]
Saving to: ‘cit-HepTh.txt.gz’

     0K .......... .......... .......... .......... ..........  3% 80,8K 15s
    50K .......... .......... .......... .......... ..........  7%  271K 10s
   100K .......... .......... .......... .......... .......... 11%  300K 7s
   150K .......... .......... .......... .......... .......... 15% 6,14M 5s
   200K .......... .......... .......... .......... .......... 19% 10,3M 4s
   250K .......... .......... .......... .......... .......... 23%  287K 4s
   300K .......... .......... .......... .......... .......... 27% 44,4M 3s
   350K .......... .......... .......... .......... .......... 31% 10,6M 3s
   400K .......... .......... .......... ......

The following script will remove unnecessary lines from the files, and unfolder the folders with data

In [4]:
%%bash
cd data
sed '1,4d' cit-HepTh.txt > temp.txt && mv temp.txt cit-HepTh.txt
sed '1d' cit-HepTh-dates.txt > temp.txt && mv temp.txt cit-HepTh-dates.txt
mv cit-HepTh.txt edgelist.txt
mv cit-HepTh-dates.txt dates.txt

In [5]:
%%bash
cd data
mkdir -p meta_files

for folder in *; do
  if [ -d "$folder" ] && [ "$folder" != "meta_files" ]; then
    mv "$folder"/*.* meta_files/
  fi
done

find . -type d -empty -not -path "./meta_files" -delete

Parsing the metadata files. We consider all other information to be too sparse or irrelevant for the research

In [6]:
paper_features = ('Paper','Date','Title','Authors','Abstract')

In [7]:
def data_dict_from_file(filename):
    data_dict = dict()
    with open(filename, 'r') as file:
        text = file.read()

        last_comment = text.split("\\\\")[2].strip()

        # Extract key-value pairs using regular expressions and split by newline
        pattern = r"(\S+): (.+)"
        matches = re.findall(pattern, text.split("\\\\")[1], re.MULTILINE)

        # Create a dictionary from the matches
        data_dict = {key.strip(): value.strip() for key, value in matches}
        to_pop = list()
        for key in data_dict.keys():
            if key not in paper_features:
                to_pop.append(key)
        for key in to_pop:
            data_dict.pop(key, None)
        data_dict['Abstract'] = last_comment
        data_dict['Paper'] = int(data_dict['Paper'][7:])
        
        return data_dict

In [8]:
dict_list = list()

In [None]:
directory = os.path.join(os.pardir, 'data', 'meta_files')
files = glob.glob(os.path.join(directory, '*'))

for file in files:
    if os.path.isfile(file):
        new_dict = data_dict_from_file(file)
        dict_list.append(new_dict)

In [10]:
full_data_dictionary = dict()
for feature in paper_features:
    full_data_dictionary[feature] = []

In [11]:
for d in dict_list:
    for key in paper_features:
        if key not in d.keys():
            d[key] = ''
        full_data_dictionary[key].append(d[key])

Creating dataframe with everything we need (dates still need some manipulation)

In [12]:
df = pd.DataFrame.from_dict(full_data_dictionary)
df.head()

Unnamed: 0,Paper,Date,Title,Authors,Abstract
0,9511170,"Thu, 23 Nov 1995 15:22:03 +0000 (GMT) (8kb)",Einstein-Infeld-Hoffman method and soliton dyn...,Jacek Dziarmaga,We consider slow motion of a pointlike topolog...
1,9211024,"Wed, 4 Nov 1992 14:49 +0200 (7kb)",Discretization of the Superparticle Path Integral,"J. Grundberg, U. Lindstrom and H. Nordstrom",Requiring that the path integral has the globa...
2,9806180,"Sun, 21 Jun 1998 21:09:27 GMT (11kb)",Comments on N=2 AdS Orbifolds,Sergei Gukov,We discuss twisted states of AdS orbifolds whi...
3,108199,"Mon, 27 Aug 2001 16:34:58 GMT (10kb)",Determinant Line Bundles and Topological Invar...,"A.A. Bytsenko, M.C. Falleiros, A.E. Goncalves ...",We give some remarks on twisted determinant li...
4,110123,"Mon, 15 Oct 2001 05:40:51 GMT (5kb)",Entropy of the three dimensional Schwarzschild...,,We study the three dimensional Schwarzschild-d...


Let`s fix the dates

In [13]:
def is_date_format(date_str,form):
    pattern = r'^\d{2}-[A-Z]{3}-\d{4}$'
    if form == 2:
        pattern = r'^\d{2}/\d{2}/\d{2}$'
    if form == 3:
        pattern = r'^(0?\d|[12]\d|3[01])-[a-zA-Z]{3}-\d{4}$'
    if re.match(pattern, date_str):
        return True
    else:
        return False

In [14]:
for i, row in df.iterrows():
    
    if (row['Paper'] == 9509068):  # no year specified, add it manually
        df.at[i,'Date'] = "2000-09-13"
        continue
        
    date_str = row['Date']
    date_str = date_str.split()
    date_obj = None
    
    if is_date_format(date_str[0],1):
        sp = date_str[0].split('-')
        date_conc = sp[0] + " " + sp[1] + " " + sp[2]
        date_obj = datetime.strptime(date_conc, "%d %b %Y")
        
    elif is_date_format(date_str[0],2): 
        date_obj = datetime.strptime(date_str[0], "%m/%d/%y")
    
    elif is_date_format(date_str[0],3):
        sp = date_str[0].split('-')
        date_conc = '0' + sp[0] + " " + sp[1] + " " + sp[2]
        date_obj = datetime.strptime(date_conc, "%d %b %Y")
    else:
        date_str = date_str[(1 - date_str[0][0].isdigit()):]

        if not date_str[0].isdigit():
            tmp = date_str[0]
            date_str[0] = date_str[1]
            date_str[1] = tmp

        ind = 2
        if date_str[2][-1] == ',':
            date_str[2] = date_str[2][:-1]
        while not date_str[2].isdigit():
            ind += 1
            date_str[2] = date_str[ind]
        if int(date_str[2]) < 100:

            date_str[2] = '19' + date_str[2]

        date_str[1] = date_str[1][:3].upper()
        date_only = date_str[0] +" "+ date_str[1] +" "+ date_str[2]
        date_obj = datetime.strptime(date_only, "%d %b %Y")
        
    formatted_date = date_obj.strftime("%Y-%m-%d")
    df.at[i,'Date'] = formatted_date

Now let`s turn Authors into a list

In [15]:
for i, row in df.iterrows():
    string = row['Authors']
    string = string.replace('and',',').replace(' ','').split(',')
    df.at[i,'Authors'] = string

Also let`s rename columns properly, check our df, and output the result

In [16]:
df = df.rename(columns={"Paper": "Paper_ID"})

In [17]:
df.head()

Unnamed: 0,Paper_ID,Date,Title,Authors,Abstract
0,9511170,1995-11-23,Einstein-Infeld-Hoffman method and soliton dyn...,[JacekDziarmaga],We consider slow motion of a pointlike topolog...
1,9211024,1992-11-04,Discretization of the Superparticle Path Integral,"[J.Grundberg, U.Lindstrom, H.Nordstrom]",Requiring that the path integral has the globa...
2,9806180,1998-06-21,Comments on N=2 AdS Orbifolds,[SergeiGukov],We discuss twisted states of AdS orbifolds whi...
3,108199,2001-08-27,Determinant Line Bundles and Topological Invar...,"[A.A.Bytsenko, M.C.Falleiros, A.E.Goncalves, Z...",We give some remarks on twisted determinant li...
4,110123,2001-10-15,Entropy of the three dimensional Schwarzschild...,[],We study the three dimensional Schwarzschild-d...


In [None]:
df.to_csv(os.path.join(os.pardir, "data", "processed.csv"))

We didn`t need dates file yet, but that time may come

We noticed that the graph has some incorrect edges: some papers cited other papers that did not yet exist at that time. Let's iterate over all the edges and delete the bad ones. 

In [19]:
import networkx as nx
from get_graph import get_digraph

In [20]:
G: nx.DiGraph = get_digraph()

In [21]:
G.number_of_nodes(), G.number_of_edges()

(27770, 352807)

In [22]:
G.nodes()

NodeView((1001, 9304045, 9308122, 9309097, 9311042, 9401139, 9404151, 9407087, 9408099, 9501030, 9503124, 9504090, 9504145, 9505025, 9505054, 9505105, 9505162, 9506048, 9506112, 9506144, 9507050, 9507158, 9508094, 9508155, 9510142, 9510225, 9510234, 9511030, 9511171, 9601108, 9602022, 9602114, 9603003, 9603150, 9603161, 9603167, 9605184, 9605222, 9606017, 9606040, 9607163, 9607207, 9608086, 9609070, 9609071, 9609239, 9611137, 9612108, 9701162, 9702094, 9702155, 9702198, 9703082, 9703166, 9704097, 9705030, 9705044, 9705104, 9705220, 9706005, 9707014, 9707042, 9707049, 9710230, 9711036, 9711104, 9712028, 9712042, 9802194, 9805056, 9805206, 9806094, 9810188, 9811217, 9905036, 9907041, 9908007, 9908144, 9909108, 9909120, 9909229, 9910238, 9910248, 9910268, 9204040, 9203084, 9204035, 9205041, 9207049, 9207111, 9301042, 9301043, 9201015, 9201040, 9201047, 9202046, 9202059, 9202092, 9203008, 9203031, 9204037, 9204046, 9205046, 9205060, 9205062, 9205068, 9206023, 9206051, 9206078, 9207016, 920

In [23]:
edges_to_remove = [e for e in G.edges if not G.nodes[e[0]]['Date'] > G.nodes[e[1]]['Date']]
G.remove_edges_from(edges_to_remove)

In [24]:
G.number_of_nodes(), G.number_of_edges()

(27770, 351285)

We decided to remove all nodes not from the largest weak component (stray nodes), since the vast majority of nodes are in the same component

In [25]:
largest_weak_component = max(nx.weakly_connected_components(G), key=len)
G: nx.DiGraph = G.subgraph(largest_weak_component)

In [26]:
G.number_of_nodes(), G.number_of_edges()

(27376, 351025)

In [None]:
with open(os.path.join(os.pardir, 'data', 'edgelist.txt'), 'wb') as edgelist_file:
    nx.write_edgelist(G, edgelist_file, data=False)

Sinse we decided to remove stray nodes let's remove them from dataset too

In [None]:
df = df[df['Paper_ID'].isin(G.nodes())]
assert df.shape[0] == G.number_of_nodes()
df.to_csv(os.path.join(os.pardir, "data", "processed.csv"), index=False)

In [29]:
os.remove(os.path.join('data', 'cit-HepTh-abstracts.tar'))