In [40]:
#pip install bash_kernel

In [41]:
import re
import pandas as pd
import os
import glob
from datetime import datetime

## Data preprocessing for the citation rate prediction

Data can be found at https://snap.stanford.edu/data/cit-HepTh.html

It represents the citation network in the fields of high energy theoretical physics



Let`s download the data and unzip it

In [42]:
%%bash
mkdir data
cd data
wget https://snap.stanford.edu/data/cit-HepTh.txt.gz
wget https://snap.stanford.edu/data/cit-HepTh-dates.txt.gz
wget https://snap.stanford.edu/data/cit-HepTh-abstracts.tar.gz
gzip -d cit-HepTh.txt.gz
gzip -d cit-HepTh-dates.txt.gz
gzip -d cit-HepTh-abstracts.tar.gz
tar -xf cit-HepTh-abstracts.tar

--2024-11-04 17:51:49--  https://snap.stanford.edu/data/cit-HepTh.txt.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1317497 (1.3M) [application/x-gzip]
Saving to: ‘cit-HepTh.txt.gz’

     0K .......... .......... .......... .......... ..........  3% 90.8K 14s
    50K .......... .......... .......... .......... ..........  7%  264K 9s
   100K .......... .......... .......... .......... .......... 11% 5.74M 6s
   150K .......... .......... .......... .......... .......... 15%  282K 5s
   200K .......... .......... .......... .......... .......... 19% 6.79M 4s
   250K .......... .......... .......... .......... .......... 23%  280K 4s
   300K .......... .......... .......... .......... .......... 27% 8.92M 3s
   350K .......... .......... .......... .......... .......... 31% 15.8M 2s
   400K .......... .......... .......... .......

The following script will remove unnecessary lines from the files, and unfolder the folders with data

In [43]:
%%bash
cd data
sed '1,4d' Cit-HepTh.txt > temp.txt && mv temp.txt Cit-HepTh.txt
sed '1d' Cit-HepTh-dates.txt > temp.txt && mv temp.txt Cit-HepTh-dates.txt
mv Cit-HepTh.txt edgelist.txt
mv Cit-HepTh-dates.txt dates.txt

In [None]:
%%bash
cd data
mkdir -p meta_files

for folder in *; do
  if [ -d "$folder" ] && [ "$folder" != "meta_files" ]; then
    mv "$folder"/*.* meta_files/
  fi
done

find . -type d -empty -not -path "./meta_files" -delete

Parsing the metadata files. We consider all other information to be too sparse or irrelevant for the research

In [None]:
paper_features = ('Paper','Date','Title','Authors','Abstract')

In [None]:
def data_dict_from_file(filename):
    data_dict = dict()
    with open(filename, 'r') as file:
        text = file.read()

        last_comment = text.split("\\\\")[2].strip()

        # Extract key-value pairs using regular expressions and split by newline
        pattern = r"(\S+): (.+)"
        matches = re.findall(pattern, text.split("\\\\")[1], re.MULTILINE)

        # Create a dictionary from the matches
        data_dict = {key.strip(): value.strip() for key, value in matches}
        to_pop = list()
        for key in data_dict.keys():
            if key not in paper_features:
                to_pop.append(key)
        for key in to_pop:
            data_dict.pop(key, None)
        data_dict['Abstract'] = last_comment
        data_dict['Paper'] = int(data_dict['Paper'][7:])
        
        return data_dict

In [None]:
dict_list = list()

In [None]:
directory = os.path.join('data', 'meta_files')
files = glob.glob(os.path.join(directory, '*'))

for file in files:
    if os.path.isfile(file):
        new_dict = data_dict_from_file(file)
        dict_list.append(new_dict)

In [None]:
full_data_dictionary = dict()
for feature in paper_features:
    full_data_dictionary[feature] = []

In [None]:
for d in dict_list:
    for key in paper_features:
        if key not in d.keys():
            d[key] = ''
        full_data_dictionary[key].append(d[key])

Creating dataframe with everything we need (dates still need some manipulation)

In [None]:
df = pd.DataFrame.from_dict(full_data_dictionary)
df.head()

Let`s fix the dates

In [None]:
def is_date_format(date_str,form):
    pattern = r'^\d{2}-[A-Z]{3}-\d{4}$'
    if form == 2:
        pattern = r'^\d{2}/\d{2}/\d{2}$'
    if form == 3:
        pattern = r'^(0?\d|[12]\d|3[01])-[a-zA-Z]{3}-\d{4}$'
    if re.match(pattern, date_str):
        return True
    else:
        return False

In [None]:
for i, row in df.iterrows():
    
    if (row['Paper'] == 9509068): #no data in 1 case
        row['Date'] = None
        continue
        
    date_str = row['Date']
    date_str = date_str.split()
    date_obj = None
    
    if is_date_format(date_str[0],1):
        sp = date_str[0].split('-')
        date_conc = sp[0] + " " + sp[1] + " " + sp[2]
        date_obj = datetime.strptime(date_conc, "%d %b %Y")
        
    elif is_date_format(date_str[0],2): 
        date_obj = datetime.strptime(date_str[0], "%m/%d/%y")
    
    elif is_date_format(date_str[0],3):
        sp = date_str[0].split('-')
        date_conc = '0' + sp[0] + " " + sp[1] + " " + sp[2]
        date_obj = datetime.strptime(date_conc, "%d %b %Y")
    else:
        date_str = date_str[(1 - date_str[0][0].isdigit()):]

        if not date_str[0].isdigit():
            tmp = date_str[0]
            date_str[0] = date_str[1]
            date_str[1] = tmp

        ind = 2
        if date_str[2][-1] == ',':
            date_str[2] = date_str[2][:-1]
        while not date_str[2].isdigit():
            ind += 1
            date_str[2] = date_str[ind]
        if int(date_str[2]) < 100:

            date_str[2] = '19' + date_str[2]

        date_str[1] = date_str[1][:3].upper()
        date_only = date_str[0] +" "+ date_str[1] +" "+ date_str[2]
        date_obj = datetime.strptime(date_only, "%d %b %Y")
        
    formatted_date = date_obj.strftime("%Y-%m-%d")
    df.at[i,'Date'] = formatted_date

Now let`s turn Authors into a list

In [None]:
for i, row in df.iterrows():
    string = row['Authors']
    string = string.replace('and',',').replace(' ','').split(',')
    df.at[i,'Authors'] = string

Also let`s rename columns properly, check our df, and output the result

In [None]:
df = df.rename(columns={"Paper": "Paper_ID"})

In [None]:
df.head()

In [None]:
df.to_csv(os.path.join("data", "processed.csv"))

We didn`t need dates file yet, but that time may come