In [1]:
#pip install bash_kernel

In [2]:
import re
import pandas as pd
import os
import glob
from datetime import datetime

## Data preprocessing for the citation rate prediction

Data can be found at: https://snap.stanford.edu/data/cit-HepTh.html

It represents the citation network in the field of high-energy theoretical physics.


Unarchive all data into the "data" folder.


Before running this, make sure you are in the project root folder (the one that contains "data"). The next two cells remove unnecessary lines from the files and flatten the data subfolders into a single folder.

In [3]:
%%bash
# Strip the leading non-data lines from the two downloaded text files and give
# them shorter names. Run from the project root (the folder containing data/).
cd data || { echo "data/ not found - run this cell from the project root" >&2; exit 1; }

# Each step only runs while the raw file is still present, so re-running the
# cell after a successful run is a no-op instead of failing and leaving an
# empty temp.txt behind.
if [ -f Cit-HepTh.txt ]; then
  # Delete lines 1-4; go through temp.txt so a failed sed never leaves a
  # half-written Edgelist.txt.
  sed '1,4d' Cit-HepTh.txt > temp.txt && mv temp.txt Edgelist.txt && rm Cit-HepTh.txt
fi

if [ -f Cit-HepTh-dates.txt ]; then
  # Delete line 1 only.
  sed '1d' Cit-HepTh-dates.txt > temp.txt && mv temp.txt Dates.txt && rm Cit-HepTh-dates.txt
fi
cd ..

In [4]:
%%bash
# Move every file from the sub-folders of data/ into data/meta_files and
# remove the sub-folders that end up empty.
# Abort if data/ is missing: otherwise the mv and `find -delete` below would
# operate on whatever directory the notebook happens to be running in.
cd data || { echo "data/ not found - run this cell from the project root" >&2; exit 1; }
mkdir -p meta_files

for folder in *; do
  if [ -d "$folder" ] && [ "$folder" != "meta_files" ]; then
    # Skip folders with nothing matching *.*, so mv is never handed the
    # literal, unexpanded glob.
    if compgen -G "$folder/*.*" > /dev/null; then
      mv -- "$folder"/*.* meta_files/
    fi
  fi
done

# Remove the now-empty directories, but keep meta_files even if it is empty.
find . -type d -empty -not -path "./meta_files" -delete
cd ..

Parsing the metadata files. We consider all other information to be too sparse or irrelevant for the research.

In [5]:
# Metadata fields kept for every paper; all other header keys are discarded.
paper_features = (
    'Paper',
    'Date',
    'Title',
    'Authors',
    'Abstract',
)

In [6]:
def data_dict_from_file(filename, features=None):
    """Parse one paper metadata file into a flat dictionary.

    The file text is split on the two-backslash separator. The chunk after the
    first separator is scanned for ``Key: value`` lines (one pair per line;
    only the text on the same line as the key is captured), and the chunk
    after the second separator is stored as the abstract.

    Parameters
    ----------
    filename : str or path-like
        Path of the metadata file to read.
    features : iterable of str, optional
        Header keys to keep. Defaults to the notebook-level ``paper_features``.
        It must contain ``'Paper'``, because the paper id is always converted.

    Returns
    -------
    dict
        The kept header fields as stripped strings, plus ``'Abstract'`` (the
        stripped abstract text, always present) and ``'Paper'`` converted to
        ``int`` after dropping its first seven characters.

    Raises
    ------
    IndexError
        If the file contains fewer than two separators.
    KeyError
        If no ``Paper`` field was kept.
    ValueError
        If the remainder of the ``Paper`` value is not an integer.
    """
    if features is None:
        # Resolved at call time so the notebook-level tuple stays the default.
        features = paper_features
    wanted = set(features)

    with open(filename, 'r') as file:
        text = file.read()

    # Split once: [preamble, header, abstract, ...].
    sections = text.split("\\\\")
    header, abstract = sections[1], sections[2]

    # Extract key-value pairs; when a key repeats, the last occurrence wins.
    pattern = r"(\S+): (.+)"
    data_dict = {
        key.strip(): value.strip()
        for key, value in re.findall(pattern, header)
        if key.strip() in wanted
    }

    data_dict['Abstract'] = abstract.strip()
    # Drop the first seven characters of the id (e.g. "hep-th/0001001" -> 1001).
    data_dict['Paper'] = int(data_dict['Paper'][7:])

    return data_dict

In [7]:
# One parsed metadata dictionary per paper will be collected here.
dict_list = []

In [8]:
# Parse every metadata file under data/meta_files into one dict per paper.
# os.path.join keeps the path portable: a backslash-separated literal only
# resolves on Windows, and "\m" is an invalid string escape.
directory = os.path.join('data', 'meta_files')
# Sorted so the row order does not depend on the order in which the
# filesystem happens to list the entries.
files = sorted(glob.glob(os.path.join(directory, '*')))

# Rebuilt from scratch on every run, so re-executing this cell cannot append
# the same papers a second time. Sub-directories are skipped.
dict_list = [data_dict_from_file(file) for file in files if os.path.isfile(file)]

In [9]:
# Column-oriented container: one (initially empty) list per kept feature.
full_data_dictionary = {feature: [] for feature in paper_features}

In [10]:
# Pivot the per-paper dicts into one list per feature (column-oriented).
# The container is rebuilt here so that re-running this cell does not append
# every paper a second time.
full_data_dictionary = {key: [] for key in paper_features}
for d in dict_list:
    for key in paper_features:
        # A field missing from a paper is filled with '' (and stored back on
        # the paper's dict), so every column ends up with the same length.
        full_data_dictionary[key].append(d.setdefault(key, ''))

In [11]:
# Sanity check: print the number of collected values per feature, one per line.
column_lengths = [len(full_data_dictionary[feature]) for feature in paper_features]
for length in column_lengths:
    print(length)

29555
29555
29555
29555
29555


Creating a dataframe with everything we need (the dates still need some manipulation)

In [12]:
# Build the frame straight from the column-oriented dict (one column per key)
# and preview the first rows.
df = pd.DataFrame(full_data_dictionary)
df.head(5)

Unnamed: 0,Paper,Date,Title,Authors,Abstract
0,1001,"Sat, 1 Jan 2000 00:02:31 GMT (84kb)","Compactification, Geometry and Duality: N=2",Paul S. Aspinwall,These are notes based on lectures given at TAS...
1,1002,"Mon, 3 Jan 2000 22:38:03 GMT (64kb)",Domain Walls and Massive Gauged Supergravity P...,"M. Cvetic, H. Lu and C.N. Pope",We point out that massive gauged supergravity ...
2,1003,"Sat, 1 Jan 2000 06:14:51 GMT (2kb)","Comment on ""Metric Fluctuations in Brane Worlds""",Y.S. Myung and Gungwon Kang,"Recently, Ivanov and Volovich (hep-th/9912242)..."
3,1004,"Sat, 1 Jan 2000 19:57:21 GMT (13kb)",Moving Mirrors and Thermodynamic Paradoxes,Adam D. Helfer,"Quantum fields responding to ""moving mirrors"" ..."
4,1005,"Sun, 2 Jan 2000 17:06:40 GMT (24kb)",Bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",Various aspects of spaces of chiral blocks are...


Let's fix the dates

In [13]:
def is_date_format(date_str, form):
    """Tell whether ``date_str`` matches one of the known compact date layouts.

    ``form`` selects the layout that is tested:

    * ``2`` - ``NN/NN/NN`` (three two-digit groups separated by slashes);
    * ``3`` - ``D-Mon-YYYY`` with a one- or two-digit day (at most 31) and a
      three-letter month in any case;
    * anything else - ``DD-MON-YYYY`` with a two-digit day and an upper-case
      three-letter month.

    Returns ``True`` when the whole string matches, ``False`` otherwise.
    """
    if form == 3:
        regex = r'^(0?\d|[12]\d|3[01])-[a-zA-Z]{3}-\d{4}$'
    elif form == 2:
        regex = r'^\d{2}/\d{2}/\d{2}$'
    else:
        regex = r'^\d{2}-[A-Z]{3}-\d{4}$'
    return re.match(regex, date_str) is not None

In [14]:
# Rewrite every raw 'Date' string in df as an ISO 'YYYY-MM-DD' string.
for i, row in df.iterrows():

    if row['Paper'] == 9509068:  # no data in 1 case
        # Write through df.at: `row` is a copy produced by iterrows(), so
        # assigning to row['Date'] would leave the DataFrame unchanged.
        df.at[i, 'Date'] = None
        continue

    date_str = row['Date']
    date_str = date_str.split()
    date_obj = None

    if is_date_format(date_str[0], 1) or is_date_format(date_str[0], 3):
        # Compact "DD-MON-YYYY" / "D-Mon-YYYY" token. The day is passed as-is:
        # %d accepts a one-digit day, whereas zero-padding a two-digit day
        # ("12" -> "012") makes strptime fail. %b matches the month
        # abbreviation regardless of case.
        date_obj = datetime.strptime(date_str[0], "%d-%b-%Y")

    elif is_date_format(date_str[0], 2):
        # Compact "NN/NN/NN" token, read as month/day/two-digit-year.
        date_obj = datetime.strptime(date_str[0], "%m/%d/%y")

    else:
        # Free-form date spread over several whitespace-separated tokens.
        # Drop the first token when it does not start with a digit
        # (e.g. a weekday name such as "Sat,").
        date_str = date_str[(1 - date_str[0][0].isdigit()):]

        # If the first remaining token is not purely numeric, swap the first
        # two tokens so the numeric day comes first (e.g. "Jan 1" -> "1 Jan").
        if not date_str[0].isdigit():
            tmp = date_str[0]
            date_str[0] = date_str[1]
            date_str[1] = tmp

        # The third token should be the year: strip one trailing comma, then
        # scan forward until an all-digit token is found.
        ind = 2
        if date_str[2][-1] == ',':
            date_str[2] = date_str[2][:-1]
        while not date_str[2].isdigit():
            ind += 1
            date_str[2] = date_str[ind]
        if int(date_str[2]) < 100:
            # Years below 100 are taken to be in the 1900s.
            # NOTE(review): a two-digit "00" would become 1900 - confirm that
            # no post-1999 record uses a two-digit year.
            date_str[2] = '19' + date_str[2]

        # Keep only the first three letters of the month name for %b.
        date_str[1] = date_str[1][:3].upper()
        date_only = date_str[0] + " " + date_str[1] + " " + date_str[2]
        date_obj = datetime.strptime(date_only, "%d %b %Y")

    formatted_date = date_obj.strftime("%Y-%m-%d")
    df.at[i, 'Date'] = formatted_date

Now let's turn Authors into a list

In [15]:
def _split_authors(authors):
    """Split a free-text author string into a list of names.

    Names are separated on commas and on the standalone word "and"; the word
    boundary keeps names that merely contain those letters (e.g. "Alexander")
    in one piece. All spaces inside a name are removed, so
    "M. Cvetic, H. Lu and C.N. Pope" becomes
    ['M.Cvetic', 'H.Lu', 'C.N.Pope']. Empty pieces (from ", and" or from an
    empty input string) are dropped.
    """
    pieces = re.split(r',|\band\b', authors)
    names = [piece.replace(' ', '') for piece in pieces]
    return [name for name in names if name]


# Replace each author string in df by its list of names.
df['Authors'] = df['Authors'].map(_split_authors)

Also, let's rename the columns properly, check our df, and output the result

In [16]:
# Give the identifier column a more explicit name.
df = df.rename({"Paper": "Paper_ID"}, axis="columns")

In [17]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Paper_ID,Date,Title,Authors,Abstract
0,1001,2000-01-01,"Compactification, Geometry and Duality: N=2",[PaulS.Aspinwall],These are notes based on lectures given at TAS...
1,1002,2000-01-03,Domain Walls and Massive Gauged Supergravity P...,"[M.Cvetic, H.Lu, C.N.Pope]",We point out that massive gauged supergravity ...
2,1003,2000-01-01,"Comment on ""Metric Fluctuations in Brane Worlds""","[Y.S.Myung, GungwonKang]","Recently, Ivanov and Volovich (hep-th/9912242)..."
3,1004,2000-01-01,Moving Mirrors and Thermodynamic Paradoxes,[AdamD.Helfer],"Quantum fields responding to ""moving mirrors"" ..."
4,1005,2000-01-02,Bundles of chiral blocks and boundary conditio...,"[J.Fuchs, C.Schweigert]",Various aspects of spaces of chiral blocks are...


In [19]:
# Write the frame to CSV inside the data folder (the index is written as the
# first, unnamed column).
output_path = "data/processed.csv"
df.to_csv(output_path)

We didn't need the dates file yet, but that time may come.