## Citation Analysis

In this notebook, we analyze the citation patterns for each selected topic.

In [1]:
from collections import defaultdict
from datetime import datetime
import pytz
import os
import sys
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [2]:
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('.'))

In [3]:
os.path.abspath('.')

'/Users/ahren/Workspace/NLP/arXivData/notebooks'

In [4]:
import const
from arguments import parse_args
from utility.metrics import calculate_citation_diversity
from utility.utils_data import load_semantic_scholar_papers, \
    load_semantic_scholar_references_parquet, load_keywords, load_arXiv_data
from utility.utils_misc import project_setup
from utility.utils_time import time_difference_in_days


In [5]:
# Calculate average age of citations for all types of references


data_dir = os.path.expanduser("~/Workspace/data")
semantic_scholar_papers = load_semantic_scholar_papers(data_dir)
arxiv_data = load_arXiv_data(data_dir)


Loaded 2022476 entries from Semantic Scholar.
Loaded 2118385 arXiv papers in 10.497 secs.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column_name].fillna("", inplace=True)


In [6]:
for year in range(2023, 2024):
    references_one_year = load_semantic_scholar_references_parquet(year, data_dir)


TypeError: can only concatenate str (not "int") to str

In [None]:
reference_one_paper = references_one_year.iloc[0]

In [None]:
feature_name = "title_and_abstract"

mask = {}
for index_row, row in tqdm(arxiv_data.iterrows(), total=len(arxiv_data)):
    # print(row)
    contains_keywords = False
    # for keyword in const.SUBJECT2KEYWORDS['Computer Science'][8]:
    keywords_one_paper = set([kwd.strip().lower() for kwd in row[f"{feature_name}_keywords"].split(',')])

    for keyword in const.SUBJECT2KEYWORDS['Computer Science'][8]:
        # print(keywords_one_paper)
        
        if keyword.lower() in keywords_one_paper:
            contains_keywords = True
            # print(keyword)
            
    id = row['id']
    # id = row['id'].split("arxiv.org/abs/")[-1]
    # if id[-2] == 'v':
    #     id = id[:-2]

    # elif id[-3] == 'v':
    #     id = id[:-3]
    
    mask[id] = contains_keywords 


In [44]:
const.SUBJECT2KEYWORDS['Computer Science'][8]

('large language models',
 'llm',
 'llms',
 'gpt',
 'chatgpt',
 'gpt4',
 'gpt3',
 'gpt-4',
 'gpt-3',
 'rlhf',
 'chain-of-thought',
 'chain of thought',
 'chain of thoughts',
 'cot')

In [92]:
sum(mask.values())

14733

In [93]:
mask = pd.Series(mask)

In [94]:
mask

http://arxiv.org/abs/physics/9403001      False
http://arxiv.org/abs/physics/9403001v1    False
http://arxiv.org/abs/math/9201239         False
http://arxiv.org/abs/math/9201204         False
http://arxiv.org/abs/math/9201203         False
                                          ...  
http://arxiv.org/abs/2406.16863v1         False
http://arxiv.org/abs/2406.16864v1         False
http://arxiv.org/abs/2406.16865v1         False
http://arxiv.org/abs/2406.16866v1         False
http://arxiv.org/abs/2406.16867v1         False
Length: 2118385, dtype: bool

In [84]:
arxiv_data[mask.index]

KeyError: "None of [Index(['physics/9403001', 'math/9201239', 'math/9201204', 'math/9201203',\n       'math/9201205', 'math/9201206', 'math/9201207', 'cs/9301111',\n       'math/9201240', 'math/9201241',\n       ...\n       '2406.16858', '2406.16859', '2406.16860', '2406.16861', '2406.16862',\n       '2406.16863', '2406.16864', '2406.16865', '2406.16866', '2406.16867'],\n      dtype='object', length=2042914)] are in the [columns]"

In [105]:
arxiv_data.set_index('id').loc[mask.index.values]

Unnamed: 0_level_0,title,summary,arxiv_comment,published,updated,authors,tags,tags_cleaned,title_keywords,title_and_abstract_keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
http://arxiv.org/abs/physics/9403001,Desperately Seeking Superstrings,We provide a detailed analysis of the problems...,originally appeared as a Reference Frame in Ph...,1986-04-25 15:39:49+00:00,1986-04-25 15:39:49+00:00,"['Paul Ginsparg', 'Sheldon Glashow']","[physics.pop-ph, hep-th]","[physics.pop-ph, hep-th]","Superstrings, String theory, Quantum gravity","superstring theory, theoretical physics, quant..."
http://arxiv.org/abs/physics/9403001v1,Desperately Seeking Superstrings,We provide a detailed analysis of the problems...,originally appeared as a Reference Frame in Ph...,1986-04-25 15:39:49+00:00,1986-04-25 15:39:49+00:00,"['Paul Ginsparg', 'Sheldon Glashow']","[physics.pop-ph, hep-th]","[physics.pop-ph, hep-th]","Superstrings, String theory, Physics","superstring theory, theoretical physics, quant..."
http://arxiv.org/abs/math/9201239,A note on canonical functions,We construct a generic extension in which the ...,,1989-04-15 00:00:00+00:00,1989-04-15 00:00:00+00:00,"['Thomas Jech', 'Saharon Shelah']",[math.LO],[math.LO],"canonical functions, algebraic geometry, birat...","canonical functions, generic extension, aleph_..."
http://arxiv.org/abs/math/9201204,Shadows of convex bodies,It is proved that if $C$ is a convex body in $...,,1989-10-26 14:59:00+00:00,1989-10-26 14:59:00+00:00,['Keith Ball'],"[math.MG, math.FA, 52A20, 52A40]","[math.MG, math.FA, 52A20, 52A40]","Convex bodies, Shadows, Geometry","convex bodies, shadows, affine image, orthogon..."
http://arxiv.org/abs/math/9201203,Convex bodies with few faces,"It is proved that if $u_1,\ldots, u_n$ are vec...",,1989-10-26 14:59:00+00:00,1989-10-26 14:59:00+00:00,"['Keith Ball', 'Alain Pajor']","[math.MG, math.FA, 52A20, 10E05]","[math.MG, math.FA, 52A20, 10E05]","Convex bodies, Few faces, Geometry","Convex bodies, Faces, Vectors, Volume, Symmetr..."
...,...,...,...,...,...,...,...,...,...,...
http://arxiv.org/abs/2406.16863v1,FreeTraj: Tuning-Free Trajectory Control in Vi...,Diffusion model has demonstrated remarkable ca...,Project Page: http://haonanqiu.com/projects/Fr...,2024-06-24 17:59:56+00:00,2024-06-24 17:59:56+00:00,"['Haonan Qiu', 'Zhaoxi Chen', 'Zhouxia Wang', ...",[cs.CV],,"Trajectory Control, Video Diffusion Models, Tu...","Diffusion model, video generation, trajectory ..."
http://arxiv.org/abs/2406.16864v1,StableNormal: Reducing Diffusion Variance for ...,This work addresses the challenge of high-qual...,"HF Demo: hf.co/Stable-X, Video:\n https://www...",2024-06-24 17:59:58+00:00,2024-06-24 17:59:58+00:00,"['Chongjie Ye', 'Lingteng Qiu', 'Xiaodong Gu',...","[cs.CV, cs.AI, cs.GR]",,"Diffusion Variance, Stable Normal, Sharp Normal","surface normal estimation, diffusion priors, d..."
http://arxiv.org/abs/2406.16865v1,Variational Monte Carlo Study of the Doped $t$...,The ground state of the bipartite $t$-$J$ mode...,"14 pages, 11 figures",2024-06-24 17:59:58+00:00,2024-06-24 17:59:58+00:00,"['Can Cui', 'Jing-Yu Zhao', 'Zheng-Yu Weng']",[cond-mat.str-el],,"Variational Monte Carlo, Doped t-J Model, Hone...","Variational Monte Carlo, Doped t-J Model, Hone..."
http://arxiv.org/abs/2406.16866v1,Revisiting Referring Expression Comprehension ...,Referring expression comprehension (REC) invol...,,2024-06-24 17:59:58+00:00,2024-06-24 17:59:58+00:00,"['Jierun Chen', 'Fangyun Wei', 'Jinjing Zhao',...",[cs.CV],,"Referring Expression Comprehension, Evaluation...","Referring expression comprehension, Large mult..."


In [104]:
arxiv_data.set_index('id').loc[['http://arxiv.org/abs/physics/9403001']]

Unnamed: 0_level_0,title,summary,arxiv_comment,published,updated,authors,tags,tags_cleaned,title_keywords,title_and_abstract_keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
http://arxiv.org/abs/physics/9403001,Desperately Seeking Superstrings,We provide a detailed analysis of the problems...,originally appeared as a Reference Frame in Ph...,1986-04-25 15:39:49+00:00,1986-04-25 15:39:49+00:00,"['Paul Ginsparg', 'Sheldon Glashow']","[physics.pop-ph, hep-th]","[physics.pop-ph, hep-th]","Superstrings, String theory, Quantum gravity","superstring theory, theoretical physics, quant..."


In [88]:
total_age = 0
count = 0

# Set a specific timezone, e.g., UTC
timezone = pytz.timezone('UTC')

all_time_differences = []

for paper in reference_one_paper.references:
    # print(paper)
    # print(paper['citedPaper']['publicationDate'])
    timestamp_reference = timezone.localize(datetime.strptime(paper['citedPaper']['publicationDate'], "%Y-%m-%d"))
    all_time_differences += [(reference_one_paper['arXivPublicationDate'] - timestamp_reference)]
    


TypeError: strptime() argument 1 must be str, not None