# Libraries

In [1]:
import pandas as pd
import networkx as nx
import plotly.express as px

# Yearly trends

In [3]:
# get files
citations_file = "cit-HepTh.txt"
dates_file = "cit-HepTh-dates.txt"

citations = pd.read_csv(citations_file, sep=r"\s+", skiprows=1, header=None, names=["FromNodeId", "ToNodeId"])

dates = pd.read_csv(dates_file, sep=r"\s+", skiprows=0, header=None, names=["PaperId", "Date"])

#get year
dates['Date'] = pd.to_datetime(dates['Date'], errors='coerce')
dates['Year'] = dates['Date'].dt.year

# group by year
yearly_publications = dates['Year'].value_counts().sort_index().reset_index()
yearly_publications.columns = ['Year', 'Number of Publications']

fig = px.bar(yearly_publications, x='Year', y='Number of Publications',
             title="Number of Publications per Year in ArXiv HEP-TH",
             labels={"Year": "Year", "Number of Publications": "Number of Publications"})

fig.show()


# AVG Citation over years

In [4]:
# get citations per each paper
citations_count = citations['ToNodeId'].value_counts().reset_index()
citations_count.columns = ['PaperId', 'CitationCount']

# join with dates
papers_with_citations = pd.merge(dates, citations_count, how='left', left_on='PaperId', right_on='PaperId')
papers_with_citations['CitationCount'] = papers_with_citations['CitationCount'].fillna(0)

# group by years, get mean
yearly_citation_density = papers_with_citations.groupby('Year')['CitationCount'].mean().reset_index()
yearly_citation_density.columns = ['Year', 'AvgCitationCount']

fig = px.bar(yearly_citation_density, x='Year', y='AvgCitationCount',
             title="Average Citation Density per Year in ArXiv HEP-TH",
             labels={"Year": "Year", "AvgCitationCount": "Average Citation Count"})

fig.show()


In [5]:
# just a check for 1992
papers_1992 = papers_with_citations[papers_with_citations['Year'] == 1992]
print(papers_1992[['PaperId', 'CitationCount']])


       PaperId  CitationCount
0      9203201            0.0
1      9203202            0.0
2      9203203            0.0
3      9203204            0.0
4      9203205            0.0
..         ...            ...
850  119212068            0.0
851  119212148            0.0
852  119212149            0.0
853  119212004            0.0
854  119212008            0.0

[855 rows x 2 columns]


# Most Citated papers 1999 - 2000

In [7]:
# filter only 1999 and 2000
papers_1999_2000 = dates[(dates['Year'] == 1999) | (dates['Year'] == 2000)]
papers_1999_2000_ids = papers_1999_2000['PaperId'].unique()

# Leave only citations where the end ID (ToNodeId) is an article from 1999 or 2000
citations_1999_2000 = citations[citations['ToNodeId'].isin(papers_1999_2000_ids)]

# Count the number of citations for each article
citations_agg_1999_2000 = citations_1999_2000['ToNodeId'].value_counts().reset_index()
citations_agg_1999_2000.columns = ['ToNodeId', 'TotalCitationCount']


In [9]:

papers_with_citations_1999_2000 = pd.merge(
    papers_1999_2000[['PaperId', 'Year']],
    citations_agg_1999_2000,
    how='left',
    left_on='PaperId',
    right_on='ToNodeId'
)

#fill missing
papers_with_citations_1999_2000['TotalCitationCount'] = papers_with_citations_1999_2000['TotalCitationCount'].fillna(0)


In [10]:
papers_with_dates = pd.merge(papers_with_citations_1999_2000[['PaperId', 'Year']], citations_agg_1999_2000, how='left', left_on='PaperId', right_on='ToNodeId')

# missing
papers_with_dates['TotalCitationCount'] = papers_with_dates['TotalCitationCount'].fillna(0)

# sort
papers_with_dates_sorted = papers_with_dates.sort_values('TotalCitationCount', ascending=False)


print(papers_with_dates_sorted[['PaperId', 'Year', 'TotalCitationCount']].head(10))


      PaperId  Year  TotalCitationCount
1130  9904207  1999               282.0
695   9903205  1999               251.0
5999     3160  2000               242.0
5834   112044  2000               242.0
3738  9910219  1999               211.0
4636  9912249  1999               201.0
7674     7191  2000               197.0
8500     9148  2000               196.0
5544     2091  2000               186.0
5528     2075  2000               182.0


In [11]:

citations = pd.read_csv(citations_file, sep=r"\s+", skiprows=1, header=None, names=["FromNodeId", "ToNodeId"])
dates = pd.read_csv(dates_file, sep=r"\s+", skiprows=0, header=None, names=["PaperId", "Date"])

dates['Date'] = pd.to_datetime(dates['Date'], errors='coerce')
dates['Year'] = dates['Date'].dt.year

citations_count = citations['ToNodeId'].value_counts().reset_index()
citations_count.columns = ['PaperId', 'CitationCount']


papers_with_citations = pd.merge(dates, citations_count, how='left', left_on='PaperId', right_on='PaperId')
papers_with_citations['CitationCount'] = papers_with_citations['CitationCount'].fillna(0)


papers_with_citations_sorted = papers_with_citations.sort_values(by=['Year', 'CitationCount'], ascending=[True, False])


top_cited_by_year = papers_with_citations_sorted.groupby('Year').head(10)


for year in top_cited_by_year['Year'].unique():
    print(f"Top 10 most cited papers in {year}:")
    top_papers = top_cited_by_year[top_cited_by_year['Year'] == year]
    print(top_papers[['PaperId', 'CitationCount']])
    print("\n")


Top 10 most cited papers in 1992:
   PaperId  CitationCount
0  9203201            0.0
1  9203202            0.0
2  9203203            0.0
3  9203204            0.0
4  9203205            0.0
5  9203206            0.0
6  9203207            0.0
7  9203208            0.0
8  9203209            0.0
9  9203210            0.0


Top 10 most cited papers in 1993:
      PaperId  CitationCount
2259  9310202           48.0
2670  9312206           38.0
2676  9312212           15.0
2258  9310201           13.0
2672  9312208           10.0
2677  9312213           10.0
2673  9312209            9.0
2674  9312210            9.0
2671  9312207            6.0
2668  9312204            2.0


Top 10 most cited papers in 1994:
      PaperId  CitationCount
5534  9412228          110.0
5003  9410237           91.0
3934  9406206           82.0
3944  9406216           77.0
5223  9411210           68.0
5542  9412236           63.0
5535  9412229           61.0
3945  9406217           48.0
5515  9412209           47.0

# Degree and Beteewnness Centrality

In [12]:
# get 2002 only
papers_2002 = dates[dates['Year'] == 2002]
papers_2002_ids = papers_2002['PaperId'].unique()
citations_2002 = citations[citations['ToNodeId'].isin(papers_2002_ids) | citations['FromNodeId'].isin(papers_2002_ids)]

# create a graph
G_2002 = nx.DiGraph()
for _, row in citations_2002.iterrows():
    G_2002.add_edge(row['FromNodeId'], row['ToNodeId'])

# 1. Degree Centrality
degree_centrality_2002 = nx.degree_centrality(G_2002)

# 2. Betweenness Centrality
betweenness_centrality_2002 = nx.betweenness_centrality(G_2002)

# save in DataFrame
centrality_df_2002 = pd.DataFrame({
    'PaperId': list(degree_centrality_2002.keys()),
    'DegreeCentrality': list(degree_centrality_2002.values()),
    'BetweennessCentrality': list(betweenness_centrality_2002.values())
})


centrality_with_info = pd.merge(papers_2002, centrality_df_2002, how='left', on='PaperId').fillna(0)

top_degree_centrality_2002 = centrality_with_info.sort_values(by='DegreeCentrality', ascending=False).head(10)
print("Top 10 papers by Degree Centrality in 2002:")
print(top_degree_centrality_2002[['PaperId', 'DegreeCentrality', 'BetweennessCentrality']])


top_betweenness_centrality_2002 = centrality_with_info.sort_values(by='BetweennessCentrality', ascending=False).head(10)
print("\nTop 10 papers by Betweenness Centrality in 2002:")
print(top_betweenness_centrality_2002[['PaperId', 'DegreeCentrality', 'BetweennessCentrality']])


Top 10 papers by Degree Centrality in 2002:
     PaperId  DegreeCentrality  BetweennessCentrality
411   202021          0.051825               0.000205
251   201253          0.037938               0.000273
499   202109          0.032785               0.000380
79    201081          0.031782               0.000035
798   203048          0.022334               0.000084
501   202111          0.020759               0.000078
863   203101          0.019900               0.000085
569   202179          0.019470               0.000100
547   202157          0.016034               0.000036
576   202186          0.015891               0.000107

Top 10 papers by Betweenness Centrality in 2002:
     PaperId  DegreeCentrality  BetweennessCentrality
517   202127          0.006872               0.000930
766   203018          0.011739               0.000826
438   202048          0.005297               0.000818
2     201004          0.011167               0.000763
213   201215          0.006013            

# Founding Papers

In [4]:
#initial citation count
initial_citations_count = citations.groupby('ToNodeId').size().reset_index(name='InitialCitationCount')

# Add to the article data information on initial citations
papers_with_initial_citations = pd.merge(dates, initial_citations_count, how='left', left_on='PaperId', right_on='ToNodeId')

papers_with_initial_citations['InitialCitationCount'] = papers_with_initial_citations['InitialCitationCount'].fillna(0)


G = nx.DiGraph()


for _, row in citations.iterrows():
    G.add_edge(row['FromNodeId'], row['ToNodeId'])



# Look for all indirect citations, starting with each paper
def get_indirect_citations(paper_id, graph):
    if paper_id not in graph:
        return 0

    # First we get all the papers that cite this PaperId
    neighbors = list(graph.neighbors(paper_id))
    all_cited = set(neighbors)
    for neighbor in neighbors:
        all_cited.update(graph.neighbors(neighbor))  # add articles that cite these neighbours
    all_cited.discard(paper_id)
    return len(all_cited)


papers_with_initial_citations['IndirectCitationCount'] = papers_with_initial_citations['PaperId'].apply(lambda x: get_indirect_citations(x, G))


praroditeli = papers_with_initial_citations.sort_values(by='IndirectCitationCount', ascending=False)


top_praroditeli = praroditeli[['PaperId', 'Year', 'InitialCitationCount', 'IndirectCitationCount']].head(10)
print(top_praroditeli)

       PaperId  Year  InitialCitationCount  IndirectCitationCount
36165   110055  2001                   5.0                   3080
29755     7170  2000                  78.0                   2246
32469   101126  2001                  60.0                   2034
38473   203048  2002                  14.0                   2013
34532   106048  2001                 220.0                   1858
37926   201253  2002                  39.0                   1852
34662   106178  2001                   4.0                   1667
30272     8241  2000                  39.0                   1642
35553   108172  2001                  36.0                   1586
29780     7195  2000                  10.0                   1538
