# Imports and read dataset

In [1]:
import pandas as pd
import numpy as np
import re
import json
import dedupe
from typing import Union
from tqdm import tqdm
from collections import Counter
from datetime import datetime
from thefuzz import fuzz
import warnings
warnings.filterwarnings('ignore')

import plotly_express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
pio.renderers.default = "iframe"

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import dash_core_components as dcc


import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import remove_stopwords, strip_tags, strip_punctuation, preprocess_string, STOPWORDS
custom_text_filters = [lambda x: x.lower(),
                       lambda x: x.replace('e.g.', ''),
                       lambda x: x.replace('i.e', ''),
                       lambda x: x.replace('paper', ''),
                       lambda x: x.replace('real', ''),
                       lambda x: x.replace('world', ''),
                       lambda x: x.replace('https', ''),
                       lambda x: x.replace('github', ''),
                       lambda x: x.replace('com', ''),
                       lambda x: x.replace('state', ''),
                       lambda x: x.replace('art', ''),
                       lambda x: x.replace('ieee', ''),
                       strip_tags,strip_punctuation,remove_stopwords]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ajaylakkegowda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ajaylakkegowda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
category_prefixes = ['stat.','cs.']
papers = []
with open('arxiv-metadata-oai-snapshot.json', 'r') as file_data:
    for data in tqdm(file_data):
        papers.append(json.loads(data))
arxiv_papers = pd.DataFrame(papers)

2318918it [01:57, 19812.87it/s] 


# Data Preprocessing for arxiv papers

In [3]:
arxiv_papers.drop(['comments', 'journal-ref', 'report-no', 'license'], inplace=True, axis = 1)
arxiv_papers = arxiv_papers[arxiv_papers['categories'].apply(lambda x: any(x.startswith(prefix) for prefix in category_prefixes))]

In [4]:
arxiv_papers.head()

Unnamed: 0,id,submitter,authors,title,doi,categories,abstract,versions,update_date,authors_parsed
46,704.0047,Igor Grabec,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,,cs.NE cs.AI,The intelligent acoustic emission locator is...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-09-29,"[[Kosel, T., ], [Grabec, I., ]]"
49,704.005,Igor Grabec,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,,cs.NE cs.AI,Part I describes an intelligent acoustic emi...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2007-05-23,"[[Kosel, T., ], [Grabec, I., ]]"
61,704.0062,Tom\'a\v{s} Vina\v{r},"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,10.1007/978-3-540-74126-8_23,cs.DS,"In this paper, we introduce the on-line Vite...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2010-01-25,"[[Šrámek, Rastislav, ], [Brejová, Broňa, ], [V..."
89,704.009,Lester Ingber,Lester Ingber,Real Options for Project Schedules (ROPS),,cs.CE cond-mat.stat-mech cs.MS cs.NA physics.d...,Real Options for Project Schedules (ROPS) ha...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2007-05-23,"[[Ingber, Lester, ]]"
97,704.0098,Jack Raymond,"Jack Raymond, David Saad",Sparsely-spread CDMA - a statistical mechanics...,10.1088/1751-8113/40/41/004,cs.IT math.IT,"Sparse Code Division Multiple Access (CDMA),...","[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-11-13,"[[Raymond, Jack, ], [Saad, David, ]]"


In [5]:
#DataFrame summary
arxiv_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 486484 entries, 46 to 2072139
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              486484 non-null  object
 1   submitter       486356 non-null  object
 2   authors         486484 non-null  object
 3   title           486484 non-null  object
 4   doi             82245 non-null   object
 5   categories      486484 non-null  object
 6   abstract        486484 non-null  object
 7   versions        486484 non-null  object
 8   update_date     486484 non-null  object
 9   authors_parsed  486484 non-null  object
dtypes: object(10)
memory usage: 40.8+ MB


In [6]:
#Number of missing values per column
arxiv_papers.isnull().sum()

id                     0
submitter            128
authors                0
title                  0
doi               404239
categories             0
abstract               0
versions               0
update_date            0
authors_parsed         0
dtype: int64

In [7]:
# Remove duplicates
arxiv_papers.drop_duplicates(subset=['id'], inplace=True)

In [8]:
# Handle missing values
arxiv_papers.dropna(subset=['title', 'abstract', 'update_date'], inplace=True)

In [9]:
arxiv_papers.reset_index(drop=True, inplace=True)

In [10]:
#Number of missing values per column
arxiv_papers.isnull().sum()

id                     0
submitter            128
authors                0
title                  0
doi               404239
categories             0
abstract               0
versions               0
update_date            0
authors_parsed         0
dtype: int64

In [11]:
arxiv_papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486484 entries, 0 to 486483
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              486484 non-null  object
 1   submitter       486356 non-null  object
 2   authors         486484 non-null  object
 3   title           486484 non-null  object
 4   doi             82245 non-null   object
 5   categories      486484 non-null  object
 6   abstract        486484 non-null  object
 7   versions        486484 non-null  object
 8   update_date     486484 non-null  object
 9   authors_parsed  486484 non-null  object
dtypes: object(10)
memory usage: 37.1+ MB


In [12]:
arxiv_papers.versions[0]

[{'version': 'v1', 'created': 'Sun, 1 Apr 2007 13:06:50 GMT'}]

In [13]:
arxiv_papers['date'] = arxiv_papers['versions'].str[0].str.get('created')
arxiv_papers['date'] = pd.to_datetime(arxiv_papers.date, infer_datetime_format=True)
arxiv_papers['year'] = arxiv_papers['date'].dt.year

In [14]:
arxiv_papers.head()

Unnamed: 0,id,submitter,authors,title,doi,categories,abstract,versions,update_date,authors_parsed,date,year
0,704.0047,Igor Grabec,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,,cs.NE cs.AI,The intelligent acoustic emission locator is...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-09-29,"[[Kosel, T., ], [Grabec, I., ]]",2007-04-01 13:06:50,2007
1,704.005,Igor Grabec,T. Kosel and I. Grabec,Intelligent location of simultaneously active ...,,cs.NE cs.AI,Part I describes an intelligent acoustic emi...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2007-05-23,"[[Kosel, T., ], [Grabec, I., ]]",2007-04-01 18:53:13,2007
2,704.0062,Tom\'a\v{s} Vina\v{r},"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,10.1007/978-3-540-74126-8_23,cs.DS,"In this paper, we introduce the on-line Vite...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2010-01-25,"[[Šrámek, Rastislav, ], [Brejová, Broňa, ], [V...",2007-03-31 23:52:33,2007
3,704.009,Lester Ingber,Lester Ingber,Real Options for Project Schedules (ROPS),,cs.CE cond-mat.stat-mech cs.MS cs.NA physics.d...,Real Options for Project Schedules (ROPS) ha...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2007-05-23,"[[Ingber, Lester, ]]",2007-04-01 14:35:40,2007
4,704.0098,Jack Raymond,"Jack Raymond, David Saad",Sparsely-spread CDMA - a statistical mechanics...,10.1088/1751-8113/40/41/004,cs.IT math.IT,"Sparse Code Division Multiple Access (CDMA),...","[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2009-11-13,"[[Raymond, Jack, ], [Saad, David, ]]",2007-04-01 18:27:26,2007


In [15]:
arxiv_papers.sort_values(by='date', inplace=True)
# Find and display the earliest and latest creation dates in the 'papers' DataFrame
earliest_date = arxiv_papers['date'].iloc[0]
latest_date = arxiv_papers['date'].iloc[-1]
print(f"Earliest creation date: {earliest_date}\nLatest creation date: {latest_date}")

Earliest creation date: 1990-01-01 00:00:00
Latest creation date: 2023-08-31 17:59:46


# Data Preprocessing for Socpus Papers

In [16]:
scopus_papers = pd.read_excel('scopus.xls')

In [17]:
#DataFrame summary
scopus_papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19539 entries, 0 to 19538
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   authors    19053 non-null  object
 1   title      19539 non-null  object
 2   year       19539 non-null  int64 
 3   doi        14347 non-null  object
 4   abstract   19539 non-null  object
 5   submitter  17566 non-null  object
 6   id         19539 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


In [18]:
#Number of missing values per column
scopus_papers.isnull().sum()

authors       486
title           0
year            0
doi          5192
abstract        0
submitter    1973
id              0
dtype: int64

In [19]:
scopus_papers.reset_index(drop=True, inplace=True)
scopus_papers.head()

Unnamed: 0,authors,title,year,doi,abstract,submitter,id
0,Kaur G.; Sharma A.,HAS: Hybrid Analysis of Sentiments for the per...,2023,10.1007/s12652-022-03748-6,The reviews posted online by the end-users can...,Springer Science and Business Media Deutschlan...,2-s2.0-85124830704
1,Hu S.; Zhang H.; Zhang W.,Domain Knowledge Graph Question Answering Base...,2023,10.3390/app13158838,Information retrieval-based question answering...,Multidisciplinary Digital Publishing Institute...,2-s2.0-85167883197
2,Lai Z.; Wei K.; Fu Y.; H√§rtel P.; Heide F.,Œ¥-Prox: Differentiable Proximal Algorithm Mod...,2023,10.1145/3592144,Tasks across diverse application domains can b...,Association for Computing Machinery,2-s2.0-85167400308
3,Wang S.; Zhang Y.; Shi W.; Zhang G.; Zhang J.;...,A large dataset of semantic ratings and its co...,2023,10.1038/s41597-023-01995-6,Evidence from psychology and cognitive neurosc...,Nature Research,2-s2.0-85148814027
4,Li F.; Wang Y.; Jiang J.; Zhang H.; Wang X.; C...,Heterogeneous acceleration algorithms for shal...,2023,10.1016/j.future.2023.04.021,The physical process of atmospheric cumulus co...,Elsevier B.V.,2-s2.0-85157998857


# Data Preprocessing for DBLP papers

In [20]:
dblp_papers = pd.read_excel('dblp.xls')

In [21]:
#DataFrame summary
dblp_papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1241 entries, 0 to 1240
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1241 non-null   int64 
 1   authors    1240 non-null   object
 2   title      1241 non-null   object
 3   abstract   1241 non-null   object
 4   submitter  1238 non-null   object
 5   year       1241 non-null   int64 
 6   doi        1131 non-null   object
dtypes: int64(2), object(5)
memory usage: 68.0+ KB


In [22]:
#Number of missing values per column
dblp_papers.isnull().sum()

id             0
authors        1
title          0
abstract       0
submitter      3
year           0
doi          110
dtype: int64

In [23]:
dblp_papers.reset_index(drop=True, inplace=True)
dblp_papers.head()

Unnamed: 0,id,authors,title,abstract,submitter,year,doi
0,209840,Yinlin Deng;Chunqiu Steven Xia;Haoran Peng;Che...,Large Language Models Are Zero-Shot Fuzzers: F...,Large Language Models Are Zero-Shot Fuzzers: F...,ISSTA,2023,10.1145/3597926.3598067
1,151722,Sherzod Hakimov;David Schlangen,Images in Language Space: Exploring the Suitab...,Images in Language Space: Exploring the Suitab...,ACL,2023,10.18653/V1/2023.FINDINGS-ACL.894
2,257780,Michael Wornow;Yizhe Xu;Rahul Thapa;Birju S. P...,The Shaky Foundations of Clinical Foundation M...,The Shaky Foundations of Clinical Foundation M...,CoRR,2023,10.48550/ARXIV.2303.12961
3,273908,Yueting Yang;Xintong Zhang;Wenjuan Han,Enhance Reasoning Ability of Visual-Language M...,Enhance Reasoning Ability of Visual-Language M...,CoRR,2023,10.48550/ARXIV.2305.13267
4,274189,Sherzod Hakimov;David Schlangen,Images in Language Space: Exploring the Suitab...,Images in Language Space: Exploring the Suitab...,CoRR,2023,10.48550/ARXIV.2305.13782


# Data Preprocessing for all papers

In [24]:
papers = pd.concat([arxiv_papers, scopus_papers, dblp_papers], ignore_index=True)

In [25]:
papers.tail()

Unnamed: 0,id,submitter,authors,title,doi,categories,abstract,versions,update_date,authors_parsed,date,year
507259,776950,CoRR,Shailja Thakur;Baleegh Ahmad;Zhenxing Fan;Hamm...,Benchmarking Large Language Models for Automat...,10.48550/ARXIV.2212.11140,,Benchmarking Large Language Models for Automat...,,,,NaT,2022
507260,777308,CoRR,Byung-Doh Oh;William Schuler,Why Does Surprisal From Larger Transformer-Bas...,10.48550/ARXIV.2212.12131,,Why Does Surprisal From Larger Transformer-Bas...,,,,NaT,2022
507261,777650,CoRR,Karan Singhal;Shekoofeh Azizi;Tao Tu;S. Sara M...,Large Language Models Encode Clinical Knowledge.,10.48550/ARXIV.2212.13138,,Large Language Models Encode Clinical Knowledge.,,,,NaT,2022
507262,777989,CoRR,Ashley Liew;Klaus Mueller,Using Large Language Models to Generate Engagi...,10.48550/ARXIV.2212.14047,,Using Large Language Models to Generate Engagi...,,,,NaT,2022
507263,778319,CoRR,Yinlin Deng;Chunqiu Steven Xia;Haoran Peng;Che...,Fuzzing Deep-Learning Libraries via Large Lang...,10.48550/ARXIV.2212.14834,,Fuzzing Deep-Learning Libraries via Large Lang...,,,,NaT,2022


In [26]:
len(papers)

507264

In [27]:

# Function to parse authors from the 'authors' column
def parse_authors(authors_str):
    authors_list = []
    if isinstance(authors_str, str):
        authors_parts = authors_str.split(';')
        for part in authors_parts:
            authors_list.append([part.strip()])
    return authors_list

# Apply the parsing function to 'authors' column where 'authors_parsed' is NaN
def parse_or_fill_authors(row):
    if isinstance(row['authors_parsed'], float):  # Check for NaN using isinstance
        return parse_authors(row['authors'])
    else:
        return row['authors_parsed']

# Apply the parsing or filling function to 'authors_parsed' column with tqdm
tqdm.pandas()  # Enable tqdm for pandas operations
papers['authors_parsed'] = papers.progress_apply(parse_or_fill_authors, axis=1)

# Function to remove duplicates from authors list
def remove_duplicates(authors_list):
    return [list(author) for author in set(tuple(author) for author in authors_list)]

# Apply the remove_duplicates function to 'authors_parsed' column with tqdm
papers['authors_parsed'] = papers['authors_parsed'].progress_apply(remove_duplicates)

# Display the modified DataFrame
papers.tail()

100%|█████████████████████████████████| 507264/507264 [01:36<00:00, 5243.98it/s]
100%|████████████████████████████████| 507264/507264 [00:37<00:00, 13422.21it/s]


Unnamed: 0,id,submitter,authors,title,doi,categories,abstract,versions,update_date,authors_parsed,date,year
507259,776950,CoRR,Shailja Thakur;Baleegh Ahmad;Zhenxing Fan;Hamm...,Benchmarking Large Language Models for Automat...,10.48550/ARXIV.2212.11140,,Benchmarking Large Language Models for Automat...,,,"[[Baleegh Ahmad], [Shailja Thakur], [Benjamin ...",NaT,2022
507260,777308,CoRR,Byung-Doh Oh;William Schuler,Why Does Surprisal From Larger Transformer-Bas...,10.48550/ARXIV.2212.12131,,Why Does Surprisal From Larger Transformer-Bas...,,,"[[Byung-Doh Oh], [William Schuler]]",NaT,2022
507261,777650,CoRR,Karan Singhal;Shekoofeh Azizi;Tao Tu;S. Sara M...,Large Language Models Encode Clinical Knowledge.,10.48550/ARXIV.2212.13138,,Large Language Models Encode Clinical Knowledge.,,,"[[Hyung Won Chung], [Tao Tu], [Nathan Scales],...",NaT,2022
507262,777989,CoRR,Ashley Liew;Klaus Mueller,Using Large Language Models to Generate Engagi...,10.48550/ARXIV.2212.14047,,Using Large Language Models to Generate Engagi...,,,"[[Klaus Mueller], [Ashley Liew]]",NaT,2022
507263,778319,CoRR,Yinlin Deng;Chunqiu Steven Xia;Haoran Peng;Che...,Fuzzing Deep-Learning Libraries via Large Lang...,10.48550/ARXIV.2212.14834,,Fuzzing Deep-Learning Libraries via Large Lang...,,,"[[Yinlin Deng], [Haoran Peng], [Chenyuan Yang]...",NaT,2022


In [28]:
lemmatizer = WordNetLemmatizer()
# Function to preprocess title and abstract
def preprocess_text(text):
    words = text.split()
    filtered_words = [word for word in words if word not in STOPWORDS]
    filtered_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    filtered_words = preprocess_string(' '.join(filtered_words), custom_text_filters)
    return ' '.join(filtered_words)


papers['processed_titles_abstracts'] = papers.apply(lambda row: preprocess_text(row['title'] + ' ' + row['abstract']), axis=1)

In [29]:
papers['processed_titles_abstracts'][0]

'nested satisfiability special case satisfiability problem clause hierarchical structure shown solvable linear time assuming clause represented convenient way'

In [30]:
papers.drop(['id', 'submitter', 'doi'], axis=1, inplace=True)

In [31]:
len(papers)

507264

In [32]:
papers.head(20)

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts
0,Donald E. Knuth,Nested satisfiability,cs.CC,A special case of the satisfiability problem...,"[{'version': 'v1', 'created': 'Mon, 1 Jan 1990...",2008-02-03,"[[Knuth, Donald E., ]]",1990-01-01,1990,nested satisfiability special case satisfiabil...
1,Donald E. Knuth,A note on digitized angles,cs.GR,We study the configurations of pixels that o...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 1990...",2008-02-03,"[[Knuth, Donald E., ]]",1990-04-01,1990,note digitized angle study configuration pixel...
2,Donald E. Knuth,Textbook examples of recursion,cs.CC,We discuss properties of recursive schemas r...,"[{'version': 'v1', 'created': 'Thu, 1 Aug 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-08-01,1991,textbook example recursion discus property rec...
3,Donald E. Knuth,Theory and practice,cs.GL,The author argues to Silicon Valley that the...,"[{'version': 'v1', 'created': 'Fri, 1 Nov 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-11-01,1991,theory practice author argues silicon valley i...
4,Donald E. Knuth,Context-free multilanguages,cs.DS,This article is a sketch of ideas that were ...,"[{'version': 'v1', 'created': 'Sun, 1 Dec 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-12-01,1991,context free multilanguages icle sketch idea i...
5,"Donald E. Knuth, Arvind Raghunathan",The problem of compatible representatives,cs.DS math.CO,The purpose of this note is to attach a name...,"[{'version': 'v1', 'created': 'Wed, 1 Jul 1992...",2008-02-03,"[[Raghunathan, Arvind, ], [Knuth, Donald E., ]]",1992-07-01,1992,problem patible representative purpose note at...
6,M. P. Wellman,A Market-Oriented Programming Environment and ...,cs.AI,Market price systems constitute a well-under...,"[{'version': 'v1', 'created': 'Sun, 1 Aug 1993...",2008-02-03,"[[Wellman, M. P., ]]",1993-08-01,1993,market oriented programming environment applic...
7,M. L. Ginsberg,Dynamic Backtracking,cs.AI,Because of their occasional need to return t...,"[{'version': 'v1', 'created': 'Sun, 1 Aug 1993...",2008-02-03,"[[Ginsberg, M. L., ]]",1993-08-01,1993,dynamic backtracking occasional need return sh...
8,"I. P. Gent, T. Walsh",An Empirical Analysis of Search in GSAT,cs.AI,We describe an extensive study of search in ...,"[{'version': 'v1', 'created': 'Wed, 1 Sep 1993...",2008-02-03,"[[Gent, I. P., ], [Walsh, T., ]]",1993-09-01,1993,empirical analysis search gsat extensive study...
9,"J. C. Schlimmer, L. A. Hermens",Software Agents: Completing Patterns and Const...,cs.AI,To support the goal of allowing users to rec...,"[{'version': 'v1', 'created': 'Mon, 1 Nov 1993...",2009-09-25,"[[Schlimmer, J. C., ], [Hermens, L. A., ]]",1993-11-01,1993,software agents pleting patterns constructing ...


In [33]:
papers.tail()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts
507259,Shailja Thakur;Baleegh Ahmad;Zhenxing Fan;Hamm...,Benchmarking Large Language Models for Automat...,,Benchmarking Large Language Models for Automat...,,,"[[Baleegh Ahmad], [Shailja Thakur], [Benjamin ...",NaT,2022,benchmarking large language models automated v...
507260,Byung-Doh Oh;William Schuler,Why Does Surprisal From Larger Transformer-Bas...,,Why Does Surprisal From Larger Transformer-Bas...,,,"[[Byung-Doh Oh], [William Schuler]]",NaT,2022,surprisal larger transformer based language mo...
507261,Karan Singhal;Shekoofeh Azizi;Tao Tu;S. Sara M...,Large Language Models Encode Clinical Knowledge.,,Large Language Models Encode Clinical Knowledge.,,,"[[Hyung Won Chung], [Tao Tu], [Nathan Scales],...",NaT,2022,large language models encode clinical knowledg...
507262,Ashley Liew;Klaus Mueller,Using Large Language Models to Generate Engagi...,,Using Large Language Models to Generate Engagi...,,,"[[Klaus Mueller], [Ashley Liew]]",NaT,2022,large language models generate engaging captio...
507263,Yinlin Deng;Chunqiu Steven Xia;Haoran Peng;Che...,Fuzzing Deep-Learning Libraries via Large Lang...,,Fuzzing Deep-Learning Libraries via Large Lang...,,,"[[Yinlin Deng], [Haoran Peng], [Chenyuan Yang]...",NaT,2022,fuzzing deep learning libraries large language...


In [34]:
papers[papers['year'] == 2016].tail()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts
501677,Yongsiriwit K.; Assy N.; Gaaloul W.,A semantic framework for configurable business...,,"With the advent of Cloud Computing, new opport...",,,"[[Gaaloul W.], [Assy N.], [Yongsiriwit K.]]",NaT,2016,semantic framework configurable business proce...
501678,Qi J.; Ohsawa Y.,Matrix-like visualization based on topic model...,,Interdisciplinary research is challenging beca...,,,"[[Ohsawa Y.], [Qi J.]]",NaT,2016,matrix like visualization based topic modeling...
501679,Wang X.; Zhu P.; Liu T.; Xu K.,BioTopic: A topic-driven biological literature...,,Biology and biomedicine are flourishing discip...,,,"[[Xu K.], [Liu T.], [Zhu P.], [Wang X.]]",NaT,2016,biotopic topic driven biological literature mi...
501681,George R.; Samuel P.,Particle swarm optimization method based consi...,,Unified Modeling Language models are the de fa...,,,"[[Samuel P.], [George R.]]",NaT,2016,picle swarm optimization method based consiste...
501682,Guadarrama S.; Rodner E.; Saenko K.; Darrell T.,Understanding object descriptions in robotics ...,,We address the problem of retrieving and detec...,,,"[[Guadarrama S.], [Rodner E.], [Darrell T.], [...",NaT,2016,understanding object description robotics open...


In [35]:
# Filter the 'papers' DataFrame to include data between '2010' and '2023'
papers_2017_2023 = papers.loc[(papers['year'] >= 2017) & (papers['year'] <= 2023)]

# Number of all research paper submissions per year

In [38]:

# Group the 'papers' DataFrame by year and calculate the number of research paper submissions per year
yearly_papers_submission_count = papers.groupby(papers['year']).size().reset_index().rename(columns={0:"count"})

fig = px.line(x="year",
              y="count",
              data_frame=yearly_papers_submission_count,title="Number of research paper submissions per year",
              labels={"date": "year","count": "number of papers submitted"})
fig.update_traces(textposition="bottom right")
fig.update_layout(xaxis = dict(dtick = 1, tickangle=-45), title_x=0.5)
fig.show()

# Number of submissions containing large language model per year

In [39]:
def check_terms_in_abstract_title(abstract_title):
    abstract_title_lower = abstract_title.lower()
    
    pattern_1 = r"(large|big|massive)\s+language\s+(model|models)"
    pattern_2 = r"\b(ChatGPT|BERT|GPT|LLM|LLMS)\b"
    
    combined_pattern = f"({pattern_1})|({pattern_2})"
    
    return re.search(combined_pattern, abstract_title_lower, re.IGNORECASE) is not None
    
def check_non_llm_paper(abstract_title):
    return not check_terms_in_abstract_title(abstract_title)

In [40]:
llm_papers = papers_2017_2023[(papers_2017_2023['processed_titles_abstracts']).apply(check_terms_in_abstract_title)]
non_llm_papers = papers_2017_2023[papers_2017_2023['processed_titles_abstracts'].apply(check_non_llm_paper)]
# Initialize an empty dictionary to store counts of papers containing 'large language model' per year
count_papers = {}

# Use tqdm to display a progress bar while processing the DataFrame
tqdm.pandas(desc="Processing papers containing 'large language model'")

yearly_llm_submission = llm_papers.groupby('year').size().reset_index(name='count of large language model')

# Plot the number of submissions containing 'large language model' per year
fig = px.line(x="year",y="count of large language model",data_frame=yearly_llm_submission,title="Number of large language model research papers submissions per year", text="count of large language model")
fig.update_traces(textposition="bottom right")
fig.update_layout(xaxis = dict(dtick = 1), title_x=0.5)
fig.show()

# Researcher Profiling Data Preparation 

In [41]:
year_to_print = 2016

# Filter llm_papers for the specified year and print the full abstracts
for authors_parsed in llm_papers[llm_papers['year'] == year_to_print]['authors_parsed']:
    print(authors_parsed)

In [42]:
papers.tail()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts
507259,Shailja Thakur;Baleegh Ahmad;Zhenxing Fan;Hamm...,Benchmarking Large Language Models for Automat...,,Benchmarking Large Language Models for Automat...,,,"[[Baleegh Ahmad], [Shailja Thakur], [Benjamin ...",NaT,2022,benchmarking large language models automated v...
507260,Byung-Doh Oh;William Schuler,Why Does Surprisal From Larger Transformer-Bas...,,Why Does Surprisal From Larger Transformer-Bas...,,,"[[Byung-Doh Oh], [William Schuler]]",NaT,2022,surprisal larger transformer based language mo...
507261,Karan Singhal;Shekoofeh Azizi;Tao Tu;S. Sara M...,Large Language Models Encode Clinical Knowledge.,,Large Language Models Encode Clinical Knowledge.,,,"[[Hyung Won Chung], [Tao Tu], [Nathan Scales],...",NaT,2022,large language models encode clinical knowledg...
507262,Ashley Liew;Klaus Mueller,Using Large Language Models to Generate Engagi...,,Using Large Language Models to Generate Engagi...,,,"[[Klaus Mueller], [Ashley Liew]]",NaT,2022,large language models generate engaging captio...
507263,Yinlin Deng;Chunqiu Steven Xia;Haoran Peng;Che...,Fuzzing Deep-Learning Libraries via Large Lang...,,Fuzzing Deep-Learning Libraries via Large Lang...,,,"[[Yinlin Deng], [Haoran Peng], [Chenyuan Yang]...",NaT,2022,fuzzing deep learning libraries large language...


In [43]:
papers[papers['year'] == 2016].tail()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts
501677,Yongsiriwit K.; Assy N.; Gaaloul W.,A semantic framework for configurable business...,,"With the advent of Cloud Computing, new opport...",,,"[[Gaaloul W.], [Assy N.], [Yongsiriwit K.]]",NaT,2016,semantic framework configurable business proce...
501678,Qi J.; Ohsawa Y.,Matrix-like visualization based on topic model...,,Interdisciplinary research is challenging beca...,,,"[[Ohsawa Y.], [Qi J.]]",NaT,2016,matrix like visualization based topic modeling...
501679,Wang X.; Zhu P.; Liu T.; Xu K.,BioTopic: A topic-driven biological literature...,,Biology and biomedicine are flourishing discip...,,,"[[Xu K.], [Liu T.], [Zhu P.], [Wang X.]]",NaT,2016,biotopic topic driven biological literature mi...
501681,George R.; Samuel P.,Particle swarm optimization method based consi...,,Unified Modeling Language models are the de fa...,,,"[[Samuel P.], [George R.]]",NaT,2016,picle swarm optimization method based consiste...
501682,Guadarrama S.; Rodner E.; Saenko K.; Darrell T.,Understanding object descriptions in robotics ...,,We address the problem of retrieving and detec...,,,"[[Guadarrama S.], [Rodner E.], [Darrell T.], [...",NaT,2016,understanding object description robotics open...


In [44]:
# Calculate the number of authors
papers['num_authors'] = papers['authors_parsed'].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

# Flatten and format authors' names
papers['authors'] = papers['authors_parsed'].apply(lambda authors: [(" ".join(author)).strip() for author in authors] if isinstance(authors, list) else [])

papers.tail()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts,num_authors
507259,"[Baleegh Ahmad, Shailja Thakur, Benjamin Tan, ...",Benchmarking Large Language Models for Automat...,,Benchmarking Large Language Models for Automat...,,,"[[Baleegh Ahmad], [Shailja Thakur], [Benjamin ...",NaT,2022,benchmarking large language models automated v...,8
507260,"[Byung-Doh Oh, William Schuler]",Why Does Surprisal From Larger Transformer-Bas...,,Why Does Surprisal From Larger Transformer-Bas...,,,"[[Byung-Doh Oh], [William Schuler]]",NaT,2022,surprisal larger transformer based language mo...,2
507261,"[Hyung Won Chung, Tao Tu, Nathan Scales, S. Sa...",Large Language Models Encode Clinical Knowledge.,,Large Language Models Encode Clinical Knowledge.,,,"[[Hyung Won Chung], [Tao Tu], [Nathan Scales],...",NaT,2022,large language models encode clinical knowledg...,8
507262,"[Klaus Mueller, Ashley Liew]",Using Large Language Models to Generate Engagi...,,Using Large Language Models to Generate Engagi...,,,"[[Klaus Mueller], [Ashley Liew]]",NaT,2022,large language models generate engaging captio...,2
507263,"[Yinlin Deng, Haoran Peng, Chenyuan Yang, Ling...",Fuzzing Deep-Learning Libraries via Large Lang...,,Fuzzing Deep-Learning Libraries via Large Lang...,,,"[[Yinlin Deng], [Haoran Peng], [Chenyuan Yang]...",NaT,2022,fuzzing deep learning libraries large language...,5


In [45]:
papers.head()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts,num_authors
0,[Knuth Donald E.],Nested satisfiability,cs.CC,A special case of the satisfiability problem...,"[{'version': 'v1', 'created': 'Mon, 1 Jan 1990...",2008-02-03,"[[Knuth, Donald E., ]]",1990-01-01,1990,nested satisfiability special case satisfiabil...,1
1,[Knuth Donald E.],A note on digitized angles,cs.GR,We study the configurations of pixels that o...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 1990...",2008-02-03,"[[Knuth, Donald E., ]]",1990-04-01,1990,note digitized angle study configuration pixel...,1
2,[Knuth Donald E.],Textbook examples of recursion,cs.CC,We discuss properties of recursive schemas r...,"[{'version': 'v1', 'created': 'Thu, 1 Aug 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-08-01,1991,textbook example recursion discus property rec...,1
3,[Knuth Donald E.],Theory and practice,cs.GL,The author argues to Silicon Valley that the...,"[{'version': 'v1', 'created': 'Fri, 1 Nov 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-11-01,1991,theory practice author argues silicon valley i...,1
4,[Knuth Donald E.],Context-free multilanguages,cs.DS,This article is a sketch of ideas that were ...,"[{'version': 'v1', 'created': 'Sun, 1 Dec 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-12-01,1991,context free multilanguages icle sketch idea i...,1


# Top Authors

### Top 30 Popular Authors in preprint platform

In [46]:
# Flatten the authors list
authors_list = [author for authors in papers['authors'] for author in authors]
# Create a DataFrame from the flattened list
authors_df = pd.DataFrame({'authors': authors_list})
# Group and count papers by authors
papers_by_authors = authors_df.groupby(['authors']).size().reset_index(name='Number of Papers Published')
papers_by_authors = papers_by_authors.sort_values("Number of Papers Published", ascending=False).head(30)
print(papers_by_authors.head())
# Plot the bar chart using Plotly Express
fig = px.bar(papers_by_authors.sort_values("Number of Papers Published", ascending=True),
             x="Number of Papers Published", y="authors",
             title="Top 30 Popular Authors in preprint platform", orientation="h")
fig.update_layout(
    title_x=0.5,
    yaxis=dict(automargin=True),
    height=900
)
fig.show()

                authors  Number of Papers Published
274933         Liu Yang                         860
366734  Poor H. Vincent                         671
530329        Zhang Rui                         579
456329      Tao Dacheng                         539
490099         Wang Wei                         513


### Top 30 Popular Authors on large language model

In [47]:
llm_papers['num_authors'] = llm_papers['authors_parsed'].apply(lambda x:len(x))
llm_papers['authors'] = llm_papers['authors_parsed'].apply(lambda authors:[(" ".join(author)).strip() for author in authors])
llm_papers.head()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts,num_authors
118707,"[Vogel Stephan, Sajjad Hassan, Dalvi Fahim, Du...",QCRI Machine Translation Systems for IWSLT 16,cs.CL,This paper describes QCRI's machine translat...,"[{'version': 'v1', 'created': 'Sat, 14 Jan 201...",2017-01-17,"[[Vogel, Stephan, ], [Sajjad, Hassan, ], [Dalv...",2017-01-14 14:18:54,2017,qcri machine translation systems iwslt 16 desc...,4
119308,"[Hinton Geoffrey, Le Quoc, Davis Andy, Mirhose...",Outrageously Large Neural Networks: The Sparse...,cs.LG cs.CL cs.NE stat.ML,The capacity of a neural network to absorb i...,"[{'version': 'v1', 'created': 'Mon, 23 Jan 201...",2017-01-24,"[[Hinton, Geoffrey, ], [Le, Quoc, ], [Davis, A...",2017-01-23 18:10:00,2017,outrageously large neural networks sparsely ga...,7
141668,"[Bennett Erin D., Nie Allen, Goodman Noah D.]",DisSent: Sentence Representation Learning from...,cs.CL cs.AI,Learning effective representations of senten...,"[{'version': 'v1', 'created': 'Thu, 12 Oct 201...",2019-06-05,"[[Bennett, Erin D., ], [Nie, Allen, ], [Goodma...",2017-10-12 00:56:13,2017,dissent sentence representation learning expli...,3
164684,"[Reitter David, Ororbia Alexander G., Mali Ank...",Like a Baby: Visually Situated Neural Language...,cs.CL cs.AI,We examine the benefits of visual context in...,"[{'version': 'v1', 'created': 'Tue, 29 May 201...",2019-06-05,"[[Reitter, David, ], [Ororbia, Alexander G., ]...",2018-05-29 15:53:30,2018,like baby visually situated neural language ac...,4
167966,"[Puchinger Sven, Renner Julian, Wachter-Zeh An...",Twisted Gabidulin Codes in the GPT Cryptosystem,cs.IT cs.CR math.IT,"In this paper, we investigate twisted Gabidu...","[{'version': 'v1', 'created': 'Tue, 26 Jun 201...",2018-08-15,"[[Puchinger, Sven, ], [Renner, Julian, ], [Wac...",2018-06-26 15:14:43,2018,twisted gabidulin codes gpt cryptosystem inves...,3


In [48]:
# Flatten the authors list
llm_authors_list = [author for authors in llm_papers['authors'] for author in authors]

# Create a DataFrame from the flattened list
authors_df = pd.DataFrame({'author': llm_authors_list})

# Group and count papers by authors
all_llm_top_authors = authors_df.groupby(['author']).size().reset_index(name='Number of Papers Published')
llm_top_authors = all_llm_top_authors.sort_values("Number of Papers Published", ascending=False).reset_index().iloc[0:30]
llm_top_100_authors = all_llm_top_authors.sort_values("Number of Papers Published", ascending=False).reset_index().iloc[0:100]
llm_top_authors.drop('index', axis=1, inplace=True)
# Plot the bar chart using Plotly Express
fig = px.bar(llm_top_authors.sort_values("Number of Papers Published", ascending=True),
             x="Number of Papers Published", y="author",
             title="Top 30 Popular Authors on large language model", orientation="h")
fig.update_layout(
    title_x=0.5,
    yaxis=dict(automargin=True),
    height=900
)
fig.show()
print(llm_top_authors)


               author  Number of Papers Published
0            Zhang Y.                          49
1    Zettlemoyer Luke                          43
2          Qiu Xipeng                          41
3          Choi Yejin                          40
4            Wei Furu                          40
5        Gao Jianfeng                          38
6               Li Y.                          35
7         Liu Zhiyuan                          34
8           Lin Jimmy                          33
9            Liu Yang                          33
10          Zhang Yue                          33
11          Jiang Xin                          33
12            Liu Qun                          33
13     Huang Xuanjing                          31
14      Xiong Caiming                          30
15      Smith Noah A.                          30
16          Ren Xiang                          30
17          Huang Fei                          29
18           Li Xiang                          29


In [49]:
# Flatten the authors list
authors_list = [author for authors in papers['authors'] for author in authors]

# Create a DataFrame from the flattened list
authors_df = pd.DataFrame({'authors': authors_list})

# Group and count papers by authors
papers_by_authors = authors_df.groupby(['authors']).size().reset_index(name='Number of Papers Published')
papers_by_authors = papers_by_authors.sort_values("Number of Papers Published", ascending=False).head(30)

# Plot the bar chart using Plotly Express
fig = px.bar(papers_by_authors.sort_values("Number of Papers Published", ascending=True),
             x="Number of Papers Published", y="authors",
             title="Top 30 Popular Authors in preprint platform", orientation="h")
fig.update_layout(
    title_x=0.5,
    yaxis=dict(automargin=True),
    height=900
)
fig.show()

### Top 5 authors and their publication count on large language model each year

In [50]:
authors_years = []
for index, row in llm_papers.iterrows():
    authors = row['authors']
    authors_years.extend([(author, row['year']) for author in authors])

# Create a DataFrame from the flattened list
authors_df = pd.DataFrame(authors_years, columns=['author', 'year'])

# Group and count papers by authors and year
llm_top_authors_by_year = authors_df.groupby(['author', 'year']).size().reset_index(name='Number of Papers Published')

# Get the top 5 authors for each year
top_authors_by_year = (
    llm_top_authors_by_year.sort_values(['year', 'Number of Papers Published'], ascending=[True, False])
    .groupby('year')
    .head(5)
    .reset_index(drop=True)
)

top_authors_by_year = pd.DataFrame(top_authors_by_year)

# Create a bar graph using Plotly Express
fig = px.bar(top_authors_by_year, x='Number of Papers Published', y='author', color='year',
             title='Top 5 authors and their publication count on large language model each year', orientation='h', text="Number of Papers Published")
fig.update_layout(yaxis={"dtick":1},coloraxis={"colorbar":{"dtick":1}},title_x=0.5,height=1100)
fig.update_traces(textangle=0)
fig.show()
print(len(llm_top_authors_by_year))

44837


# Key words used in the titles and abstract of the LLM papers published

In [51]:
# Generating bigrams
all_llm_bigrams = ["_".join(bigram) for title in llm_papers['processed_titles_abstracts'] for bigram in ngrams(nltk.word_tokenize(title), 2)]

topn = 50

# Counting bigram frequencies
llm_bigram_counter = Counter(all_llm_bigrams)
top_llm_bigrams = llm_bigram_counter.most_common(topn)

# Creating a DataFrame for top bigrams
top_llm_bigrams_df = pd.DataFrame(top_llm_bigrams, columns=['words', 'Frequency'])

# Creating a horizontal bar chart using Plotly
fig = px.bar(data_frame=top_llm_bigrams_df.sort_values("Frequency", ascending=True),
             x="Frequency", y="words", orientation="h",
             title="Top " + str(topn) + " Bigrams in LLM Papers")

fig.update_layout(yaxis={"dtick":1},title_x=0.5,height=900)
fig.update_traces(textangle=0)
fig.show()

In [52]:
all_llm_trigrams = ["_".join(trigram) for title in llm_papers['processed_titles_abstracts'] for trigram in ngrams(title.replace(":", "").split(), 3)]

topn = 50

# Counting trigram frequencies
llm_trigram_counter = Counter(all_llm_trigrams)
top_llm_trigrams = llm_trigram_counter.most_common(topn)

# Creating a DataFrame for top trigrams
top_llm_trigrams_df = pd.DataFrame(top_llm_trigrams, columns=['words', 'Frequency'])

# Creating a horizontal bar chart using Plotly
fig = px.bar(data_frame=top_llm_trigrams_df.sort_values("Frequency", ascending=True),
             x="Frequency", 
             y="words", 
             orientation="h",
             title="Top " + str(topn) + " Trigrams in Papers")

fig.update_layout(yaxis={"dtick":1},title_x=0.5,height=900)
fig.update_traces(textangle=0)
fig.show()
top_llm_trigrams_df.head()

Unnamed: 0,words,Frequency
0,large_language_models,4706
1,large_language_model,3450
2,natural_language_processing,1796
3,pre_trained_language,1343
4,language_model_llms,1208


# Researcher Profiling 

In [53]:
# Generating bigrams
all_bigrams = ["_".join(bigram) for title in papers['processed_titles_abstracts'] for bigram in ngrams(title.split(), 2)]

topn = 50

bigram_counter = Counter(all_bigrams)
top_all_bigrams = bigram_counter.most_common(topn)

# Creating a DataFrame for top bigrams
top_bigrams_df = pd.DataFrame(top_all_bigrams, columns=['words', 'Frequency'])

top_bigrams_df.head(10)


Unnamed: 0,words,Frequency
0,neural_network,79231
1,machine_learning,54065
2,deep_learning,47917
3,large_scale,36236
4,proposed_method,30588
5,reinforcement_learning,29701
6,experimental_result,29385
7,neural_networks,28998
8,propose_novel,25901
9,natural_language,24382


In [54]:
# Generating trigrams
all_trigrams = ["_".join(trigram) for title in papers['processed_titles_abstracts'] for trigram in ngrams(nltk.word_tokenize(title), 3)]

topn = 50

# Counting trigram frequencies
trigram_counter = Counter(all_trigrams)
top_all_trigrams = trigram_counter.most_common(topn)

# Creating DataFrames for top bigrams and trigrams
top_trigrams_df = pd.DataFrame(top_all_trigrams, columns=['words', 'Frequency'])

# Display the top 100 trigrams
top_trigrams_df.head(10)

Unnamed: 0,words,Frequency
0,deep_neural_network,14940
1,convolutional_neural_network,13550
2,natural_language_processing,10836
3,deep_reinforcement_learning,6204
4,deep_neural_networks,6085
5,deep_learning_model,5737
6,deep_learning_based,5718
7,machine_learning_model,5653
8,convolutional_neural_networks,5334
9,recurrent_neural_network,4770


In [55]:
papers.head()

Unnamed: 0,authors,title,categories,abstract,versions,update_date,authors_parsed,date,year,processed_titles_abstracts,num_authors
0,[Knuth Donald E.],Nested satisfiability,cs.CC,A special case of the satisfiability problem...,"[{'version': 'v1', 'created': 'Mon, 1 Jan 1990...",2008-02-03,"[[Knuth, Donald E., ]]",1990-01-01,1990,nested satisfiability special case satisfiabil...,1
1,[Knuth Donald E.],A note on digitized angles,cs.GR,We study the configurations of pixels that o...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 1990...",2008-02-03,"[[Knuth, Donald E., ]]",1990-04-01,1990,note digitized angle study configuration pixel...,1
2,[Knuth Donald E.],Textbook examples of recursion,cs.CC,We discuss properties of recursive schemas r...,"[{'version': 'v1', 'created': 'Thu, 1 Aug 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-08-01,1991,textbook example recursion discus property rec...,1
3,[Knuth Donald E.],Theory and practice,cs.GL,The author argues to Silicon Valley that the...,"[{'version': 'v1', 'created': 'Fri, 1 Nov 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-11-01,1991,theory practice author argues silicon valley i...,1
4,[Knuth Donald E.],Context-free multilanguages,cs.DS,This article is a sketch of ideas that were ...,"[{'version': 'v1', 'created': 'Sun, 1 Dec 1991...",2008-02-03,"[[Knuth, Donald E., ]]",1991-12-01,1991,context free multilanguages icle sketch idea i...,1


In [56]:
stop_words = set(stopwords.words('english'))
papers_titles = papers['title'].tolist()
titles = [title.lower() for title in papers_titles]  # Lowercasing the titles

# Function to remove stopwords
def remove_stop_words(title, stop_words):
    words = title.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

papers_titles = [remove_stop_words(title, stop_words) for title in papers_titles]

# Collaboration Trends in Research Papers

In [57]:
# Calculate the average number of authors per year
mean_authors_per_year = llm_papers.groupby('year')['num_authors'].mean().round().reset_index()

fig = px.bar(mean_authors_per_year,
             x='year',
             y='num_authors',
             labels={'year': 'Year', 'num_authors': 'Average Number of Authors'},
             text='num_authors')  # Set a color for the bars

# Update layout to add main title and subtitle
fig.update_layout(title_text='Average Number of Authors per Year ',  # Main title
                  title_x=0.5,  # Center the main title
                  xaxis=dict(tickmode='linear', tick0=0, dtick=1),  
                  title_font_size=12,  # Font size for the main title
                  title_pad_t=20,  # Padding at the top of the main title
                  title_pad_b=10,  # Padding at the bottom of the main title
                  title_y=0.95,  # Y-coordinate of the main title
                  title_xanchor='center',  # Anchor point for the main title
                  title_yanchor='top',  # Anchor point for the main title
                  font=dict(size=16),  # Font size for the subtitle
                  annotations=[dict(xref='paper', yref='paper', x=0.5, y=1.15,  # Subtitle position
                                    text='Collaboration Trends on LLM',  # Subtitle text
                                    showarrow=False)])  # Don't show arrow for subtitle annotation

fig.show()

In [58]:
# Calculate the percentage of papers with more than 3 authors per year
papers_with_multi_authors = (llm_papers[llm_papers['num_authors'] > 2]
                             .groupby('year')['num_authors'].count() / llm_papers.groupby('year')['num_authors'].count() * 100).round().reset_index()

# Create a bar plot for collaboration trends
fig = px.bar(papers_with_multi_authors,
             x='year',
             y='num_authors',
             labels={'year': 'Year', 'num_authors': 'Percentage of Papers'},
             text='num_authors')

# Update layout to add main title and subtitle
fig.update_layout(title_text='Percentage of Papers with More than 2 Authors per Year',  # Main title
                  title_x=0.5,  # Center the main title
                  xaxis=dict(tickmode='linear', tick0=0, dtick=1),  
                  title_font_size=12,  # Font size for the main title
                  title_pad_t=20,  # Padding at the top of the main title
                  title_pad_b=10,  # Padding at the bottom of the main title
                  title_y=0.95,  # Y-coordinate of the main title
                  title_xanchor='center',  # Anchor point for the main title
                  title_yanchor='top',  # Anchor point for the main title
                  font=dict(size=14),  # Font size for the subtitle
                  annotations=[dict(xref='paper', yref='paper', x=0.5, y=1.15,  # Subtitle position
                                    text='Collaboration Trends on LLM',  # Subtitle text
                                    showarrow=False)])  # Don't show arrow for subtitle annotation

fig.show()


# Researcher's Bigram and Trigram

In [59]:
top_20_authors = llm_top_authors['author']
non_llm_publications = non_llm_papers[non_llm_papers['authors'].apply(lambda authors: any(author for author in top_20_authors))]

# SUCCESS Initialize lists to store bigrams and trigrams with their years
bigram_data = []
trigram_data = []

# Function to extract ngrams from text
def extract_ngrams(text, n, year):
    tokens = word_tokenize(text.lower())
    n_grams = ngrams(tokens, n)
    return [(year, ' '.join(gram)) for gram in n_grams]

# Iterate over the DataFrame with tqdm for progress monitoring
for _, row in tqdm(non_llm_publications.iterrows(), total=len(non_llm_publications)):
    title_abstract = row['processed_titles_abstracts']
    year = row['year']
    
    # Extract bigrams and trigrams
    bigram_data.extend(extract_ngrams(title_abstract, 2, year))
    trigram_data.extend(extract_ngrams(title_abstract, 3, year))

# Calculate the frequency of bigrams and trigrams with years
bigram_freq = Counter(bigram_data)
trigram_freq = Counter(trigram_data)

# Create DataFrames for bigrams and trigrams with years
bigram_df = pd.DataFrame(list(bigram_freq.items()), columns=['(year, word)', 'frequency'])
bigram_df[['year', 'word']] = pd.DataFrame(bigram_df['(year, word)'].tolist(), index=bigram_df.index)
bigram_df.drop(columns=['(year, word)'], inplace=True)

trigram_df = pd.DataFrame(list(trigram_freq.items()), columns=['(year, word)', 'frequency'])
trigram_df[['year', 'word']] = pd.DataFrame(trigram_df['(year, word)'].tolist(), index=trigram_df.index)
trigram_df.drop(columns=['(year, word)'], inplace=True)


# Sort the bigram DataFrame by frequency in descending order
bigram_df = bigram_df[bigram_df['frequency'] > 20].sort_values(by='frequency', ascending=False)

# Sort the trigram DataFrame by frequency in descending order
trigram_df = trigram_df[trigram_df['frequency'] > 20].sort_values(by='frequency', ascending=False)

# Display the results
print("Bigram DataFrame:")
print(bigram_df)

print("\n Trigram DataFrame:")
print(trigram_df)

100%|█████████████████████████████████| 371718/371718 [05:18<00:00, 1166.84it/s]


Bigram DataFrame:
          frequency  year              word
8333113       13022  2021    neural network
11265640      12552  2022    neural network
5681238       12372  2020    neural network
3467841       10927  2019    neural network
11265028      10313  2022  machine learning
...             ...   ...               ...
177110           21  2017   prediction deep
177046           21  2017          rate wer
3670836          21  2019        given test
3498325          21  2019      driving task
8737834          21  2021    efficient code

[171241 rows x 3 columns]

 Trigram DataFrame:
          frequency  year                            word
16148173       2527  2021             deep neural network
10746521       2507  2020             deep neural network
6409682        2239  2019             deep neural network
2784630        2234  2018    convolutional neural network
22237000       2200  2022             deep neural network
...             ...   ...                             ...


In [60]:
bigram_df[bigram_df['year'] == 2016].nlargest(5, 'frequency')

Unnamed: 0,frequency,year,word


In [61]:
top_20_authors = llm_top_authors['author']
llm_publications = llm_papers[llm_papers['authors'].apply(lambda authors: any(author in authors for author in top_20_authors))]

# SUCCESS Initialize lists to store bigrams and trigrams with their years
llm_bigram_data = []
llm_trigram_data = []

# Function to extract ngrams from text
def extract_ngrams(text, n, year):
    tokens = word_tokenize(text.lower())
    n_grams = ngrams(tokens, n)
    return [(year, ' '.join(gram)) for gram in n_grams]

# Iterate over the DataFrame with tqdm for progress monitoring
for _, row in tqdm(llm_publications.iterrows(), total=len(llm_publications)):
    title_abstract = row['processed_titles_abstracts']
    year = row['year']
    
    # Extract bigrams and trigrams
    llm_bigram_data.extend(extract_ngrams(title_abstract, 2, year))
    llm_trigram_data.extend(extract_ngrams(title_abstract, 3, year))

# Calculate the frequency of bigrams and trigrams with years
llm_bigram_freq = Counter(llm_bigram_data)
llm_trigram_freq = Counter(llm_trigram_data)

# Create DataFrames for bigrams and trigrams with years
llm_bigram_df = pd.DataFrame(list(llm_bigram_freq.items()), columns=['(year, word)', 'frequency'])
llm_bigram_df[['year', 'word']] = pd.DataFrame(llm_bigram_df['(year, word)'].tolist(), index=llm_bigram_df.index)
llm_bigram_df.drop(columns=['(year, word)'], inplace=True)

llm_trigram_df = pd.DataFrame(list(llm_trigram_freq.items()), columns=['(year, word)', 'frequency'])
llm_trigram_df[['year', 'word']] = pd.DataFrame(llm_trigram_df['(year, word)'].tolist(), index=llm_trigram_df.index)
llm_trigram_df.drop(columns=['(year, word)'], inplace=True)


# Sort the bigram DataFrame by frequency in descending order
llm_bigram_df = llm_bigram_df.sort_values(by='frequency', ascending=False)

# Sort the trigram DataFrame by frequency in descending order
llm_trigram_df = llm_trigram_df.sort_values(by='frequency', ascending=False)

# Display the results
print("llm Bigram DataFrame:")
print(llm_bigram_df)

print("\n llm Trigram DataFrame:")
print(llm_trigram_df)

100%|█████████████████████████████████████████| 810/810 [00:28<00:00, 28.84it/s]


llm Bigram DataFrame:
       frequency  year               word
35468        335  2023     large language
35343        289  2023     language model
35469        196  2023    language models
23398        144  2022     language model
35355        110  2023   natural language
...          ...   ...                ...
26399          1  2022        tagging ner
26400          1  2022            ner lid
26401          1  2022        lid gluecos
26402          1  2022  gluecos benchmark
74004          1  2019     right reserved

[74005 rows x 3 columns]

 llm Trigram DataFrame:
       frequency  year                           word
41491        189  2023           large language model
41443        142  2023          large language models
41616        100  2023            language model llms
27505         45  2022           pre trained language
27409         43  2022           large language model
...          ...   ...                            ...
30299          1  2022                    d w

In [62]:
# Step 1: Calculate the frequency of large language model-related bigrams and trigrams for each year
years = range(2010, 2024)  # Assuming you want to analyze from 2010 to 2023
focus_data_top_authors = []
focus_data_all_authors = []

for year in years:
    # Calculate the frequency for top 20 authors
    year_non_llm_bigram_frequency = bigram_df[(bigram_df['year'] == year)]['frequency'].sum()
    year_non_llm_trigram_frequency = trigram_df[(trigram_df['year'] == year)]['frequency'].sum()

    # Calculate the frequency for all authors
    year_llm_bigram_frequency_all = llm_bigram_df[(llm_bigram_df['year'] == year)]['frequency'].sum()
    year_llm_trigram_frequency_all = llm_trigram_df[(llm_trigram_df['year'] == year)]['frequency'].sum()
    
    focus_data_top_authors.append({'Year': year, 'llm': year_llm_bigram_frequency_all + year_llm_trigram_frequency_all})
    focus_data_all_authors.append({'Year': year, 'non_llm': year_non_llm_bigram_frequency + year_non_llm_trigram_frequency})

# Step 2: Create DataFrames to store the data
focus_df_top_authors = pd.DataFrame(focus_data_top_authors)
focus_df_all_authors = pd.DataFrame(focus_data_all_authors)

# Step 3: Plot the line graph using Plotly
fig = go.Figure()

# Add a line for top 20 authors
fig.add_trace(go.Scatter(x=focus_df_top_authors['Year'], y=focus_df_top_authors['llm'],
                         mode='lines+markers', name='llm'))

# Add a line for all authors
fig.add_trace(go.Scatter(x=focus_df_all_authors['Year'], y=focus_df_all_authors['non_llm'],
                         mode='lines+markers', name='non_llm'))

# Customize the layout
fig.update_layout(title='Focus on Large Language Models Over the Years',
                  xaxis_title='Year', yaxis_title='Frequency',
                  legend=dict(x=0, y=1))

fig.show()

In [63]:
# Create a list of years for analysis
years = range(2017, 2024)  # Adjust the range as needed

# Initialize empty lists to store data
bigram_data = []

# Loop through each year and calculate the sum of LLM bigram frequencies
for year in years:
    # Filter the LLM bigram DataFrame for the current year
    llm_bigrams_year = llm_bigram_df[llm_bigram_df['year'] == year]
    
    # Calculate the sum of LLM bigram frequencies for the current year
    llm_bigram_sum = llm_bigrams_year['frequency'].sum()
    
    # Get the top 5 non-LLM bigrams for the current year
    top_non_llm_bigrams = bigram_df[bigram_df['year'] == year].nlargest(5, 'frequency')
    
    # Convert the Pandas Series to a list of strings
    top_non_llm_bigram_names = top_non_llm_bigrams['word'].tolist()
    
    # Append data for the current year to the list
    bigram_data.append({
        'Year': year,
        'LLM Bigram Sum': llm_bigram_sum,
        'Top 5 Non-LLM Bigrams': top_non_llm_bigram_names,
        'Top 5 Non-LLM Bigram Frequencies': top_non_llm_bigrams['frequency'].tolist()
    })

# Create a DataFrame from the collected data
bigram_evolution_df = pd.DataFrame(bigram_data)

# Create a Plotly line graph for the evolution of LLM bigrams and top 5 non-LLM bigrams
fig = go.Figure()

# Add lines for top 5 non-LLM bigrams
for i in range(5):
    fig.add_trace(go.Scatter(
        x=bigram_evolution_df['Year'],
        y=bigram_evolution_df['Top 5 Non-LLM Bigram Frequencies'].apply(lambda x: x[i]),
        mode='lines+markers',
        name=top_non_llm_bigram_names[i]
    ))

# Add a line for LLM bigram sum
fig.add_trace(go.Scatter(
    x=bigram_evolution_df['Year'],
    y=bigram_evolution_df['LLM Bigram Sum'],
    mode='lines+markers',
    name='LLM'
))

# Customize the layout
fig.update_layout(
    title='Evolution of Bigrams over year',
    xaxis_title='Year',
    yaxis_title='Frequency',
    legend=dict(x=0, y=1)
)

# Show the graph
fig.show()


In [64]:
# Create a list of years for analysis
years = range(2017, 2024)  # Adjust the range as needed

# Initialize empty lists to store data
trigram_data = []

# Loop through each year and calculate the sum of LLM bigram frequencies
for year in years:
    # Filter the LLM bigram DataFrame for the current year
    llm_trigrams_year = llm_trigram_df[llm_trigram_df['year'] == year]
    
    # Calculate the sum of LLM bigram frequencies for the current year
    llm_trigram_sum = llm_trigrams_year['frequency'].sum()
    
    # Get the top 5 non-LLM bigrams for the current year
    top_non_llm_trigrams = trigram_df[trigram_df['year'] == year].nlargest(5, 'frequency')
    
    # Convert the Pandas Series to a list of strings
    top_non_llm_trigram_names = top_non_llm_trigrams['word'].tolist()
    
    # Append data for the current year to the list
    trigram_data.append({
        'Year': year,
        'LLM Trigram Sum': llm_trigram_sum,
        'Top 5 Non-LLM Trigrams': top_non_llm_trigram_names,
        'Top 5 Non-LLM Trigram Frequencies': top_non_llm_trigrams['frequency'].tolist()
    })

# Create a DataFrame from the collected data
trigram_evolution_df = pd.DataFrame(trigram_data)

# Create a Plotly line graph for the evolution of LLM bigrams and top 5 non-LLM bigrams
fig = go.Figure()

# Add lines for top 5 non-LLM bigrams
for i in range(5):
    fig.add_trace(go.Scatter(
        x=trigram_evolution_df['Year'],
        y=trigram_evolution_df['Top 5 Non-LLM Trigram Frequencies'].apply(lambda x: x[i]),
        mode='lines+markers',
        name=top_non_llm_trigram_names[i]
    ))

# Add a line for LLM bigram sum
fig.add_trace(go.Scatter(
    x=trigram_evolution_df['Year'],
    y=trigram_evolution_df['LLM Trigram Sum'],
    mode='lines+markers',
    name='LLM'
))

# Customize the layout
fig.update_layout(
    title='Evolution of Trigrams over year',
    xaxis_title='Year',
    yaxis_title='Frequency',
    legend=dict(x=0, y=1)
)

# Show the graph
fig.show()


# Senior vs Junior

In [65]:
# Create DataFrames from the sample data
senior_junior_authors = pd.DataFrame(llm_top_authors)
papers_df = pd.DataFrame(papers)

# Calculate the current year
current_year = datetime.now().year

# Convert the 'update_date' column to datetime and handle invalid or missing dates
def convert_to_datetime(date_str):
    try:
        return pd.to_datetime(date_str, format='%Y-%m-%d')
    except (TypeError, ValueError):
        return pd.NaT  # Return a Not-a-Time (NaT) for invalid or missing dates

papers_df['update_date'] = papers_df['update_date'].apply(convert_to_datetime)

# Define a function to categorize authors as senior or junior
def categorize_author(author):
    author_papers = papers_df[papers_df['authors'].apply(lambda x: author in x)]
    
    # Handle the case where there are no papers for the author
    if author_papers.empty:
        return 'Unknown Author'
    
    latest_update_year = min(author_papers['update_date'].dt.year, default=current_year)
    if current_year - latest_update_year > 8:
        return 'Senior Author'
    else:
        return 'Junior Author'

# Apply the categorize_author function to the top authors DataFrame
senior_junior_authors['Author Category'] = senior_junior_authors['author'].apply(categorize_author)

# Separate the senior and junior authors into two dataframes
senior_authors_df = senior_junior_authors[senior_junior_authors['Author Category'] == 'Senior Author']
junior_authors_df = senior_junior_authors[senior_junior_authors['Author Category'] == 'Junior Author']

senior_authors_df = senior_authors_df.reset_index(drop=True)
junior_authors_df = junior_authors_df.reset_index(drop=True)
# Display the results
print("Senior Authors:")
print(senior_authors_df)

print("\nJunior Authors:")
print(junior_authors_df)


Senior Authors:
               author  Number of Papers Published Author Category
0            Zhang Y.                          49   Senior Author
1          Choi Yejin                          40   Senior Author
2        Gao Jianfeng                          38   Senior Author
3               Li Y.                          35   Senior Author
4         Liu Zhiyuan                          34   Senior Author
5           Lin Jimmy                          33   Senior Author
6            Liu Yang                          33   Senior Author
7           Zhang Yue                          33   Senior Author
8           Jiang Xin                          33   Senior Author
9       Xiong Caiming                          30   Senior Author
10      Smith Noah A.                          30   Senior Author
11          Ren Xiang                          30   Senior Author
12           Li Xiang                          29   Senior Author
13    Schütze Hinrich                          29   Senior A

## Senior Authors

In [66]:
senior_authors = senior_authors_df['author']
non_llm_publications = non_llm_papers[non_llm_papers['authors'].apply(lambda authors: any(author for author in senior_authors))]

# SUCCESS Initialize lists to store bigrams and trigrams with their years
bigram_data = []
trigram_data = []

# Function to extract ngrams from text
def extract_ngrams(text, n, year):
    tokens = word_tokenize(text.lower())
    n_grams = ngrams(tokens, n)
    return [(year, ' '.join(gram)) for gram in n_grams]

# Iterate over the DataFrame with tqdm for progress monitoring
for _, row in tqdm(non_llm_publications.iterrows(), total=len(non_llm_publications)):
    title_abstract = row['processed_titles_abstracts']
    year = row['year']
    
    # Extract bigrams and trigrams
    bigram_data.extend(extract_ngrams(title_abstract, 2, year))
    trigram_data.extend(extract_ngrams(title_abstract, 3, year))


print(len(bigram_data))
print(len(trigram_data))

# Calculate the frequency of bigrams and trigrams with years
bigram_freq = Counter(bigram_data)
trigram_freq = Counter(trigram_data)
print(len(bigram_freq))
print(len(trigram_freq))

# Create DataFrames for bigrams and trigrams with years
senior_authors_bigram_df = pd.DataFrame(list(bigram_freq.items()), columns=['(year, word)', 'frequency'])
senior_authors_bigram_df[['year', 'word']] = pd.DataFrame(senior_authors_bigram_df['(year, word)'].tolist(), index=senior_authors_bigram_df.index)
senior_authors_bigram_df.drop(columns=['(year, word)'], inplace=True)

senior_authors_trigram_df = pd.DataFrame(list(trigram_freq.items()), columns=['(year, word)', 'frequency'])
senior_authors_trigram_df[['year', 'word']] = pd.DataFrame(senior_authors_trigram_df['(year, word)'].tolist(), index=senior_authors_trigram_df.index)
senior_authors_trigram_df.drop(columns=['(year, word)'], inplace=True)


# Sort the bigram DataFrame by frequency in descending order
senior_authors_bigram_df = senior_authors_bigram_df[senior_authors_bigram_df['frequency'] > 20].sort_values(by='frequency', ascending=False)

# Sort the trigram DataFrame by frequency in descending order
senior_authors_trigram_df = senior_authors_trigram_df[senior_authors_trigram_df['frequency'] > 20].sort_values(by='frequency', ascending=False)

# Display the results
print("Bigram DataFrame:")
print(senior_authors_trigram_df)

print("\n Trigram DataFrame:")
print(senior_authors_trigram_df)

100%|██████████████████████████████████| 371718/371718 [15:45<00:00, 393.28it/s]


40923453
40551735
17289159
34788810
Bigram DataFrame:
          frequency  year                            word
16148173       2527  2021             deep neural network
10746521       2507  2020             deep neural network
6409682        2239  2019             deep neural network
2784630        2234  2018    convolutional neural network
22237000       2200  2022             deep neural network
...             ...   ...                             ...
23145099         21  2022          method achieve parable
23147243         21  2022            neural network multi
23149895         21  2022                           n 2 n
16878118         21  2021  theoretical analysis empirical
2950604          21  2018          pose estimation method

[23563 rows x 3 columns]

 Trigram DataFrame:
          frequency  year                            word
16148173       2527  2021             deep neural network
10746521       2507  2020             deep neural network
6409682        2239  2019    

In [67]:
senior_authors = senior_authors_df['author']
senior_authors_llm_publications = llm_papers[llm_papers['authors'].apply(lambda authors: any(author in authors for author in senior_authors))]

# SUCCESS Initialize lists to store bigrams and trigrams with their years
senior_authors_llm_bigram_data = []
senior_authors_llm_trigram_data = []

# Function to extract ngrams from text
def extract_ngrams(text, n, year):
    tokens = word_tokenize(text.lower())
    n_grams = ngrams(tokens, n)
    return [(year, ' '.join(gram)) for gram in n_grams]

# Iterate over the DataFrame with tqdm for progress monitoring
for _, row in tqdm(senior_authors_llm_publications.iterrows(), total=len(senior_authors_llm_publications)):
    title_abstract = row['processed_titles_abstracts']
    year = row['year']
    
    # Extract bigrams and trigrams
    senior_authors_llm_bigram_data.extend(extract_ngrams(title_abstract, 2, year))
    senior_authors_llm_trigram_data.extend(extract_ngrams(title_abstract, 3, year))

# Calculate the frequency of bigrams and trigrams with years
senior_authors_llm_bigram_freq = Counter(senior_authors_llm_bigram_data)
senior_authors_llm_trigram_freq = Counter(senior_authors_llm_trigram_data)

# Create DataFrames for bigrams and trigrams with years
senior_authors_llm_bigram_df = pd.DataFrame(list(senior_authors_llm_bigram_freq.items()), columns=['(year, word)', 'frequency'])
senior_authors_llm_bigram_df[['year', 'word']] = pd.DataFrame(senior_authors_llm_bigram_df['(year, word)'].tolist(), index=senior_authors_llm_bigram_df.index)
senior_authors_llm_bigram_df.drop(columns=['(year, word)'], inplace=True)

senior_authors_llm_trigram_df = pd.DataFrame(list(senior_authors_llm_trigram_freq.items()), columns=['(year, word)', 'frequency'])
senior_authors_llm_trigram_df[['year', 'word']] = pd.DataFrame(senior_authors_llm_trigram_df['(year, word)'].tolist(), index=senior_authors_llm_trigram_df.index)
senior_authors_llm_trigram_df.drop(columns=['(year, word)'], inplace=True)


# Sort the bigram DataFrame by frequency in descending order
senior_authors_llm_bigram_df = senior_authors_llm_bigram_df.sort_values(by='frequency', ascending=False)

# Sort the trigram DataFrame by frequency in descending order
senior_authors_llm_trigram_df = senior_authors_llm_trigram_df.sort_values(by='frequency', ascending=False)

# Display the results
print("Senior Authors llm Bigram DataFrame:")
print(senior_authors_llm_bigram_df)

print("\n Senior Authors llm Trigram DataFrame:")
print(senior_authors_llm_trigram_df)

100%|█████████████████████████████████████████| 624/624 [00:53<00:00, 11.61it/s]


Senior Authors llm Bigram DataFrame:
       frequency  year                 word
26450        234  2023       large language
26324        209  2023       language model
26451        139  2023      language models
18136        116  2022       language model
26336         85  2023     natural language
...          ...   ...                  ...
20913          1  2022          base source
20914          1  2022     source exemplars
20915          1  2022  exemplars challenge
20916          1  2022   challenge exemplar
58637          1  2019       right reserved

[58638 rows x 3 columns]

 Senior Authors llm Trigram DataFrame:
       frequency  year                              word
30370        130  2023              large language model
30322        100  2023             large language models
30495         73  2023               language model llms
21032         40  2022              pre trained language
6377          34  2020              pre trained language
...          ...   ...     

In [68]:
# Create a list of years for analysis
years = range(2017, 2024)  # Adjust the range as needed

# Initialize empty lists to store data
bigram_data = []

# Loop through each year and calculate the sum of LLM bigram frequencies
for year in years:
    # Filter the LLM bigram DataFrame for the current year
    llm_bigrams_year = senior_authors_llm_bigram_df[senior_authors_llm_bigram_df['year'] == year]
    
    # Calculate the sum of LLM bigram frequencies for the current year
    llm_bigram_sum = llm_bigrams_year['frequency'].sum()
    
    # Get the top 5 non-LLM bigrams for the current year
    top_non_llm_bigrams = senior_authors_bigram_df[senior_authors_bigram_df['year'] == year].nlargest(5, 'frequency')
    
    # Convert the Pandas Series to a list of strings
    top_non_llm_bigram_names = top_non_llm_bigrams['word'].tolist()
    
    # Append data for the current year to the list
    bigram_data.append({
        'Year': year,
        'LLM Bigram Sum': llm_bigram_sum,
        'Top 5 Non-LLM Bigrams': top_non_llm_bigram_names,
        'Top 5 Non-LLM Bigram Frequencies': top_non_llm_bigrams['frequency'].tolist()
    })

# Create a DataFrame from the collected data
bigram_evolution_df = pd.DataFrame(bigram_data)

# Create a Plotly line graph for the evolution of LLM bigrams and top 5 non-LLM bigrams
fig = go.Figure()

# Add lines for top 5 non-LLM bigrams
for i in range(5):
    fig.add_trace(go.Scatter(
        x=bigram_evolution_df['Year'],
        y=bigram_evolution_df['Top 5 Non-LLM Bigram Frequencies'].apply(lambda x: x[i]),
        mode='lines+markers',
        name=top_non_llm_bigram_names[i]
    ))

# Add a line for LLM bigram sum
fig.add_trace(go.Scatter(
    x=bigram_evolution_df['Year'],
    y=bigram_evolution_df['LLM Bigram Sum'],
    mode='lines+markers',
    name='LLM'
))

# Customize the layout
fig.update_layout(
    title='Evolution of Bigrams over year - Senior Authors',
    xaxis_title='Year',
    yaxis_title='Frequency',
    legend=dict(x=0, y=1)
)

# Show the graph
fig.show()


In [69]:
# Create a list of years for analysis
years = range(2017, 2024)  # Adjust the range as needed

# Initialize empty lists to store data
trigram_data = []

# Loop through each year and calculate the sum of LLM bigram frequencies
for year in years:
    # Filter the LLM bigram DataFrame for the current year
    llm_trigrams_year = senior_authors_llm_trigram_df[senior_authors_llm_trigram_df['year'] == year]
    
    # Calculate the sum of LLM bigram frequencies for the current year
    llm_trigram_sum = llm_trigrams_year['frequency'].sum()
    
    # Get the top 5 non-LLM bigrams for the current year
    top_non_llm_trigrams = senior_authors_trigram_df[senior_authors_trigram_df['year'] == year].nlargest(5, 'frequency')
    
    # Convert the Pandas Series to a list of strings
    top_non_llm_trigram_names = top_non_llm_trigrams['word'].tolist()
    
    # Append data for the current year to the list
    trigram_data.append({
        'Year': year,
        'LLM Trigram Sum': llm_trigram_sum,
        'Top 5 Non-LLM Trigrams': top_non_llm_trigram_names,
        'Top 5 Non-LLM Trigram Frequencies': top_non_llm_trigrams['frequency'].tolist()
    })

# Create a DataFrame from the collected data
trigram_evolution_df = pd.DataFrame(trigram_data)

# Create a Plotly line graph for the evolution of LLM bigrams and top 5 non-LLM bigrams
fig = go.Figure()

# Add lines for top 5 non-LLM bigrams
for i in range(5):
    fig.add_trace(go.Scatter(
        x=trigram_evolution_df['Year'],
        y=trigram_evolution_df['Top 5 Non-LLM Trigram Frequencies'].apply(lambda x: x[i]),
        mode='lines+markers',
        name=top_non_llm_trigram_names[i]
    ))

# Add a line for LLM bigram sum
fig.add_trace(go.Scatter(
    x=trigram_evolution_df['Year'],
    y=trigram_evolution_df['LLM Trigram Sum'],
    mode='lines+markers',
    name='LLM'
))

# Customize the layout
fig.update_layout(
    title='Evolution of Trigrams over year - Senior Authors',
    xaxis_title='Year',
    yaxis_title='Frequency',
    legend=dict(x=0, y=1)
)

# Show the graph
fig.show()


## Junior Authors

In [70]:
junior_authors = junior_authors_df['author']
non_llm_publications = non_llm_papers[non_llm_papers['authors'].apply(lambda authors: any(author for author in junior_authors))]

# SUCCESS Initialize lists to store bigrams and trigrams with their years
bigram_data = []
trigram_data = []

# Function to extract ngrams from text
def extract_ngrams(text, n, year):
    tokens = word_tokenize(text.lower())
    n_grams = ngrams(tokens, n)
    return [(year, ' '.join(gram)) for gram in n_grams]

# Iterate over the DataFrame with tqdm for progress monitoring
for _, row in tqdm(non_llm_publications.iterrows(), total=len(non_llm_publications)):
    title_abstract = row['processed_titles_abstracts']
    year = row['year']
    
    # Extract bigrams and trigrams
    bigram_data.extend(extract_ngrams(title_abstract, 2, year))
    trigram_data.extend(extract_ngrams(title_abstract, 3, year))


print(len(bigram_data))
print(len(trigram_data))

# Calculate the frequency of bigrams and trigrams with years
bigram_freq = Counter(bigram_data)
trigram_freq = Counter(trigram_data)
print(len(bigram_freq))
print(len(trigram_freq))

# Create DataFrames for bigrams and trigrams with years
junior_authors_bigram_df = pd.DataFrame(list(bigram_freq.items()), columns=['(year, word)', 'frequency'])
junior_authors_bigram_df[['year', 'word']] = pd.DataFrame(junior_authors_bigram_df['(year, word)'].tolist(), index=junior_authors_bigram_df.index)
junior_authors_bigram_df.drop(columns=['(year, word)'], inplace=True)

junior_authors_trigram_df = pd.DataFrame(list(trigram_freq.items()), columns=['(year, word)', 'frequency'])
junior_authors_trigram_df[['year', 'word']] = pd.DataFrame(junior_authors_trigram_df['(year, word)'].tolist(), index=junior_authors_trigram_df.index)
junior_authors_trigram_df.drop(columns=['(year, word)'], inplace=True)


# Sort the bigram DataFrame by frequency in descending order
junior_authors_bigram_df = junior_authors_bigram_df[junior_authors_bigram_df['frequency'] > 20].sort_values(by='frequency', ascending=False)

# Sort the trigram DataFrame by frequency in descending order
junior_authors_trigram_df = junior_authors_trigram_df[junior_authors_trigram_df['frequency'] > 20].sort_values(by='frequency', ascending=False)

# Display the results
print("Junior Bigram DataFrame:")
print(junior_authors_trigram_df)

print("\n Junior Trigram DataFrame:")
print(junior_authors_trigram_df)

100%|██████████████████████████████████| 371718/371718 [06:44<00:00, 918.45it/s]


40923453
40551735
17289159
34788810
Junior Bigram DataFrame:
          frequency  year                            word
16148173       2527  2021             deep neural network
10746521       2507  2020             deep neural network
6409682        2239  2019             deep neural network
2784630        2234  2018    convolutional neural network
22237000       2200  2022             deep neural network
...             ...   ...                             ...
23145099         21  2022          method achieve parable
23147243         21  2022            neural network multi
23149895         21  2022                           n 2 n
16878118         21  2021  theoretical analysis empirical
2950604          21  2018          pose estimation method

[23563 rows x 3 columns]

 Junior Trigram DataFrame:
          frequency  year                            word
16148173       2527  2021             deep neural network
10746521       2507  2020             deep neural network
6409682        

In [71]:
junior_authors = junior_authors_df['author']
junior_authors_llm_publications = llm_papers[llm_papers['authors'].apply(lambda authors: any(author in authors for author in junior_authors))]

# SUCCESS Initialize lists to store bigrams and trigrams with their years
junior_authors_llm_bigram_data = []
junior_authors_llm_trigram_data = []

# Function to extract ngrams from text
def extract_ngrams(text, n, year):
    tokens = word_tokenize(text.lower())
    n_grams = ngrams(tokens, n)
    return [(year, ' '.join(gram)) for gram in n_grams]

# Iterate over the DataFrame with tqdm for progress monitoring
for _, row in tqdm(junior_authors_llm_publications.iterrows(), total=len(junior_authors_llm_publications)):
    title_abstract = row['processed_titles_abstracts']
    year = row['year']
    
    # Extract bigrams and trigrams
    junior_authors_llm_bigram_data.extend(extract_ngrams(title_abstract, 2, year))
    junior_authors_llm_trigram_data.extend(extract_ngrams(title_abstract, 3, year))

# Calculate the frequency of bigrams and trigrams with years
junior_authors_llm_bigram_freq = Counter(junior_authors_llm_bigram_data)
junior_authors_llm_trigram_freq = Counter(junior_authors_llm_trigram_data)

# Create DataFrames for bigrams and trigrams with years
junior_authors_llm_bigram_df = pd.DataFrame(list(junior_authors_llm_bigram_freq.items()), columns=['(year, word)', 'frequency'])
junior_authors_llm_bigram_df[['year', 'word']] = pd.DataFrame(junior_authors_llm_bigram_df['(year, word)'].tolist(), index=junior_authors_llm_bigram_df.index)
junior_authors_llm_bigram_df.drop(columns=['(year, word)'], inplace=True)

junior_authors_llm_trigram_df = pd.DataFrame(list(junior_authors_llm_trigram_freq.items()), columns=['(year, word)', 'frequency'])
junior_authors_llm_trigram_df[['year', 'word']] = pd.DataFrame(junior_authors_llm_trigram_df['(year, word)'].tolist(), index=junior_authors_llm_trigram_df.index)
junior_authors_llm_trigram_df.drop(columns=['(year, word)'], inplace=True)


# Sort the bigram DataFrame by frequency in descending order
junior_authors_llm_bigram_df = junior_authors_llm_bigram_df.sort_values(by='frequency', ascending=False)

# Sort the trigram DataFrame by frequency in descending order
junior_authors_llm_trigram_df = junior_authors_llm_trigram_df.sort_values(by='frequency', ascending=False)

# Display the results
print("Senior Authors llm Bigram DataFrame:")
print(junior_authors_llm_bigram_df)

print("\n Senior Authors llm Trigram DataFrame:")
print(junior_authors_llm_trigram_df)

100%|█████████████████████████████████████████| 254/254 [00:56<00:00,  4.49it/s]


Senior Authors llm Bigram DataFrame:
       frequency  year                 word
14863        121  2023       large language
14798         98  2023       language model
14790         75  2023      language models
5595          41  2021          pre trained
9311          40  2022       language model
...          ...   ...                  ...
8454           1  2021         recent years
8453           1  2021        models recent
8452           1  2021  reusable pretrained
8451           1  2021   bert2bert reusable
23975          1  2023          nlp systems

[23976 rows x 3 columns]

 Senior Authors llm Trigram DataFrame:
       frequency  year                    word
17096         68  2023    large language model
17037         53  2023   large language models
17097         31  2023     language model llms
6491          18  2021    pre trained language
11309         16  2022    large language model
...          ...   ...                     ...
9407           1  2021     better benefi

In [72]:
# Create a list of years for analysis
years = range(2017, 2024)  # Adjust the range as needed

# Initialize empty lists to store data
bigram_data = []

# Loop through each year and calculate the sum of LLM bigram frequencies
for year in years:
    # Filter the LLM bigram DataFrame for the current year
    llm_bigrams_year = junior_authors_llm_bigram_df[junior_authors_llm_bigram_df['year'] == year]
    
    # Calculate the sum of LLM bigram frequencies for the current year
    llm_bigram_sum = llm_bigrams_year['frequency'].sum()
    
    # Get the top 5 non-LLM bigrams for the current year
    top_non_llm_bigrams = junior_authors_bigram_df[junior_authors_bigram_df['year'] == year].nlargest(5, 'frequency')
    
    # Convert the Pandas Series to a list of strings
    top_non_llm_bigram_names = top_non_llm_bigrams['word'].tolist()
    
    # Append data for the current year to the list
    bigram_data.append({
        'Year': year,
        'LLM Bigram Sum': llm_bigram_sum,
        'Top 5 Non-LLM Bigrams': top_non_llm_bigram_names,
        'Top 5 Non-LLM Bigram Frequencies': top_non_llm_bigrams['frequency'].tolist()
    })

# Create a DataFrame from the collected data
bigram_evolution_df = pd.DataFrame(bigram_data)

# Create a Plotly line graph for the evolution of LLM bigrams and top 5 non-LLM bigrams
fig = go.Figure()

# Add lines for top 5 non-LLM bigrams
for i in range(5):
    fig.add_trace(go.Scatter(
        x=bigram_evolution_df['Year'],
        y=bigram_evolution_df['Top 5 Non-LLM Bigram Frequencies'].apply(lambda x: x[i]),
        mode='lines+markers',
        name=top_non_llm_bigram_names[i]
    ))

# Add a line for LLM bigram sum
fig.add_trace(go.Scatter(
    x=bigram_evolution_df['Year'],
    y=bigram_evolution_df['LLM Bigram Sum'],
    mode='lines+markers',
    name='LLM'
))

# Customize the layout
fig.update_layout(
    title='Evolution of Bigrams over year - Junior Authors',
    xaxis_title='Year',
    yaxis_title='Frequency',
    legend=dict(x=0, y=1)
)

# Show the graph
fig.show()


In [73]:
# Create a list of years for analysis
years = range(2017, 2024)  # Adjust the range as needed

# Initialize empty lists to store data
trigram_data = []

# Loop through each year and calculate the sum of LLM bigram frequencies
for year in years:
    # Filter the LLM bigram DataFrame for the current year
    llm_trigrams_year = junior_authors_llm_trigram_df[junior_authors_llm_trigram_df['year'] == year]
    
    # Calculate the sum of LLM bigram frequencies for the current year
    llm_trigram_sum = llm_trigrams_year['frequency'].sum()
    
    # Get the top 5 non-LLM bigrams for the current year
    top_non_llm_trigrams = junior_authors_trigram_df[junior_authors_trigram_df['year'] == year].nlargest(5, 'frequency')
    
    # Convert the Pandas Series to a list of strings
    top_non_llm_trigram_names = top_non_llm_trigrams['word'].tolist()
    
    # Append data for the current year to the list
    trigram_data.append({
        'Year': year,
        'LLM Trigram Sum': llm_trigram_sum,
        'Top 5 Non-LLM Trigrams': top_non_llm_trigram_names,
        'Top 5 Non-LLM Trigram Frequencies': top_non_llm_trigrams['frequency'].tolist()
    })

# Create a DataFrame from the collected data
trigram_evolution_df = pd.DataFrame(trigram_data)

# Create a Plotly line graph for the evolution of LLM bigrams and top 5 non-LLM bigrams
fig = go.Figure()

# Add lines for top 5 non-LLM bigrams
for i in range(5):
    fig.add_trace(go.Scatter(
        x=trigram_evolution_df['Year'],
        y=trigram_evolution_df['Top 5 Non-LLM Trigram Frequencies'].apply(lambda x: x[i]),
        mode='lines+markers',
        name=top_non_llm_trigram_names[i]
    ))

# Add a line for LLM bigram sum
fig.add_trace(go.Scatter(
    x=trigram_evolution_df['Year'],
    y=trigram_evolution_df['LLM Trigram Sum'],
    mode='lines+markers',
    name='LLM'
))

# Customize the layout
fig.update_layout(
    title='Evolution of Trigrams over year - Junior Authors',
    xaxis_title='Year',
    yaxis_title='Frequency',
    legend=dict(x=0, y=1)
)

# Show the graph
fig.show()


### Evolution of LLM

In [69]:

# Filter DataFrames for years starting from 2017
start_year = 2019
end_year = max(llm_bigram_df['year'].max(), llm_trigram_df['year'].max()) + 1

years = list(range(start_year, end_year))

# Initialize lists to store top 20 bigrams and trigrams for each year
top_bigrams_by_year = {}
top_trigrams_by_year = {}

# Function to get top n bigrams or trigrams for a specific year
def get_top_n_ngrams(df, year, n=20):
    return df[df['year'] == year].nlargest(n, 'frequency')

for year in years:
    top_trigrams_by_year[year] = get_top_n_ngrams(llm_trigram_df, year)
    top_bigrams_by_year[year] = get_top_n_ngrams(llm_bigram_df, year)

# Create a Dash app
app = dash.Dash(__name__)

# Define the layout of the app
app.layout = html.Div([
    dcc.Graph(id='ngrams-graph'),
    dcc.Slider(
        id='year-slider',
        min=start_year,
        max=end_year,
        step=1,
        value=start_year,
        marks={str(year): str(year) for year in years}
    )
])

# Define a callback to update the graphs when the slider value changes
@app.callback(
    Output('ngrams-graph', 'figure'),
    [Input('year-slider', 'value')]
)
def update_graphs(selected_year):
    # Create a subplot with two columns for bigrams and trigrams
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Top 20 Trigrams", "Top 20 Bigrams"))
        # Add traces for top trigrams for the selected year
    top_trigrams = top_trigrams_by_year[selected_year].sort_values(by='frequency')  # Sort by frequency
    for i, row in top_trigrams.iterrows():
        text = row['word'] if 'large' in row['word'].lower() else None  # Display text only if it contains "large"
        marker_symbol = 'star-dot' if 'large' in row['word'].lower() else 'circle-open'  # Define marker symbol
        legend_group = 'trigrams' if 'large' in row['word'].lower() else 'other'  # Group legends
        fig.add_trace(go.Scatter(x=[row['word']], y=[row['frequency']], mode='lines+markers', name=text,
                                 legendgroup=legend_group, showlegend=(legend_group == 'trigrams'),
                                 marker=dict(
                                     symbol=marker_symbol,
                                     size=10  # Adjust the size of the marker as needed
                                 )), row=1, col=1)
    
    # Add traces for top bigrams for the selected year
    top_bigrams = top_bigrams_by_year[selected_year].sort_values(by='frequency')  # Sort by frequency
    for i, row in top_bigrams.iterrows():
        text = row['word'] if 'large' in row['word'].lower() else None  # Display text only if it contains "large"
        marker_symbol = 'star-dot' if 'large' in row['word'].lower() else 'circle-open'  # Define marker symbol
        legend_group = 'bigrams' if 'large' in row['word'].lower() else 'other'  # Group legends
        fig.add_trace(go.Scatter(x=[row['word']], y=[row['frequency']], mode='lines+markers', name=text,
                                 legendgroup=legend_group, showlegend=(legend_group == 'bigrams'),
                                 marker=dict(
                                     symbol=marker_symbol,
                                     size=10,
                                     # Adjust the size of the marker as needed
                                 )), row=1, col=2)



    fig.update_layout(title_text=f'Top 20 Trigrams and Bigrams in {selected_year}', showlegend=True)
    
    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


### Individual Authors

In [74]:
categories_full_form = {
    'LLM': 'LLM',
    'cs.AI': 'Artificial Intelligence',
    'cs.CL': 'Computation and Language',
    'cs.LG': 'Machine Learning',
'cs.AR': 'Hardware Architecture',
'cs.CC': 'Computational Complexity',
'cs.CE': 'Computational Engineering, Finance, and Science',
'cs.CG': 'Computational Geometry',
'cs.CR': 'Cryptography and Security',
'cs.CV': 'Computer Vision and Pattern Recognition',
'cs.CY': 'Computers and Society',
'cs.DB': 'Databases',
'cs.DC': 'Distributed, Parallel, and Cluster Computing',
'cs.DL': 'Digital Libraries',
'cs.DM': 'Discrete Mathematics',
'cs.DS': 'Data Structures and Algorithms',
'cs.ET': 'Emerging Technologies',
'cs.FL': 'Formal Languages and Automata Theory',
'cs.GL': 'General Literature',
'cs.GR': 'Graphics',
'cs.GT': 'Computer Science and Game Theory',
'cs.HC': 'Human-Computer Interaction',
'cs.IR': 'Information Retrieval',
'cs.IT': 'Information Theory',
'cs.LO': 'Logic in Computer Science',
'cs.MA': 'Multiagent Systems',
'cs.MM': 'Multimedia',
'cs.MS': 'Mathematical Software',
'cs.NA': 'Numerical Analysis',
'cs.NE': 'Neural and Evolutionary Computing',
'cs.NI': 'Networking and Internet Architecture',
'cs.OH': 'Other Computer Science',
'cs.OS': 'Operating Systems',
'cs.PF': 'Performance',
'cs.PL': 'Programming Languages',
'cs.RO': 'Robotics',
'cs.SC': 'Symbolic Computation',
'cs.SD': 'Sound',
'cs.SE': 'Software Engineering',
'cs.SI': 'Social and Information Networks',
'cs.SY': 'Systems and Control',
'stat.AP': 'Applications',
'stat.CO': 'Computation',
'stat.ME': 'Methodology',
'stat.ML': 'Machine Learning',
'stat.OT': 'Other Statistics',
'stat.TH': 'Statistics Theory'
}

In [80]:
# Sample author name and format it
author_string = 'Li Xiang'
formatted_name = ' '.join(author_string.split())

# Filter the dataset for the author
author_each_publications = papers[papers['authors'].apply(lambda x: formatted_name in str(x))]
llm_rows = author_each_publications['processed_titles_abstracts'].apply(check_terms_in_abstract_title)

# Iterate over the rows and add 'LLM' category to the 'categories' column
for idx, row in author_each_publications[llm_rows].iterrows():
    author_each_publications.at[idx, 'categories'] = str(row['categories']) + ', LLM'

# Split combined categories and count each subcategory individually
author_each_publications['categories'] = author_each_publications['categories'].str.split()
category_counts = author_each_publications.explode('categories').groupby(['year', 'categories']).size().reset_index(name='count')

# Map categories to their full forms
category_counts['categories_full'] = category_counts['categories'].map(categories_full_form)
sorted_categories = category_counts.sort_values(by='count', ascending=False)['categories_full'].tolist()
agg_category_counts = category_counts.groupby(['year', 'categories_full']).agg({'count': 'sum'}).reset_index()
agg_category_counts = agg_category_counts[agg_category_counts['year'] >= 2014]
# Plot different lines for different categories
fig = px.line(agg_category_counts, x='year', y='count', color='categories_full', markers=True,
              category_orders={'categories_full': sorted_categories},
              title=f'Publication Count by {formatted_name} in Each Category Over Years')
fig.update_layout(legend_title_text='Categories', yaxis_title='Publication Count')
fig.for_each_trace(lambda t: t.update(line=dict(color='black', width=3)) if t.name == 'LLM' else t)
# Hide legend items beyond the first 5 categories
for i, data in enumerate(fig.data):
    if i >= 15 and 'LLM' not in data.name:
        data.showlegend = False

# Always show 'LLM' in the legend
fig.update_traces(showlegend=True, selector={'name': 'LLM'})
fig.show()


In [76]:

authors_all_publications = pd.DataFrame()
# Assuming you have a DataFrame named 'papers'
formatted_names = []

for author_string in tqdm(llm_top_authors['author'], desc='Formatting author names'):
    formatted_name = ' '.join(author_string.split())
    formatted_names.append(formatted_name)

# Loop through all formatted names and execute the code for each author
for formatted_name in tqdm(formatted_names, desc='Filtering publications'):
    # Filter the dataset for the author
    publications = papers[papers['authors'].apply(lambda x: formatted_name in str(x))]
    authors_all_publications = pd.concat([authors_all_publications, publications])

llm_rows = authors_all_publications['processed_titles_abstracts'].apply(check_terms_in_abstract_title)

# Iterate over the rows and add 'LLM' category to the 'categories' column
for idx, row in authors_all_publications[llm_rows].iterrows():
    if row['categories']:
        if 'cs.CL' in str(row['categories']):
            authors_all_publications.at[idx, 'categories'] = str(row['categories']) + ', LLM'
    else:
        authors_all_publications.at[idx, 'categories'] = str(row['categories'])

# Split combined categories and count each subcategory individually
authors_all_publications['categories'] = authors_all_publications['categories'].str.split()
category_counts = authors_all_publications.explode('categories').groupby(['year', 'categories']).size().reset_index(name='count')

# Map categories to their full forms
category_counts['categories_full'] = category_counts['categories'].map(categories_full_form)
sorted_categories = category_counts.sort_values(by='count', ascending=False)['categories_full'].tolist()
agg_category_counts = category_counts.groupby(['year', 'categories_full']).agg({'count': 'sum'}).reset_index()
agg_category_counts = agg_category_counts[agg_category_counts['year'] >= 2014]

# Plot different lines for different categories
fig = px.line(agg_category_counts, x='year', y='count', color='categories_full', markers=True,
              category_orders={'categories_full': sorted_categories},
              title=f'Publication Count by Top LLM Authors in Each Category Over Years')

fig.update_layout(legend_title_text='Categories',yaxis_title='Publication Count')
fig.for_each_trace(lambda t: t.update(line=dict(color='black', width=3)) if t.name == 'LLM' else t)
# Hide legend items beyond the first 5 categories
for i, data in enumerate(fig.data):
    if i >= 15 and 'LLM' not in data.name:
        data.showlegend = False

# Always show 'LLM' in the legend
fig.update_traces(showlegend=True, selector={'name': 'LLM'})
fig.show()

Formatting author names: 100%|███████████████| 30/30 [00:00<00:00, 15167.44it/s]
Filtering publications: 100%|███████████████████| 30/30 [00:14<00:00,  2.12it/s]


In [77]:
authors_all_publications = pd.DataFrame()
# Assuming you have a DataFrame named 'papers'
formatted_names = []
for author_string in tqdm(senior_authors_df['author'], desc='Formatting author names'):
    formatted_name = ' '.join(author_string.split())
    formatted_names.append(formatted_name)

# Loop through all formatted names and execute the code for each author
for formatted_name in tqdm(formatted_names, desc='Filtering publications'):
    # Filter the dataset for the author
    publications = papers[papers['authors'].apply(lambda x: formatted_name in str(x))]
    authors_all_publications = pd.concat([authors_all_publications, publications])


llm_rows = authors_all_publications['processed_titles_abstracts'].apply(check_terms_in_abstract_title)

# Iterate over the rows and add 'LLM' category to the 'categories' column
for idx, row in authors_all_publications[llm_rows].iterrows():
    if row['categories']:
        authors_all_publications.at[idx, 'categories'] = str(row['categories']) + ', LLM'
    else:
        authors_all_publications.at[idx, 'categories'] = str(row['categories'])


# Split combined categories and count each subcategory individually
authors_all_publications['categories'] = authors_all_publications['categories'].str.split()
category_counts = authors_all_publications.explode('categories').groupby(['year', 'categories']).size().reset_index(name='count')

# Map categories to their full forms
category_counts['categories_full'] = category_counts['categories'].map(categories_full_form)
# Get the top 5 categories based on count
top_categories = category_counts.groupby('categories_full')['count'].sum().sort_values(ascending=False).head(5).index

# Filter the data to include only the top 5 categories
category_counts_top5 = category_counts[category_counts['categories_full'].isin(top_categories)]

# Sort categories for better visualization in the legend
sorted_categories = category_counts_top5.sort_values(by='count', ascending=False)['categories_full'].tolist()
agg_category_counts = category_counts.groupby(['year', 'categories_full']).agg({'count': 'sum'}).reset_index()
agg_category_counts = agg_category_counts[agg_category_counts['year'] >= 2014]
# Plot all lines for different categories
fig = px.line(agg_category_counts, x='year', y='count', color='categories_full', markers=True,
              category_orders={'categories_full': sorted_categories},
              title=f'Publication Count by Top LLM Senior Authors in Each Category Over Years')

fig.update_layout(legend_title_text='Categories', yaxis_title='Publication Count')
fig.for_each_trace(lambda t: t.update(line=dict(color='black', width=3)) if t.name == 'LLM' else t)
for i, data in enumerate(fig.data):
    if i >= 15 and 'LLM' not in data.name:
        data.showlegend = False
fig.update_traces(showlegend=True, selector={'name': 'LLM'})
fig.show()

Formatting author names: 100%|███████████████| 22/22 [00:00<00:00, 90200.09it/s]
Filtering publications: 100%|███████████████████| 22/22 [00:06<00:00,  3.18it/s]


In [78]:
authors_all_publications = pd.DataFrame()
# Assuming you have a DataFrame named 'papers'
formatted_names = []
for author_string in tqdm(junior_authors_df['author'], desc='Formatting author names'):
    formatted_name = ' '.join(author_string.split())
    formatted_names.append(formatted_name)

# Loop through all formatted names and execute the code for each author
for formatted_name in tqdm(formatted_names, desc='Filtering publications'):
    # Filter the dataset for the author
    publications = papers[papers['authors'].apply(lambda x: formatted_name in str(x))]
    authors_all_publications = pd.concat([authors_all_publications, publications])


llm_rows = authors_all_publications['processed_titles_abstracts'].apply(check_terms_in_abstract_title)

# Iterate over the rows and add 'LLM' category to the 'categories' column
for idx, row in authors_all_publications[llm_rows].iterrows():
    if row['categories']:
        authors_all_publications.at[idx, 'categories'] = str(row['categories']) + ', LLM'
    else:
        authors_all_publications.at[idx, 'categories'] = str(row['categories'])


# Split combined categories and count each subcategory individually
authors_all_publications['categories'] = authors_all_publications['categories'].str.split()
category_counts = authors_all_publications.explode('categories').groupby(['year', 'categories']).size().reset_index(name='count')

# Map categories to their full forms
category_counts['categories_full'] = category_counts['categories'].map(categories_full_form)
# Get the top 5 categories based on count
top_categories = category_counts.groupby('categories_full')['count'].sum().sort_values(ascending=False).head(5).index

# Filter the data to include only the top 5 categories
category_counts_top5 = category_counts[category_counts['categories_full'].isin(top_categories)]

# Sort categories for better visualization in the legend
sorted_categories = category_counts_top5.sort_values(by='count', ascending=False)['categories_full'].tolist()
agg_category_counts = category_counts.groupby(['year', 'categories_full']).agg({'count': 'sum'}).reset_index()
agg_category_counts = agg_category_counts[agg_category_counts['year'] >= 2014]
# Plot all lines for different categories
fig = px.line(agg_category_counts, x='year', y='count', color='categories_full', markers=True,
              category_orders={'categories_full': sorted_categories},
              title=f'Publication Count by Top LLM Junior Authors in Each Category Over Years')

fig.update_layout(legend_title_text='Categories',yaxis_title='Publication Count')
fig.for_each_trace(lambda t: t.update(line=dict(color='black', width=3)) if t.name == 'LLM' else t)
for i, data in enumerate(fig.data):
    if i >= 15 and 'LLM' not in data.name:
        data.showlegend = False
fig.update_traces(showlegend=True, selector={'name': 'LLM'})
fig.show()

Formatting author names: 100%|█████████████████| 8/8 [00:00<00:00, 78398.21it/s]
Filtering publications: 100%|█████████████████████| 8/8 [00:02<00:00,  3.07it/s]


In [79]:
authors_all_publications = pd.DataFrame()
# Assuming you have a DataFrame named 'papers'
formatted_names = []
for author_string in tqdm(papers_by_authors['authors'], desc='Formatting author names'):
    formatted_name = ' '.join(author_string.split())
    formatted_names.append(formatted_name)

# Loop through all formatted names and execute the code for each author
for formatted_name in tqdm(formatted_names, desc='Filtering publications'):
    # Filter the dataset for the author
    publications = papers[papers['authors'].apply(lambda x: formatted_name in str(x))]
    authors_all_publications = pd.concat([authors_all_publications, publications])


llm_rows = authors_all_publications['processed_titles_abstracts'].apply(check_terms_in_abstract_title)

# Iterate over the rows and add 'LLM' category to the 'categories' column
for idx, row in authors_all_publications[llm_rows].iterrows():
    if row['categories']:
        authors_all_publications.at[idx, 'categories'] = str(row['categories']) + ', LLM'
    else:
        authors_all_publications.at[idx, 'categories'] = str(row['categories'])


# Split combined categories and count each subcategory individually
authors_all_publications['categories'] = authors_all_publications['categories'].str.split()
category_counts = authors_all_publications.explode('categories').groupby(['year', 'categories']).size().reset_index(name='count')

# Map categories to their full forms
category_counts['categories_full'] = category_counts['categories'].map(categories_full_form)
# Get the top 5 categories based on count
top_categories = category_counts.groupby('categories_full')['count'].sum().sort_values(ascending=False).head(5).index

# Filter the data to include only the top 5 categories
category_counts_top5 = category_counts[category_counts['categories_full'].isin(top_categories)]

# Sort categories for better visualization in the legend
sorted_categories = category_counts_top5.sort_values(by='count', ascending=False)['categories_full'].tolist()
agg_category_counts = category_counts.groupby(['year', 'categories_full']).agg({'count': 'sum'}).reset_index()
agg_category_counts = agg_category_counts[agg_category_counts['year'] >= 2014]
# Plot all lines for different categories
fig = px.line(agg_category_counts, x='year', y='count', color='categories_full', markers=True,
              category_orders={'categories_full': sorted_categories},
              title=f'Publication Count by Top Preprint Platform Authors in Each Category Over Years')

fig.update_layout(legend_title_text='Categories', yaxis_title='Publication Count')

fig.for_each_trace(lambda t: t.update(line=dict(color='black', width=3)) if t.name == 'LLM' else t)
# Hide legend items beyond the first 5 categories
for i, data in enumerate(fig.data):
    if i >= 15 and 'LLM' not in data.name:
        data.showlegend = False

# Always show 'LLM' in the legend
fig.update_traces(showlegend=True, selector={'name': 'LLM'})


fig.show()


Formatting author names: 100%|███████████████| 30/30 [00:00<00:00, 70611.18it/s]
Filtering publications: 100%|███████████████████| 30/30 [00:22<00:00,  1.34it/s]
