# Loading Dataset

In [1]:
import warnings
warnings.simplefilter(action='ignore')

In [2]:
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_colwidth', 100)

In [3]:
# we labeled 6509 articles by topic
# load flattened dataset
df = pd.read_csv('data/bcas_dataset_topics_flat.csv', sep='|')
df = df.rename(columns={'multiple_topics': 'topic'})

In [4]:
df.shape

(10843, 21)

In [5]:
df = df.dropna(subset='topic')

In [6]:
df.shape

(10449, 21)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10449 entries, 0 to 10842
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   url             10449 non-null  object 
 1   year            10449 non-null  int64  
 2   issue           10449 non-null  object 
 3   page            10449 non-null  object 
 4   views           10449 non-null  int64  
 5   downloads       10449 non-null  int64  
 6   title_cn        10449 non-null  object 
 7   title_en        5095 non-null   object 
 8   author_cn       6988 non-null   object 
 9   author_en       3093 non-null   object 
 10  affiliation_cn  6213 non-null   object 
 11  affiliation_en  3036 non-null   object 
 12  abstract_cn     7296 non-null   object 
 13  abstract_en     3161 non-null   object 
 14  keywords_cn     7827 non-null   object 
 15  keywords_en     3025 non-null   object 
 16  fund_project    1411 non-null   object 
 17  similar_ref     7583 non-null   obje

In [216]:
df.topic = df.topic.astype('int')

In [217]:
#origina full dataset
df_full = pd.read_csv('data/bcas_datase_fin.csv')
df_full = df_full[df_full['year'] < 2024]

# Topic Analysis

In [8]:
topic_info = pd.read_csv('data/topic_label_ontology_full.csv')
topic_info.columns

Index(['topic', 'keywords_cn', 'keywords_en', 'keywords_ru', 'topic_en',
       'topic_cn', 'topic_ru', 'subfield_en', 'subfield_cn', 'subfield_ru',
       'field_en', 'field_cn', 'field_ru', 'domain_en', 'domain_cn',
       'domain_ru'],
      dtype='object')

In [9]:
df = pd.merge(df, topic_info, how='left', on='topic')

## Number of Topics by Year

In [10]:
import plotly.express as px 
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "ggplot2"

# layout settings

def set_layout():
    return {
        'width': 800,
        'height': 400,
        'xaxis': {
            'title': '',
            'titlefont': {'size': 16, 'family': "Verdana"},
            'tickmode': 'array',
            'showline': True,
            'linecolor': 'black'
        },
        'yaxis': {
            'title':'',
            'titlefont': {'size': 16, 'family': "Verdana"},
            'showline': True,
            'linecolor': 'black'
        },
        'font': {
            'size': 14,
            'family': "Verdana"
        },
        'margin':{'r': 0, 'l':0, 't': 50, 'b':10}
    }

layout = set_layout()

color_palette = {
    'Economic & Social Sciences': '#CCBFFF', 
    'Natural Sciences': '#92d073', 
    'Applied Sciences': '#654CFF', 
    'Arts & Humanities': '#E51932',
    'Health Sciences':'#19B2FF'
    }

In [11]:
gp = df.groupby(['year', 'domain_en'])['topic_en'].nunique().reset_index()

fig = px.area(
    gp,
    x='year',
    y='topic_en',
    color='domain_en',
    color_discrete_map=color_palette,
)

fig.update_layout(
    layout,
    width=1000,
    xaxis=dict(minor=dict(ticks="inside", showgrid=True), type='category'),
    yaxis=dict(title="Topic Count"),
    #title='Number of Topics per Year, 1986-2023',
    showlegend=True,
    hovermode='x unified',
    margin=dict(b=0, r=250),
    legend=dict(title='Domain'),
)

fig.update_traces(
    hovertemplate='%{fullData.name}: %{y}<extra></extra>'
)

fig.show()

## Topic Statistics

In [12]:
# aggregate views by topic
topic_views = df.groupby('topic')['views'].agg(['sum', 'mean'])
topic_views.columns = ['views_total', 'views_avg']

# aggregate downloads by topic
topic_downloads = df.groupby('topic')['downloads'].agg(['sum', 'mean'])
topic_downloads.columns = ['downloads_total', 'downloads_avg']

# merge
topic_stats = pd.merge(topic_views, topic_downloads, left_index=True, right_index=True)

In [13]:
# calculate views and downloads shares
topic_stats['views_share'] = topic_stats['views_total'] / topic_stats['views_total'].sum()
topic_stats['downloads_share'] = topic_stats['downloads_total'] / topic_stats['downloads_total'].sum()

In [14]:
# add article count and share
topic_stats['article_count'] = df.groupby('topic').size().values
topic_stats['article_share'] = (df.topic.value_counts(normalize=True)).values

In [15]:
topic_stats.columns

Index(['views_total', 'views_avg', 'downloads_total', 'downloads_avg',
       'views_share', 'downloads_share', 'article_count', 'article_share'],
      dtype='object')

In [16]:
topic_stats = topic_stats.reset_index()
topic_stats = topic_stats[['topic', 'article_count', 'article_share',
                            'views_total', 'views_avg', 'views_share',
                            'downloads_total', 'downloads_avg', 'downloads_share']]

In [17]:
topic_stats = pd.merge(topic_stats, topic_info, how='left', on='topic')

In [18]:
topic_stats

Unnamed: 0,topic,article_count,article_share,views_total,views_avg,views_share,downloads_total,downloads_avg,downloads_share,keywords_cn,keywords_en,keywords_ru,topic_en,topic_cn,topic_ru,subfield_en,subfield_cn,subfield_ru,field_en,field_cn,field_ru,domain_en,domain_cn,domain_ru
0,0.0,624,0.059719,1528313,2449.219551,0.064253,1089402,1745.836538,0.05474,"[创新, 知识, 合作, 科技, 人才, 智库, 工程, 工作, 中国科学院, 培养]","[innovation, knowledge, cooperation, science and technology, talent, think tank, engineering, wo...","[инновации, знания, сотрудничество, технологии, талант, аналитический центр, инженерия, работа, ...",S&T Talent Cultivation,科技人才培养,Воспитание научно-технических талантов,Education,教育,Образование,Social Sciences,社会科学,Общественные науки,Economic & Social Sciences,经济与社会科学,Гуманитарные науки
1,1.0,340,0.053402,795842,2340.711765,0.033459,594437,1748.344118,0.029869,"[年度, 简介, 实验室, 中国科学院, 一等奖, 重点, 国家, 科学技术, 进步奖, 研究所]","[annual, introduction, laboratory, chinese academy of sciences, first prize, key, national, scie...","[ежегодный, введение, лаборатория, китайская академия наук, первая премия, ключевая, национальна...",Development History and Scientific Achievements of the Chinese Academy of Sciences,中国科学院发展历程与科研成果,История развития и научные достижения КАН,Science Studies,科学学,Исследования науки,Social Sciences,社会科学,Общественные науки,Economic & Social Sciences,经济与社会科学,Гуманитарные науки
2,2.0,558,0.032539,1415220,2536.236559,0.059498,1114275,1996.908602,0.05599,"[创新, 科技, 强国, 改革, 世界, 建设, 科技体制, 发展, 战略, 政策]","[innovation, science and technology, strong country, reform, world, construction, science and te...","[инновации, наука и технологии, сильная страна, реформа, мир, строительство, научно-техническая ...",S&T Innovation and Superpower Strategy,中国科技创新与强国战略,Научно-технические инновации и стратегия сверхдержавы,"Strategic, Defence & Security Studies",战略、国防与安全研究,"Стратегия, оборона и безопасность",Enabling & Strategic Technologies,使能技术与战略技术,Прикладные и стратегические технологии,Applied Sciences,应用科学,Прикладные науки
3,3.0,196,0.019811,520340,2654.795918,0.021876,540190,2756.071429,0.027143,"[生物学, 生物, 细胞, 基因, 合成, 生命科学, 分子, 生命, 蛋白质, 干细胞]","[biology, biology, cell, gene, synthesis, life science, molecule, life, protein, stem cell]","[биология, организм, клетка, ген, синтез, наука о жизни, молекула, жизнь, белок, стволовая клетка]",Biotechnology Development,生物技术发展,Развитие биотехнологий,Biotechnology,生物技术,Биотехнологии,Enabling & Strategic Technologies,使能技术与战略技术,Прикладные и стратегические технологии,Applied Sciences,应用科学,Прикладные науки
4,4.0,147,0.018758,395219,2688.564626,0.016616,316695,2154.387755,0.015913,"[湿地, 水资源, 长江, 生态, 流域, 湖泊, 海洋, 经济带, 保护, 对策]","[wetland, water resources, yangtze river, ecology, river basin, lake, ocean, economic zone, prot...","[водно-болотные угодья, водные ресурсы, река янцзы, экология, речной бассейн, озеро, океан, экон...",Water Resources Management and Ecological Protection,长江流域水资源管理与生态保护,Управление водными ресурсами и экологическая защита,Environmental Engineering,环境工程,Инженерная экология,Engineering,工程学,Технические науки,Applied Sciences,应用科学,Прикладные науки
5,5.0,151,0.018184,323727,2143.887417,0.01361,273989,1814.496689,0.013767,"[试验站, 生态系统, 观测, 中国科学院, 森林, 野外, 定位, 生态, 研究所, 青藏高原]","[experimental station, ecosystem, observation, chinese academy of sciences, forest, field, posit...","[экспериментальная станция, экосистема, наблюдение, китайская академия наук, лес, поле, позицион...",CAS Field Observation Station Network,中科院野外观测站网络,Сеть полевых наблюдательных станций КАН,Environmental Sciences,环境科学,Науки об окружающей среде,Earth & Environmental Sciences,地球与环境科学,Науки о Земле и науки об окружающей среде,Natural Sciences,自然科学,Естественные науки
6,6.0,176,0.017705,385668,2191.295455,0.016214,323058,1835.556818,0.016233,"[科普, 专题报告, 自然科学, 科学, 调研, 学科, 智库, 文化, 全国, 科学普及]","[popular science, special report, natural science, science, research, discipline, think tank, cu...","[популяризация науки, специальный репортаж, естествознание, наука, исследование, предмет, аналит...",Science Popularization and Think Tank Construction,科学普及与智库建设,Популяризация науки и создание аналитических центров,Science Studies,科学学,Исследования науки,Social Sciences,社会科学,Общественные науки,Economic & Social Sciences,经济与社会科学,Гуманитарные науки
7,7.0,132,0.017227,322984,2446.848485,0.013579,306305,2320.492424,0.015391,"[能源, 发电, 再生能源, 分布式, 燃料电池, 光伏, 新能源, 电网, 太阳能, 储能]","[energy, power generation, renewable energy, distributed, fuel cell, photovoltaic, new energy, p...","[энергетика, производство электроэнергии, возобновляемые источники энергии, распределенная энерг...",Renewable Energy,可再生能源,Возобновляемая энергия,Energy,能源,Энергетика,Enabling & Strategic Technologies,使能技术与战略技术,Прикладные и стратегические технологии,Applied Sciences,应用科学,Прикладные науки
8,8.0,165,0.017035,171243,1037.836364,0.007199,141168,855.563636,0.007093,"[紫外, 激光, 射线, 光谱仪, 成像仪, 伽玛, 相机, 固态, 拉曼, 曼光谱]","[ultraviolet, laser, ray, spectrometer, imager, gamma, camera, solid state, raman, mann spectros...","[уф, лазер, луч, спектрометр, формирователь изображения, гамма, камера, твердотельное тело, комб...",Optical and Spectroscopic Detection Instruments,光学与光谱探测仪器,Оптические и спектроскопические приборы обнаружения,Optoelectronics & Photonics,光电子学与光子学,Оптоэлектроника и фотоника,Enabling & Strategic Technologies,使能技术与战略技术,Прикладные и стратегические технологии,Applied Sciences,应用科学,Прикладные науки
9,9.0,147,0.016939,352933,2400.904762,0.014838,311003,2115.666667,0.015627,"[气候变化, 气候, 全球, 应对, 冰冻, 变化, 排放, 减排, 影响, 陆地]","[climate change, climate, global, response, freezing, change, emission, emission reduction, impa...","[изменение климата, климат, глобальный, ответные меры, замораживание, изменение, выбросы, сокращ...","Global Climate Change: Impact, Adaptation, and Mitigation Strategy",全球气候变化: 影响、适应与减缓策略研究,"Глобальное изменение климата: воздействие, адаптация и стратегия смягчения последствий",Environmental Sciences,环境科学,Науки об окружающей среде,Earth & Environmental Sciences,地球与环境科学,Науки о Земле и науки об окружающей среде,Natural Sciences,自然科学,Естественные науки


## Category Overlap

### Topic Overlap

In [19]:
from itertools import combinations

topic_title_sets = {topic: set(df[df['topic_en'] == topic]['title_cn']) for topic in df['topic_en'].unique()}

topics = list(topic_title_sets.keys())
overlap_data = []

for topic1, topic2 in combinations(topics, 2):
    overlap = len(topic_title_sets[topic1].intersection(topic_title_sets[topic2]))
    overlap_data.append({'topic1': topic1, 'topic2': topic2, 'overlap': overlap})

overlap_df = pd.DataFrame(overlap_data)

In [20]:
overlap_df.sort_values(by='overlap', ascending=False).head(50)

Unnamed: 0,topic1,topic2,overlap
585,S&T Innovation and Superpower Strategy,S&T Talent Cultivation,235
2805,Open Science and National Key Laboratories,Open Science and S&T Innovation Policy Research,143
3568,CAS Academicians and Academic Divisions Work,CAS Divisions and Academicians: History and Development,91
2535,Development History and Scientific Achievements of the Chinese Academy of Sciences,CAS Leaders Appointments and Profiles,83
432,Belt and Road Initiative: S&T Cooperation,Belt and Road Initiative: S&T Innovation and Sustainable Development,74
672,Sustainable Development in China,Earth Big Data for Sustainable Development,65
2538,Development History and Scientific Achievements of the Chinese Academy of Sciences,Open Science and National Key Laboratories,56
2539,Development History and Scientific Achievements of the Chinese Academy of Sciences,Open Science and S&T Innovation Policy Research,56
1228,S&T Talent Cultivation,Graduate Education and Talent Cultivation,51
4328,Optical and Spectroscopic Detection Instruments,High-Power Solid-State Lasers and Deep Ultraviolet Spectral Imaging Systems,49


### Subfield Overlap

In [21]:
df.columns

Index(['url', 'year', 'issue', 'page', 'views', 'downloads', 'title_cn',
       'title_en', 'author_cn', 'author_en', 'affiliation_cn',
       'affiliation_en', 'abstract_cn', 'abstract_en', 'keywords_cn_x',
       'keywords_en_x', 'fund_project', 'similar_ref', 'text',
       'text_tokenized', 'topic', 'keywords_cn_y', 'keywords_en_y',
       'keywords_ru', 'topic_en', 'topic_cn', 'topic_ru', 'subfield_en',
       'subfield_cn', 'subfield_ru', 'field_en', 'field_cn', 'field_ru',
       'domain_en', 'domain_cn', 'domain_ru'],
      dtype='object')

In [22]:
subfield_df = df[
    ['year', 'views', 'downloads', 'title_cn', 'author_cn',
    'subfield_en', 'subfield_cn', 'subfield_ru',
    'field_en', 'field_cn', 'field_ru',
    'domain_en', 'domain_cn', 'domain_ru']
]
subfield_df = subfield_df.drop_duplicates()

In [23]:
subfield_df.shape

(9143, 14)

In [24]:
from itertools import combinations

subfield_title_sets = {subfield: set(subfield_df[subfield_df['subfield_en'] == subfield]['title_cn']) for subfield in subfield_df['subfield_en'].unique()}

subfields = list(subfield_title_sets.keys())
overlap_data = []

for subfield1, subfield2 in combinations(subfields, 2):
    overlap = len(subfield_title_sets[subfield1].intersection(subfield_title_sets[subfield2]))
    overlap_data.append({'subfield1': subfield1, 'subfield2': subfield2, 'overlap': overlap})

overlap_subfield_df = pd.DataFrame(overlap_data)

In [25]:
overlap_subfield_df.sort_values(by='overlap', ascending=False).head(50)

Unnamed: 0,subfield1,subfield2,overlap
194,"Strategic, Defence & Security Studies",Education,240
355,Education,Science Studies,127
397,Science Studies,"History of Science, Technology & Medicine",104
368,Education,"History of Science, Technology & Medicine",76
45,Environmental Sciences,Ecology,62
195,"Strategic, Defence & Security Studies",Science Studies,61
44,Environmental Sciences,Environmental Engineering,55
0,Energy,Environmental Sciences,51
53,Environmental Sciences,Agronomy & Agriculture,48
55,Environmental Sciences,Information Systems,47


In [32]:
import numpy as np

subfields = sorted(overlap_subfield_df['subfield1'].unique())

heatmap_df = overlap_subfield_df.pivot(index='subfield1', columns='subfield2', values='overlap')
heatmap_df = heatmap_df.reindex(index=subfields, columns=subfields)
heatmap_df = heatmap_df.fillna(0)
heatmap_df = heatmap_df + heatmap_df.T


fig = px.imshow(heatmap_df,
                labels=dict(x="Subfield", y="Subfield", color="Overlap"),
                color_continuous_scale="Blues",
                text_auto='.0f',
                template='plotly_white')

fig.update_layout(
    layout,
    #title="Overlap Between Subfields",
    width=1200,
    height=1200,
    xaxis_showgrid=False,
    yaxis_showgrid=False
)

# Show the plot
fig.show()


### Field Overlap

In [33]:
field_df = df[
    ['year', 'views', 'downloads', 'title_cn', 'author_cn',
    'field_en', 'field_cn', 'field_ru',
    'domain_en', 'domain_cn', 'domain_ru']
]
field_df = field_df.drop_duplicates()

In [34]:
field_df = field_df[field_df['year'] > 2012]

In [35]:
field_df.shape

(4158, 11)

In [36]:
from itertools import combinations

field_title_sets = {field: set(field_df[field_df['field_en'] == field]['title_cn']) for field in field_df['field_en'].unique()}

fields = list(field_title_sets.keys())
overlap_data = []

for field1, field2 in combinations(fields, 2):
    overlap = len(field_title_sets[field1].intersection(field_title_sets[field2]))
    overlap_data.append({'field1': field1, 'field2': field2, 'overlap': overlap})

overlap_field_df = pd.DataFrame(overlap_data)

In [38]:
fields = sorted(overlap_field_df['field1'].unique())

heatmap_df = overlap_field_df.pivot(index='field1', columns='field2', values='overlap')
heatmap_df = heatmap_df.reindex(index=fields, columns=fields)
heatmap_df = heatmap_df.fillna(0)
heatmap_df = heatmap_df + heatmap_df.T

np.fill_diagonal(heatmap_df.values, 0)

fig = px.imshow(heatmap_df,
                text_auto='.0f',
                labels=dict(x="field", y="field", color="Overlap"),
                color_continuous_scale="Blues")

fig.update_layout(
    layout,
    #title="Overlap Between Fields",
    xaxis_tickangle=-270,
    width=900,
    height=800,
    xaxis_showgrid=False,
    yaxis_showgrid=False
)

fig.show()

### Domain Overlap

In [39]:
domain_df = df[
    ['year', 'views', 'downloads', 'title_cn', 'author_cn',
    'domain_en', 'domain_cn', 'domain_ru']
]
domain_df = domain_df.drop_duplicates()
domain_df.shape

(8167, 8)

In [40]:
from itertools import combinations

domain_title_sets = {domain: set(domain_df[domain_df['domain_en'] == domain]['title_cn']) for domain in domain_df['domain_en'].unique()}

domains = list(domain_title_sets.keys())
overlap_data = []

for domain1, domain2 in combinations(domains, 2):
    overlap = len(domain_title_sets[domain1].intersection(domain_title_sets[domain2]))
    overlap_data.append({'domain1': domain1, 'domain2': domain2, 'overlap': overlap})

overlap_domain_df = pd.DataFrame(overlap_data)

In [41]:
overlap_domain_df.sort_values(by='domain1', ascending=True)

Unnamed: 0,domain1,domain2,overlap
0,Applied Sciences,Natural Sciences,687
1,Applied Sciences,Economic & Social Sciences,631
2,Applied Sciences,Health Sciences,168
3,Applied Sciences,Arts & Humanities,92
7,Economic & Social Sciences,Health Sciences,62
8,Economic & Social Sciences,Arts & Humanities,171
9,Health Sciences,Arts & Humanities,7
4,Natural Sciences,Economic & Social Sciences,205
5,Natural Sciences,Health Sciences,86
6,Natural Sciences,Arts & Humanities,36


# Article Distributions By Category

In [42]:
# Count unique articles per domain per year
domain = df.groupby(['year', 'domain_en'])['title_cn'].nunique().reset_index(name='count')
domain = domain.pivot_table(index='year', columns='domain_en', values='count', fill_value=0)
domain = domain.div(domain.sum(axis=1), axis=0).reset_index()

fig = px.area(domain, x='year', y=[col for col in domain.columns if col != 'year'],
              color_discrete_map=color_palette)

fig.update_layout(
    layout,
    width=1000,
    #title='Distribution of Articles by Domain, 1986-2023',
    xaxis=dict(
        minor=dict(ticks="inside", showgrid=True),
        type='category'
    ),
    yaxis=dict(
        title="",
        tickformat='.0%'
    ),
    showlegend=True,
    legend=dict(title='Domain'),
    hovermode='x unified',
    margin=dict(b=0, r=250)
)

fig.update_traces(
    hovertemplate='%{fullData.name}: %{y:.1%}<extra></extra>'
)

fig.show()

In [43]:
gp = df.groupby(['year', 'field_en'])['title_cn'].nunique().reset_index(name='count')
gp['share'] = gp['count'] / gp.groupby('year')['count'].transform('sum')*100
gp = gp[gp['year'] > 2012]

# pivot the df to create a matrix & fill nan values with 0
heatmap_data = gp.pivot(index='field_en', columns='year', values='share')
heatmap_data = heatmap_data.fillna(0)

fig = px.imshow(heatmap_data,
                text_auto = '.2f',
                color_continuous_scale='Blues',
                labels = dict(x = "Year", y = "", color = "Share, %"))

fig.update_layout(
    layout,
    #title='Distribution of Articles by Field, 2013-2023',
    width=900,
    height=600,
    xaxis=dict(
        side = "bottom",
        tickmode='array', 
        tickvals=gp['year'].unique()
    )
)

fig.update_traces(
    hovertemplate='Year: %{x}<br>Field: %{y}<br>Share: %{z:.2f}%<extra></extra>'
)

fig.show()

In [44]:
gp = df.groupby(['year', 'subfield_en'])['title_cn'].nunique().reset_index(name='count')
gp['share'] = gp['count'] / gp.groupby('year')['count'].transform('sum')*100
gp = gp[gp['year'] > 2012]

# pivot the df to create a matrix & fill nan values with 0
heatmap_data = gp.pivot(index='subfield_en', columns='year', values='share')
heatmap_data = heatmap_data.fillna(0)

fig = px.imshow(heatmap_data,
                text_auto = '.2f',
                template='plotly_white',
                color_continuous_scale='Blues',
                labels = dict(x = "Year", y = "", color = "Share, %"))

fig.update_layout(
    layout,
    width=900,
    height=1500,
    #title='Distribution of Articles by Field, 2013-2023',
    xaxis=dict(
        side = "bottom",
        tickmode='array', 
        tickvals=gp['year'].unique()
    )
)

fig.update_traces(
    hovertemplate='Year: %{x}<br>Subfield: %{y}<br>Share: %{z:.2f}%<extra></extra>'
)

fig.show()

# Views & Downloads


* Identification of high-performing topics (high views and downloads)
* Outliers: topics with disproportionately high views but low downloads or vice versa

In [45]:
correlation = topic_stats.downloads_total.corr(topic_stats.views_total)
print(f'Correlation between downloads total and views total: {round(correlation, 2)}')

Correlation between downloads total and views total: 0.99


In [46]:
fig = px.scatter(
    topic_stats,
    x='downloads_total',
    y='views_total',
    color='domain_en',
    color_discrete_map=color_palette,
    hover_data=['topic_en'],
    log_x=False
)

fig.update_layout(
    layout,
    xaxis=dict(title="Downloads"),
    yaxis=dict(title="Views"),
    legend=dict(title='Domain')
)

fig.update_traces(
    marker=dict(size=10),
    hovertemplate='Topic: %{customdata[0]}<br>Downloads: %{x}<br>Views: %{y}<extra></extra>'
)

fig.show()

In [250]:
correlation = topic_stats.article_count.corr(topic_stats.views_total)
print(f'Correlation between article count and views total: {round(correlation, 2)}')

Correlation between article count and views total: 0.97



* Topics that are over-performing (high views relative to article count) or under-performing

Possible explanations:
* Topics with more articles might be more comprehensive or popular, attracting more views.
* There could be a content strategy that results in a consistent view rate across articles.

Considerations:
* This high correlation suggests that simply producing more content is strongly associated with getting more views.
* It might indicate that content quality or topic popularity has a smaller impact on views than the sheer volume of content.

In [47]:
fig = px.scatter(
    topic_stats,
    x='views_total',
    y='article_count',
    color='domain_en',
    color_discrete_map=color_palette,
    hover_data=['topic_en'],
    log_x=False
    )

fig.update_layout(
    layout,
    xaxis=dict(title="Views"),
    yaxis=dict(title="Article Count"),
    legend=dict(title='Domain')
)

fig.update_traces(
    marker=dict(size=10),
    hovertemplate='Topic: %{customdata[0]}<br>Views: %{x}<br>Articles: %{y}<extra></extra>'
)

fig.show()

In [None]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
test = topic_stats_past_decade
Q1 = test[['views_total', 'article_count']].quantile(0.25)
Q3 = test[['views_total', 'article_count']].quantile(0.75)
IQR = Q3 - Q1

# Determine if a point is an outlier
outliers_iqr = ((test[['views_total', 'article_count']] < (Q1 - 1.5 * IQR)) | 
                 (test[['views_total', 'article_count']] > (Q3 + 1.5 * IQR))).any(axis=1)

# Add a column to your DataFrame to flag outliers
test['outlier_iqr'] = outliers_iqr
test[test['outlier_iqr']==True][['topic_en', 'outlier_iqr']]

Topic Popularity: 

* Topics that are getting disproportionate views relative to their article count
* Identification of "efficient" topics (small article share but large view share)

In [252]:
correlation = topic_stats.views_share.corr(topic_stats.article_share)
print(f'Correlation between views share and article share: {round(correlation, 2)}')

Correlation between views share and article share: 0.79


* Generally, topics with a larger share of articles tend to have a larger share of views. However, there's more variation in this relationship compared to the absolute numbers.
* Some topics are likely performing better or worse in terms of views relative to their article count. Some topics might be more "efficient" at attracting views (high view share relative to article share).
* Other topics might be "overrepresented" in terms of article count but not attracting proportional views.
* Look for topics with lower view share compared to article share.

In [48]:
fig = px.scatter(
    topic_stats,
    x='views_share',
    y='article_share',
    color='domain_en',
    color_discrete_map=color_palette,
    hover_data=['topic_en'],
    log_x=False
)

fig.update_layout(
    layout,
    title='Views Share vs. Article Share by Topic',
    xaxis=dict(
        title="Views Share",
        tickformat=',.0%',
        hoverformat='.2%'
    ),
    yaxis=dict(
        title="Article Share",
        tickformat=',.0%',  # Format as percentage with no decimal places
        hoverformat='.2%'  # Show 2 decimal places on hover
    ),
    legend=dict(title='Domain')
)

fig.update_traces(
    marker=dict(size=10),
    hovertemplate='Topic: %{customdata[0]}<br>Views Share: %{x:.2%}<br>Article Share: %{y:.2%}<extra></extra>'
)

fig.show()

Calculate a "view efficiency" metric (view share / article share) to easily compare topic performance.

In [50]:
topic_stats['views_efficiency'] = topic_stats.views_share/topic_stats.article_share
topic_stats.views_efficiency.describe()

count    100.000000
mean       1.099025
std        0.753777
min        0.167022
25%        0.626129
50%        0.942919
75%        1.298138
max        4.851127
Name: views_efficiency, dtype: float64

In [51]:
# top topics by view efficiency
topic_stats[['topic_en', 'views_efficiency']].sort_values(by='views_efficiency', ascending=False).head(15)

Unnamed: 0,topic_en,views_efficiency
86,CAS Divisions and Academicians: History and Development,4.851127
98,Belt and Road Initiative: S&T Cooperation,3.507787
97,Nanotechnology Ethics and Governance,3.048172
67,Open Science and S&T Innovation Policy Research,3.004587
61,CAS Leaders Appointments and Profiles,2.793558
96,Disaster Mitigation,2.550853
51,Earth Big Data for Sustainable Development,2.417323
64,Open Science and National Key Laboratories,2.406713
41,Sustainable Development in China,2.047574
53,Graduate Education and Talent Cultivation,2.041947


In [52]:
correlation = topic_stats.views_efficiency.corr(topic_stats.views_total)
print(f'Correlation between views efficiency and views total: {round(correlation, 2)}')

Correlation between views efficiency and views total: 0.35


In [53]:
fig = px.scatter(
    topic_stats,
    x='views_efficiency',
    y='article_share',
    color='domain_en',
    color_discrete_map=color_palette,
    hover_data=['topic_en'],
    log_x=False
    )

fig.update_layout(
    layout,
    title='Views Efficiency vs Article Share by Topic',
    xaxis=dict(
        title="Views Efficiency",
        tickformat='.2',
        ),
    yaxis=dict(
        title="Article Share",
        tickformat=',.0%',
        hoverformat='.2%'
        ),
    legend=dict(title='Domain')
)

fig.update_traces(
    marker=dict(size=10),
    hovertemplate='Topic: %{customdata[0]}<br>Views Efficiency: %{x}<br>Article Share: %{y}<extra></extra>'
)

fig.show()

In [260]:
correlation = topic_stats.views_avg.corr(topic_stats.article_share)
print(f'Correlation between views average and article share: {round(correlation, 2)}')

Correlation between views average and article share: 0.15


* Topics with a larger share of articles don't necessarily have higher average views per article, and vice versa. 
* Topics with few articles can have high average views, and topics with many articles can have low average views.
* Some topics might have fewer articles but each article is high-quality or high-interest, leading to high average views.
* Topics with a high share of articles might be oversaturated, leading to competition for views within the topic.

In [55]:
fig = px.scatter(
    topic_stats,
    x='article_share',
    y='views_avg',
    color='domain_en',
    size='article_count',
    color_discrete_map=color_palette,
    hover_data=['topic_en'],
    log_x=False
    )

fig.update_layout(
    layout,
    title='Views Average vs. Article Share by Topic',
    xaxis=dict(
        title="Article Share",
        tickformat=',.0%',
        hoverformat='.2%'
        ),
    yaxis=dict(title="Views Average"),
    legend=dict(title='Domain')
)

fig.update_traces(
    marker=dict(size=10),
    hovertemplate='Topic: %{customdata[0]}<br>Article Share: %{x}<br>Views Average: %{y}<extra></extra>'
)
fig.show()

* Identify topics with high average views but low article share. These might be high-performing areas where more content could be beneficial.
* Look for topics with high article share but low average views. These might be oversaturated or underperforming areas.

# Organizations

In [56]:
#1986-2023
orgs = pd.read_csv('data/orgs.csv')
orgs = orgs[orgs['year'] < 2024]

In [57]:
orgs.shape

(4841, 8)

In [58]:
topics_orgs = pd.merge(
    orgs,
    df[['title_cn', 'topic_en', 'subfield_en', 'field_en', 'domain_en']],
    on='title_cn', how='left')

In [59]:
topics_orgs = topics_orgs.dropna(subset='topic_en')
topics_orgs = topics_orgs.dropna(subset='orgs_head')
topics_orgs = topics_orgs.drop_duplicates()

In [61]:
topics_orgs.shape

(6807, 12)

In [267]:
gp = topics_orgs.groupby('topic_en')['orgs_head'].nunique().reset_index()
mapping_dict = dict(zip(gp['topic_en'], gp['orgs_head']))
topic_stats['org_count'] = topic_stats['topic_en'].map(mapping_dict)
topic_stats['org_share'] = topic_stats['org_count']/topic_stats['org_count'].sum()

In [62]:
topics_orgs.head()

Unnamed: 0,url,org_cn,city_cn,city_en,org_cn_head,orgs_head,title_cn,year,topic_en,subfield_en,field_en,domain_en
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1,中国石油勘探开发研究院,北京,Beijing,中国石油勘探开发研究院,Research Institute of Petroleum Exploration and Development,油气安全战略与“双碳”战略：关系与路径,2023,Renewable Energy,Energy,Enabling & Strategic Technologies,Applied Sciences
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1,中国石油勘探开发研究院,北京,Beijing,中国石油勘探开发研究院,Research Institute of Petroleum Exploration and Development,油气安全战略与“双碳”战略：关系与路径,2023,Carbon Neutrality Goals,Environmental Sciences,Earth & Environmental Sciences,Natural Sciences
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1,国家油气战略研究中心,北京,Beijing,国家油气战略研究中心,National Oil and Gas Strategic Research Center,油气安全战略与“双碳”战略：关系与路径,2023,Renewable Energy,Energy,Enabling & Strategic Technologies,Applied Sciences
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1,国家油气战略研究中心,北京,Beijing,国家油气战略研究中心,National Oil and Gas Strategic Research Center,油气安全战略与“双碳”战略：关系与路径,2023,Carbon Neutrality Goals,Environmental Sciences,Earth & Environmental Sciences,Natural Sciences
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230102&flag=1,深部煤矿采动响应与灾害防控国家重点实验室,淮南,Huainan,深部煤矿采动响应与灾害防控国家重点实验室,State Key Laboratory of Mining Response and Disaster Prevention In Deep Coal Mines,我国煤炭主体能源安全高质量发展的理论技术思考,2023,Carbon Neutrality Goals,Environmental Sciences,Earth & Environmental Sciences,Natural Sciences


## By Domain

In [63]:
#one organization can fit into multiple categories
gp = topics_orgs.groupby('domain_en')['orgs_head'].nunique().sort_values(ascending=False).reset_index(name='count')

fig = px.bar(gp,
             y='domain_en',
             x='count',
             orientation='h',
             color='domain_en',
             color_discrete_map=color_palette)

fig.update_layout(
    layout,
    xaxis=dict(
        title='Count',
        range=[0, gp['count'].max() * 1.1]
        ),
    #title='Number of Organizations by Domain, 2013-2023',
    showlegend=False,
)

fig.update_traces(
    textposition='outside', 
    texttemplate='%{x}',
    textfont=dict(color='black'), 
    opacity=0.7,
    hovertemplate='Domain: %{y}<br>Organizations: %{x}<extra></extra>'
)

fig.show()

In [64]:
from plotly.subplots import make_subplots

domains = topics_orgs['domain_en'].dropna().unique()

fig = make_subplots(rows=len(domains), cols=1, subplot_titles=domains, vertical_spacing=0.1)

for i, domain in enumerate(domains, 1):
    domain_data = topics_orgs[topics_orgs['domain_en'] == domain]
    top_topics_orgs = domain_data['orgs_head'].value_counts().nlargest(5).reset_index()
    top_topics_orgs.columns = ['orgs_head', 'count']
    
    fig.add_trace(
        go.Bar(
            y=top_topics_orgs['orgs_head'],
            x=top_topics_orgs['count'],
            orientation='h',
            marker_color=color_palette.get(domain, 'blue'),
            opacity=0.7,
            text=top_topics_orgs['count'],
            textposition='outside',
        ),
        row=i, col=1
    )
    
    fig.update_xaxes(
        title="Count",
        showline=True,
        linewidth=1,
        linecolor='black',
        range=[0, 400],      
        row=i, col=1
    )
    fig.update_yaxes(
        autorange='reversed',
        title="",
        showline=True,
        linewidth=1,
        linecolor='black',
        row=i, col=1
    )

fig.update_layout(
    layout,
    width=800,
    height=1200,
    font=dict(color='black', size=14, family='Verdana'),
    showlegend=False,
)

fig.update_traces(
    textposition='outside', 
    texttemplate='%{x}',
    textfont=dict(color='black'), 
    opacity=0.7,
    hovertemplate='Organization: %{y}<br>Affiliations: %{x}<extra></extra>'
)

fig.show()

## By Field

In [65]:
# One organization can fit into multiple categories
gp = topics_orgs.groupby(['field_en', 'domain_en'])['orgs_head'].nunique().sort_values(ascending=True).reset_index(name='count')

fig = px.bar(gp,
             y='field_en',
             x='count',
             orientation='h',
             color='domain_en',
             color_discrete_map=color_palette
             )

fig.update_layout(
    layout,
    #title="Number of Organizations by Field, 2013-2023",
    width=950,
    height=700,
    xaxis=dict(
        title="Organizations",
        range=[0, 500]
    ),
    yaxis=dict(
        #autorange='reversed',
        title="",
    ),
    showlegend=True,
    legend_title=dict(
        text="Domain",
        font=dict(
            size=14,
            family="Verdana",
            color="black"
            )
        )
)

fig.update_traces(
    textposition='outside',  
    texttemplate='%{x}', 
    opacity=0.7,
    textfont=dict(color='black'),
    hovertemplate='Field: %{y}<br>Affiliations: %{x}<extra></extra>' 
)

fig.show()

## By Subfield

In [66]:
# One organization can fit into multiple categories
gp = topics_orgs.groupby(['subfield_en', 'domain_en'])['orgs_head'].nunique().sort_values(ascending=True).reset_index(name='count')

fig = px.bar(gp,
             y='subfield_en',
             x='count',
             orientation='h',
             color='domain_en',
             color_discrete_map=color_palette
             )

fig.update_layout(
    layout,
    #title="Number of Organizations by Subfield, 2013-2023",
    width=950,
    height=1000,
    xaxis=dict(
        title="Organizations",
        range=[0, 400]
    ),
    showlegend=True,
    legend_title=dict(
        text="Domain",
        font=dict(size=14, family="Verdana", color="black")
        )
)

fig.update_traces(
    textposition='outside',  
    texttemplate='%{x}',  
    opacity=0.7,
    textfont=dict(color='black'),
    hovertemplate='Domain: %{y}<br>Affiliations: %{x}<extra></extra>' 
)

fig.show()

## Collaboration Network

In [68]:
# keep only the rows where the number of unique values in the 'orgs_head' column, grouped by the 'title_cn' column, is greater than 1
orgs_filtered = topics_orgs[topics_orgs.groupby('title_cn')['orgs_head'].transform('nunique') > 1]
orgs_filtered = orgs_filtered.drop(columns='topic_en')
orgs_filtered = orgs_filtered.drop_duplicates()

In [69]:
orgs_filtered.shape

(4458, 11)

In [71]:
from itertools import combinations

gp = orgs_filtered.groupby(['field_en', 'title_cn']).orgs_head.unique().reset_index()

# initialize list to hold collaboration pairs
collaboration_pairs = []

# iterate through the grouped data
for idx, row in gp.iterrows():
    orgs = row['orgs_head']
    title = row['title_cn']
    field = row['field_en']
    # generate all possible unique pairs of organizations
    if len(orgs) > 1:
        for pair in combinations(orgs, 2):
            collaboration_pairs.append((field, title, pair[0], pair[1]))

# create a df from the pairs
collaboration_df = pd.DataFrame(collaboration_pairs, columns=['field_en', 'title_cn', 'org1', 'org2'])

In [74]:
# calculate collaboration strength (=number of occurrences for a pair)
# group by field_en and organization pair, then count collaborations
collaboration_strength = collaboration_df.groupby(['field_en', 'org1', 'org2']).size().reset_index(name='strength')

# ensure each pair appears only once per field_en ((A, B) = (B, A))
collaboration_strength['org_pair'] = collaboration_strength.apply(lambda row: tuple(sorted([row['org1'], row['org2']])), axis=1)
collaboration_strength = collaboration_strength.groupby(['field_en', 'org_pair'])['strength'].sum().reset_index()

# split the org_pair back into separate columns
collaboration_strength[['org1', 'org2']] = pd.DataFrame(collaboration_strength['org_pair'].tolist(), index=collaboration_strength.index)

# drop the org_pair column
collaboration_strength = collaboration_strength.drop('org_pair', axis=1)

# sort
collaboration_strength = collaboration_strength.sort_values(['field_en', 'strength'], ascending=[True, False])

# reset the index
collaboration_strength = collaboration_strength.reset_index(drop=True)

In [75]:
collaboration_strength.sort_values(by='strength', ascending=False).head(10)

Unnamed: 0,field_en,strength,org1,org2
3238,Social Sciences,67,"Institutes of Science and Development, CAS",University of CAS
1740,Enabling & Strategic Technologies,43,"Institutes of Science and Development, CAS",University of CAS
3239,Social Sciences,24,"Institute of Geographic Sciences and Natural Resources Research, CAS",University of CAS
3240,Social Sciences,23,CAS,"Institutes of Science and Development, CAS"
3241,Social Sciences,23,CAS,University of CAS
666,Earth & Environmental Sciences,18,"Institutes of Science and Development, CAS",University of CAS
667,Earth & Environmental Sciences,17,"Institute of Geographic Sciences and Natural Resources Research, CAS",University of CAS
2620,Information & Communication Technologies,15,"Institutes of Science and Development, CAS",University of CAS
508,Built Environment & Design,15,"Institutes of Science and Development, CAS",University of CAS
509,Built Environment & Design,14,"Institute of Geographic Sciences and Natural Resources Research, CAS",University of CAS


In [78]:
collaboration_strength.groupby('field_en').strength.sum().sort_values(ascending=False)

field_en
Earth & Environmental Sciences              1262
Social Sciences                              946
Enabling & Strategic Technologies            596
Information & Communication Technologies     476
Engineering                                  444
Agriculture, Fisheries & Forestry            294
Biology                                      285
Economics & Business                         175
Built Environment & Design                   146
Psychology & Cognitive Sciences               91
Physics & Astronomy                           90
Clinical Medicine                             76
Public Health & Health Services               62
Biomedical Research                           59
Historical Studies                            38
Name: strength, dtype: int64

In [79]:
import networkx as nx
import plotly.graph_objects as go
import pandas as pd

def process_data(collaboration_strength):
    fields = sorted(collaboration_strength['field_en'].unique())
    graphs_by_field = {}
    all_nodes = set()
    
    for field in fields:
        field_data = collaboration_strength[collaboration_strength['field_en'] == field]
        G = nx.Graph()
        
        for _, row in field_data.iterrows():
            G.add_edge(row['org1'], row['org2'], weight=row['strength'])
            all_nodes.add(row['org1'])
            all_nodes.add(row['org2'])
        
        graphs_by_field[field] = G
    
    return graphs_by_field, all_nodes, fields

def plot_graph(G, pos, fig):
    edge_x, edge_y = [], []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(color='#0E86D4', width=1),
        hoverinfo='none',
        mode='lines'
    )
    
    node_x, node_y = [], []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
    
    node_adjacencies = [len(list(G.adj[node])) for node in G.nodes()]
    node_text = [f'{node}<br>Connections: {adj}' for node, adj in zip(G.nodes(), node_adjacencies)]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='Blues',
            reversescale=False,
            color=node_adjacencies,
            size=10,
            colorbar=dict(thickness=15, title='Connections', xanchor='left', titleside='right'),
            line_width=2
        ),
        text=node_text
    )

    fig.add_trace(edge_trace)
    fig.add_trace(node_trace)
    
    return fig

def create_visualization(collaboration_strength):
    graphs_by_field, all_nodes, fields = process_data(collaboration_strength)
    
    G_combined = nx.Graph()
    for G in graphs_by_field.values():
        G_combined = nx.compose(G_combined, G)
    
    pos = nx.spring_layout(G_combined, k=0.6, iterations=50)
    
    fig = go.Figure()
    
    for field in fields:
        fig = plot_graph(G=graphs_by_field[field], pos=pos, fig=fig)
    
    steps = []
    for i, field in enumerate(fields):
        visibility = [False] * len(fig.data)
        start_idx = i * 2
        end_idx = (i + 1) * 2
        for j in range(start_idx, end_idx):
            visibility[j] = True
        
        network_density = nx.density(graphs_by_field[field])
        
        step = dict(
            method="update",
            args=[
                {"visible": visibility},
                {"annotations": [dict(
                    text=f"Number of organizations: {len(graphs_by_field[field].nodes())}<br>Network Density: {network_density:.2%}",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002,
                    font=dict(family="Verdana")
                )]}
            ],
            label=f"{field}"
        )
        steps.append(step)
    
    fig.data[0].visible = True
    fig.data[1].visible = True
    
    initial_network_density = nx.density(graphs_by_field[fields[0]])
    
    fig.update_layout(
        template='plotly_white',
        width=1200,
        height=700,
        title='Collaboration Network by Field, 2013-2023',
        titlefont=dict(family="Verdana", size=16),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20,l=5,r=5,t=40),
        annotations=[dict(
            text=f"Number of organizations: {len(graphs_by_field[fields[0]].nodes())}<br>Network Density: {initial_network_density:.2%}",
            showarrow=False,
            xref="paper", yref="paper",
            x=0.005, y=-0.002,
            font=dict(family="Verdana")
        )],
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        font=dict(family="Verdana"),
        sliders=[{
            "active": 0,
            "currentvalue": {"prefix": "Field: "},
            "pad": {"t": 50},
            "steps": steps
        }],
    )
    
    return fig

fig = create_visualization(collaboration_strength)
fig.show()

Analyze the distribution of topics across different institutions

In [286]:
topics_orgs_en.groupby('orgs_head_en')['field_en'].value_counts().reset_index()

Unnamed: 0,orgs_head_en,field_en,count
0,Academy of Macroeconomic Research,Earth & Environmental Sciences,2
1,Academy of Macroeconomic Research,Biology,1
2,Academy of Macroeconomic Research,Engineering,1
3,"Academy of Macroeconomic Research, National Development and Reform Commission",Enabling & Strategic Technologies,1
4,Academy of Military Sciences,Biology,2
...,...,...,...
1706,Zhongguancun Science and Technology Park Management Committee,Enabling & Strategic Technologies,1
1707,Zhongguancun Science and Technology Park Management Committee,Social Sciences,1
1708,Zhongyu Gold Labeling Beijing Biotechnology,"Agriculture, Fisheries & Forestry",1
1709,"Zunyi First People's Hospital, Zunyi Medical University",Public Health & Health Services,1


# Authors

In [80]:
authors = pd.read_csv('data/authors_flat.csv')

In [81]:
authors.shape

(12837, 4)

In [82]:
exclude_list = ['not_specified', '本刊编辑部', '本刊特约评论员', '《中国科学院院刊》编辑部']
authors = authors.drop(authors[authors['author_cn'].isin(exclude_list)].index)

In [83]:
authors.shape

(10155, 4)

In [84]:
authors_topics = pd.merge(authors[['year', 'title_cn', 'author_cn']],
                          df[['title_cn', 'topic_en', 'domain_en', 'field_en', 'subfield_en']],
                          on='title_cn')

In [85]:
authors_topics.shape

(15314, 7)

In [86]:
authors_topics = authors_topics.dropna(subset='topic_en')

In [87]:
authors_topics.shape

(15314, 7)

In [88]:
#calculate author shares by topic, all time
author_topic_count = authors_topics.groupby('topic_en')['author_cn'].nunique().reset_index()
topic_stats = pd.merge(topic_stats, author_topic_count, on='topic_en')
topic_stats = topic_stats.rename(columns={'author_cn': 'author_count'})

In [89]:
topic_stats['author_share'] = topic_stats['author_count'] / topic_stats['author_count'].sum()

## By Domain

In [90]:
gp = authors_topics.groupby(['year', 'domain_en'])['author_cn'].nunique().reset_index()

fig = px.area(
    gp,
    x='year',
    y='author_cn',
    color='domain_en',
    color_discrete_map=color_palette,
)

fig.update_layout(
    layout,
    width=1000,
    xaxis=dict(minor=dict(ticks="inside", showgrid=True), type='category'),
    yaxis=dict(title="Author Count"),
    #title='Number of Authors by Domain, 1986-2023',
    showlegend=True,
    hovermode='x unified',
    legend=dict(title='Domain'),
)

fig.update_traces(
    hovertemplate='%{fullData.name}: %{y}<extra></extra>'
)

fig.show()

In [91]:
# Count unique articles per domain per year
gp = authors_topics.groupby(['year', 'domain_en'])['author_cn'].nunique().reset_index(name='count')
gp = gp.pivot_table(index='year', columns='domain_en', values='count', fill_value=0)
gp = gp.div(gp.sum(axis=1), axis=0).reset_index()

fig = px.area(gp, x='year', y=[col for col in gp.columns if col != 'year'],
              color_discrete_map=color_palette)

fig.update_layout(
    layout,
    width=1000,
    #title='Distribution of Authors by Domain, 1986-2023',
    xaxis=dict(
        minor=dict(ticks="inside", showgrid=True), type='category'
    ),
    yaxis=dict(
        title="",
        tickformat='.0%'
    ),
    showlegend=True,
    legend=dict(title='Domain'),
    hovermode='x unified',
    margin=dict(b=0, r=250)
)

fig.update_traces(
    hovertemplate='%{fullData.name}: %{y:.1%}<extra></extra>'
)


fig.show()

## By Field

In [92]:
gp = authors_topics.groupby(['year', 'field_en'])['author_cn'].nunique().reset_index(name='count')
gp = gp[gp['year'] > 2012]

heatmap_data_count = gp.pivot(index='field_en', columns='year', values='count')
heatmap_data_count = heatmap_data_count.fillna(0)

fig = px.imshow(heatmap_data_count,
                text_auto='.0f',
                color_continuous_scale='Blues',
                labels=dict(x="Year", y="", color="Count"))

fig.update_layout(
    layout,
    width=900,
    height=600,
    #title='Number of Authors by Field, 2013-2023',
    xaxis=dict(
        side="bottom",
        tickmode='array',
        tickvals=gp['year'].unique()
    )
)

fig.update_traces(
    hovertemplate='Year: %{x}<br>Field: %{y}<br>Authors: %{z:,.0f}<extra></extra>'
)

fig.show()

In [300]:
gp = authors_topics.groupby(['year', 'field_en'])['author_cn'].nunique().reset_index(name='count')
gp['share'] = gp['count'] / gp.groupby('year')['count'].transform('sum')*100
gp = gp[gp['year'] > 2012]

heatmap_data = gp.pivot(index='field_en', columns='year', values='share')
heatmap_data = heatmap_data.fillna(0)

fig = px.imshow(heatmap_data,
                text_auto = '.2f',
                color_continuous_scale='Blues',
                labels = dict(x = "Year", y = "", color = "Share, %"))

fig.update_layout(
    layout,
    title='Distribution of Authors by Field, 2013-2023',
    xaxis=dict(
        side = "bottom",
        tickmode='array', 
        tickvals=gp['year'].unique()
    )
)

fig.update_traces(
    hovertemplate='Year: %{x}<br>Field: %{y}<br>Share: %{z:.2f}%<extra></extra>'
)

fig.show()

## By Subfield

In [93]:
gp = authors_topics.groupby(['year', 'subfield_en'])['author_cn'].nunique().reset_index(name='count')
gp = gp[gp['year'] > 2012]

# Create a pivot table for absolute count values
heatmap_data_count = gp.pivot(index='subfield_en', columns='year', values='count')
heatmap_data_count = heatmap_data_count.fillna(0)

fig = px.imshow(heatmap_data_count,
                text_auto='.0f',  # Display whole numbers
                color_continuous_scale='Blues',
                labels=dict(x="Year", y="", color="Count"))

fig.update_layout(
    layout,
    width=900,
    height=1200,
    #title='Number of Authors by Subfield, 2013-2023',
    xaxis=dict(
        side="bottom",
        tickmode='array',
        tickvals=gp['year'].unique()
    )
)

# Update hovertemplate to show only the count
fig.update_traces(
    hovertemplate='Year: %{x}<br>Subfield: %{y}<br>Authors: %{z:,.0f}<extra></extra>'
)

fig.show()

In [94]:
gp = authors_topics.groupby(['year', 'subfield_en'])['author_cn'].nunique().reset_index(name='count')
gp['share'] = gp['count'] / gp.groupby('year')['count'].transform('sum')*100
gp = gp[gp['year'] > 2012]

# pivot the df to create a matrix & fill nan values with 0
heatmap_data = gp.pivot(index='subfield_en', columns='year', values='share')
heatmap_data = heatmap_data.fillna(0)

fig = px.imshow(heatmap_data,
                text_auto = '.2f',
                color_continuous_scale='Blues',
                labels = dict(x = "Year", y = "", color = "Share, %"))

fig.update_layout(
    layout,
    width=900,
    height=1200,
    #title='Share of Authors by Subfield, 2013-2023',
    xaxis=dict(
        side = "bottom",
        tickmode='array', 
        tickvals=gp['year'].unique()
    )
)

fig.update_traces(
    hovertemplate='Year: %{x}<br>Subfield: %{y}<br>Share: %{z:.2f}%<extra></extra>'
)

fig.show()

# Fund Projects

In [95]:
fund = pd.read_csv('data/fund_projects_flat.csv')
fund = fund[fund['year'] < 2024]

In [96]:
fund.shape

(1682, 4)

In [97]:
fund_topics = pd.merge(fund, df[['title_cn', 'topic_en', 'topic_ru', 'domain_en', 'domain_ru', 'field_en', 'field_ru', 'subfield_en', 'subfield_ru']],  on='title_cn', how='left')

In [98]:
fund_topics = fund_topics.dropna(subset=['fund_project', 'topic_en'])

In [99]:
fund_topics

Unnamed: 0,year,title_cn,fund_project,fund_project_clean,topic_en,topic_ru,domain_en,domain_ru,field_en,field_ru,subfield_en,subfield_ru
1,2023,美国联邦政府资助和管理阿尔茨海默病研究项目的实践与启示,中国科学院战略研究专项项目（GHJ-ZLZX-2022-22-3），中国科学院条件保障与财务局委托项目（E2J0471601）,中国科学院战略研究专项项目,Peer Review and Funding Allocation for Basic Research,Экспертная оценка и распределение финансирования для фундаментальных исследований,Economic & Social Sciences,Гуманитарные науки,Social Sciences,Общественные науки,Science Studies,Исследования науки
2,2023,美国联邦政府资助和管理阿尔茨海默病研究项目的实践与启示,中国科学院战略研究专项项目（GHJ-ZLZX-2022-22-3），中国科学院条件保障与财务局委托项目（E2J0471601）,中国科学院条件保障与财务局委托项目,Peer Review and Funding Allocation for Basic Research,Экспертная оценка и распределение финансирования для фундаментальных исследований,Economic & Social Sciences,Гуманитарные науки,Social Sciences,Общественные науки,Science Studies,Исследования науки
3,2023,新形势下我国能源高质量发展与能源安全,国家自然科学基金重点资助项目（51834006），国家重点研发计划项目（2017YFC0804304）,国家自然科学基金重点资助项目,Renewable Energy,Возобновляемая энергия,Applied Sciences,Прикладные науки,Enabling & Strategic Technologies,Прикладные и стратегические технологии,Energy,Энергетика
4,2023,新形势下我国能源高质量发展与能源安全,国家自然科学基金重点资助项目（51834006），国家重点研发计划项目（2017YFC0804304）,国家自然科学基金重点资助项目,Carbon Neutrality Goals,Достижение углеродной нейтральности,Natural Sciences,Естественные науки,Earth & Environmental Sciences,Науки о Земле и науки об окружающей среде,Environmental Sciences,Науки об окружающей среде
5,2023,新形势下我国能源高质量发展与能源安全,国家自然科学基金重点资助项目（51834006），国家重点研发计划项目（2017YFC0804304）,国家重点研发计划项目,Renewable Energy,Возобновляемая энергия,Applied Sciences,Прикладные науки,Enabling & Strategic Technologies,Прикладные и стратегические технологии,Energy,Энергетика
...,...,...,...,...,...,...,...,...,...,...,...,...
2526,1992,高技术及其社会化问题分析,国家社会科学基金资助项目,国家社会科学基金资助项目,Social Network Public Opinion Analysis,Анализ общественного мнения в социальных сетях,Health Sciences,Науки о здоровье,Psychology & Cognitive Sciences,Психология и когнитивные науки,Social Psychology,Социальная психология
2527,1989,分子有机地球化学研究取得重要进展,"本项工作为集体研究任务,主要参加者尚有彭平安、徐世平、G·Eglinton、S．C．Brassell等。课题任务一部分曾引入国家基金课题 (850303)、有机地球化学开放研究实验室课题(OG...",国家基金课题,Chemical Sciences and Technology,Химические науки и технологии,Applied Sciences,Прикладные науки,Engineering,Технические науки,Chemical Engineering,Химическая технология
2528,1989,分子有机地球化学研究取得重要进展,"本项工作为集体研究任务,主要参加者尚有彭平安、徐世平、G·Eglinton、S．C．Brassell等。课题任务一部分曾引入国家基金课题 (850303)、有机地球化学开放研究实验室课题(OG...",有机地球化学开放研究实验室课题,Chemical Sciences and Technology,Химические науки и технологии,Applied Sciences,Прикладные науки,Engineering,Технические науки,Chemical Engineering,Химическая технология
2529,1989,分子有机地球化学研究取得重要进展,"本项工作为集体研究任务,主要参加者尚有彭平安、徐世平、G·Eglinton、S．C．Brassell等。课题任务一部分曾引入国家基金课题 (850303)、有机地球化学开放研究实验室课题(OG...",六五煤成气攻关课题,Chemical Sciences and Technology,Химические науки и технологии,Applied Sciences,Прикладные науки,Engineering,Технические науки,Chemical Engineering,Химическая технология


In [100]:
fund_topics.to_csv('data/fund_topics.csv', index=False)

In [101]:
fund_topics_count = fund_topics.groupby('topic_en')['fund_project'].count().reset_index()
mapping_dict = dict(zip(fund_topics_count['topic_en'], fund_topics_count['fund_project']))
topic_stats['fund_project_count'] = topic_stats['topic_en'].map(mapping_dict)
topic_stats['fund_project_count'] = topic_stats['fund_project_count'].fillna(0)
topic_stats['fund_project_share'] = topic_stats['fund_project_count']/topic_stats['fund_project_count'].sum()*100

## By Domain

In [102]:
gp = fund_topics.groupby(['year', 'domain_en'])['title_cn'].nunique().reset_index(name='count')
gp = gp[(gp['year'] > 2012) & (gp['domain_en'] != 'Arts & Humanities')]

fig = px.area(
    gp,
    x='year',
    y='count',
    color='domain_en',
    color_discrete_map=color_palette,
)

fig.update_layout(
    layout,
    xaxis=dict(
        type='category',
        categoryorder='category ascending',
    ),
    yaxis=dict(
        title="Fund Project Count",
    ),
    #title='Number of Fund Projects by Domain, 2013-2023',
    showlegend=True,
    legend=dict(title='Domain'),
    hovermode='x unified'
)

fig.update_traces(
    opacity=0.7,
    hovertemplate='%{fullData.name}: %{y}<extra></extra>'
)

fig.show()

In [103]:
gp = fund_topics.groupby(['year', 'domain_en'])['title_cn'].nunique().reset_index(name='count')
gp = gp[(gp['year'] > 2012) & (gp['domain_en'] != 'Arts & Humanities')]

gp = gp.pivot_table(index='year', columns='domain_en', values='count', fill_value=0)
gp = gp.div(gp.sum(axis=1), axis=0).reset_index()

fig = px.area(gp, x='year', y=[col for col in gp.columns if col != 'year'],
              color_discrete_map=color_palette)

fig.update_layout(
    layout,
    #title='Distribution of Fund Projects by Domain, 2013-2023',
    xaxis=dict(
        minor=dict(ticks="inside", showgrid=True), type='category'
    ),
    yaxis=dict(
        title="",
        tickformat='.0%'
    ),
    showlegend=True,
    legend=dict(title='Domain'),
    hovermode='x unified',
    margin=dict(b=0, r=250)
)

fig.update_traces(
    hovertemplate='%{fullData.name}: %{y:.1%}<extra></extra>'
)

fig.show()

## By Field

In [104]:
gp = fund_topics[(fund_topics['year'] > 2012) & (fund_topics['domain_en'] != 'Arts & Humanities')]
gp = gp.groupby(['domain_en','field_en'])['title_cn'].nunique().sort_values(ascending=True).reset_index(name='count')

fig = px.bar(gp,
             y='field_en',
             x='count',
             orientation='h',
             color='domain_en',
             color_discrete_map=color_palette
             )

fig.update_layout(
    layout,
    #title="Number of Fund Projects by Field, 2013-2023",
    width=1000,
    height=600,
    xaxis=dict(
        title="Fund Projects",
        range=[0, 300]
    ),
    yaxis=dict(
        title="",
        showline=True,
        linewidth=1,
        linecolor='black'
    ),
    showlegend=True,
    legend_title=dict(text="Domain", font=dict(size=14, family="Verdana", color="black"))
)

fig.update_traces(
    textposition='outside',  
    texttemplate='%{x}',  
    opacity=0.7,
    textfont=dict(color='black'), 

)

fig.show()

In [105]:
gp = fund_topics[(fund_topics['year'] > 2012) & (fund_topics['domain_en'] != 'Arts & Humanities')]
gp = gp.groupby(['year', 'domain_en','field_en'])['title_cn'].nunique().sort_values(ascending=True).reset_index(name='count')

heatmap_data_count = gp.pivot(index='field_en', columns='year', values='count')
heatmap_data_count = heatmap_data_count.fillna(0)

fig = px.imshow(heatmap_data_count,
                text_auto='.0f',
                color_continuous_scale='Blues',
                labels=dict(x="Year", y="", color="Count"))

fig.update_layout(
    layout,
    width=900,
    height=600,
    title='Number of Fund Projects by Field, 2013-2023',
    xaxis=dict(
        side="bottom",
        tickmode='array',
        tickvals=gp['year'].unique()
    )
)

fig.update_traces(
    hovertemplate='Year: %{x}<br>Field: %{y}<br>Authors: %{z:,.0f}<extra></extra>'
)

fig.show()

# Statistics for 2013-2023

## Subfield

In [106]:
subfield_df = subfield_df[subfield_df['year'] > 2012]
subfield_stats = subfield_df.groupby('subfield_en').agg({
    'title_cn': 'count',
    'views': ['sum', 'mean'],
    'downloads': ['sum', 'mean'],
}).reset_index()

subfield_stats.columns = ['subfield_en', 'article_count', 'views_total', 'views_avg', 'downloads_total', 'downloads_avg']

# shares
subfield_stats['article_share'] = subfield_stats['article_count'] / subfield_stats['article_count'].sum()
subfield_stats['views_share'] = subfield_stats['views_total'] / subfield_stats['views_total'].sum()
subfield_stats['downloads_share'] = subfield_stats['downloads_total'] / subfield_stats['downloads_total'].sum()


subfield_stats = subfield_stats.sort_values(by='article_count', ascending=False)
subfield_orgs = topics_orgs.drop(columns='topic_en')
subfield_orgs = subfield_orgs.drop_duplicates()
subfield_orgs = subfield_orgs[subfield_orgs['year'] > 2012]
subfield_orgs_count = subfield_orgs.groupby('subfield_en')['orgs_head'].nunique().reset_index(name='org_count')

subfield_stats = pd.merge(subfield_stats, subfield_orgs_count, on='subfield_en', how='left')
subfield_stats['org_share'] = subfield_stats['org_count'] / subfield_stats['org_count'].sum()

subfield_stats['views_efficiency'] = subfield_stats.views_share/subfield_stats.article_share

fund_subfield = fund_topics.drop(columns=['topic_en', 'topic_ru'])
fund_subfield = fund_subfield.drop_duplicates()
fund_subfield = fund_subfield[fund_subfield['year'] > 2012]
subfield_fund_count = fund_subfield.subfield_en.value_counts().reset_index(name='fund_project_count')

subfield_stats = pd.merge(subfield_stats, subfield_fund_count, on='subfield_en', how='left')

subfield_stats['fund_project_count'] = subfield_stats['fund_project_count'].fillna(0)
subfield_stats['fund_project_share'] = subfield_stats['fund_project_count'] / subfield_stats['fund_project_count'].sum()

mappind_dict = dict(zip(topic_info['subfield_en'], topic_info['field_en']))
subfield_stats['field_en'] =subfield_stats['subfield_en'].map(mappind_dict)

mappind_dict = dict(zip(topic_info['subfield_en'], topic_info['domain_en']))
subfield_stats['domain_en'] =subfield_stats['subfield_en'].map(mappind_dict)

subfield_stats

Unnamed: 0,subfield_en,article_count,views_total,views_avg,downloads_total,downloads_avg,article_share,views_share,downloads_share,org_count,org_share,views_efficiency,fund_project_count,fund_project_share,field_en,domain_en
0,Environmental Sciences,556,1420184,2554.28777,1447014,2602.543165,0.127028,0.165732,0.169078,300.0,0.169396,1.304689,406.0,0.215042,Earth & Environmental Sciences,Natural Sciences
1,Optoelectronics & Photonics,432,429603,994.451389,400357,926.752315,0.098698,0.050134,0.04678,25.0,0.014116,0.50795,22.0,0.011653,Enabling & Strategic Technologies,Applied Sciences
2,"Strategic, Defence & Security Studies",292,758546,2597.760274,685898,2348.965753,0.066712,0.08852,0.080145,106.0,0.059853,1.326895,148.0,0.07839,Enabling & Strategic Technologies,Applied Sciences
3,Agronomy & Agriculture,212,495377,2336.683962,516847,2437.957547,0.048435,0.057809,0.060392,107.0,0.060418,1.193541,150.0,0.079449,"Agriculture, Fisheries & Forestry",Applied Sciences
4,Science Studies,210,398125,1895.833333,414598,1974.27619,0.047978,0.04646,0.048444,69.0,0.038961,0.968361,77.0,0.040784,Social Sciences,Economic & Social Sciences
5,Education,191,504482,2641.267016,391453,2049.492147,0.043637,0.058872,0.04574,67.0,0.037832,1.349117,68.0,0.036017,Social Sciences,Economic & Social Sciences
6,Electrical & Electronic Engineering,164,161109,982.371951,147888,901.756098,0.037469,0.018801,0.01728,5.0,0.002823,0.50178,3.0,0.001589,Engineering,Applied Sciences
7,Information Systems,160,346986,2168.6625,366489,2290.55625,0.036555,0.040492,0.042823,127.0,0.071711,1.107718,104.0,0.055085,Information & Communication Technologies,Applied Sciences
8,International Relations,149,358285,2404.597315,339709,2279.926174,0.034042,0.041811,0.039694,132.0,0.074534,1.22823,128.0,0.067797,Social Sciences,Economic & Social Sciences
9,Environmental Engineering,147,284603,1936.07483,267203,1817.707483,0.033585,0.033212,0.031222,101.0,0.05703,0.988916,111.0,0.058792,Engineering,Applied Sciences


In [107]:
fig = px.treemap(
    subfield_stats,
    path=['domain_en', 'field_en', 'subfield_en'],
    values='article_share',
    title='',
    color='domain_en',
    color_discrete_map=color_palette,
)

fig.update_layout(
    layout,
    height=1000,
    width=1500
)

fig.show()

## Field

In [108]:
field_df = field_df[field_df['year'] > 2012]

field_stats = field_df.groupby('field_en').agg({
    'title_cn': 'count',
    'views': ['sum', 'mean'],
    'downloads': ['sum', 'mean'],
}).reset_index()

field_stats.columns = ['field_en', 'article_count', 'views_total', 'views_avg', 'downloads_total', 'downloads_avg']

# shares
field_stats['article_share'] = field_stats['article_count'] / field_stats['article_count'].sum()
field_stats['views_share'] = field_stats['views_total'] / field_stats['views_total'].sum()
field_stats['downloads_share'] = field_stats['downloads_total'] / field_stats['downloads_total'].sum()


field_stats = field_stats.sort_values(by='article_count', ascending=False)
field_orgs = topics_orgs.drop(columns=['topic_en', 'subfield_en'])
field_orgs = field_orgs.drop_duplicates()
field_orgs = field_orgs[field_orgs['year'] > 2012]
field_orgs_count = field_orgs.groupby('field_en')['orgs_head'].nunique().reset_index(name='org_count')

field_stats = pd.merge(field_stats, field_orgs_count, on='field_en', how='left')
field_stats['org_share'] = field_stats['org_count'] / field_stats['org_count'].sum()
field_stats['views_efficiency'] = field_stats.views_share/field_stats.article_share

fund_field = fund_topics.drop(columns=['topic_en', 'topic_ru', 'subfield_en', 'subfield_ru'])
fund_field = fund_field.drop_duplicates()
fund_field = fund_field[fund_field['year'] > 2012]
field_fund_count = fund_field.field_en.value_counts().reset_index(name='fund_project_count')

field_stats = pd.merge(field_stats, field_fund_count, on='field_en', how='left')

field_stats['fund_project_count'] = field_stats['fund_project_count'].fillna(0)
field_stats['fund_project_share'] = field_stats['fund_project_count'] / field_stats['fund_project_count'].sum()

mappind_dict = dict(zip(topic_info['field_en'], topic_info['domain_en']))
field_stats['domain_en'] =field_stats['field_en'].map(mappind_dict)

field_stats

Unnamed: 0,field_en,article_count,views_total,views_avg,downloads_total,downloads_avg,article_share,views_share,downloads_share,org_count,org_share,views_efficiency,fund_project_count,fund_project_share,domain_en
0,Enabling & Strategic Technologies,1056,1737213,1645.088068,1707721,1617.160038,0.253968,0.212486,0.208738,208,0.135065,0.836664,274,0.150632,Applied Sciences
1,Earth & Environmental Sciences,672,1582038,2354.223214,1632788,2429.744048,0.161616,0.193506,0.199578,323,0.20974,1.197319,447,0.245739,Natural Sciences
2,Social Sciences,562,1307224,2326.021352,1169805,2081.503559,0.135161,0.159892,0.142987,232,0.150649,1.182976,292,0.160528,Economic & Social Sciences
3,Engineering,407,695931,1709.904177,666477,1637.535627,0.097884,0.085122,0.081465,137,0.088961,0.869629,152,0.083562,Applied Sciences
4,Physics & Astronomy,291,427023,1467.43299,413196,1419.917526,0.069986,0.052231,0.050506,30,0.019481,0.746312,20,0.010995,Natural Sciences
5,Information & Communication Technologies,249,480889,1931.281124,508877,2043.682731,0.059885,0.05882,0.062201,144,0.093506,0.982217,116,0.063771,Applied Sciences
6,"Agriculture, Fisheries & Forestry",212,495377,2336.683962,516847,2437.957547,0.050986,0.060592,0.063175,107,0.069481,1.188399,150,0.082463,Applied Sciences
7,Clinical Medicine,152,240916,1584.973684,264311,1738.888158,0.036556,0.029468,0.032307,45,0.029221,0.806091,21,0.011545,Health Sciences
8,Biology,150,359591,2397.273333,367857,2452.38,0.036075,0.043983,0.044964,109,0.070779,1.219213,131,0.072018,Natural Sciences
9,Biomedical Research,118,171598,1454.220339,162234,1374.864407,0.028379,0.020989,0.01983,33,0.021429,0.739592,19,0.010445,Health Sciences


In [109]:
fig = px.treemap(
    field_stats,
    path=['domain_en', 'field_en'],
    values='article_share',
    title='',
    color='domain_en',
    color_discrete_map=color_palette,
)

fig.update_layout(
    layout,
    height=1000,
    width=1500
)

fig.show()

## Domain

In [110]:
domain_df = domain_df[domain_df['year'] > 2012]
domain_stats = domain_df.groupby('domain_en').agg({
    'title_cn': 'count',
    'views': ['sum', 'mean'],
    'downloads': ['sum', 'mean'],
}).reset_index()

domain_stats.columns = ['domain_en', 'article_count', 'views_total', 'views_avg', 'downloads_total', 'downloads_avg']

# shares
domain_stats['article_share'] = domain_stats['article_count'] / domain_stats['article_count'].sum()
domain_stats['views_share'] = domain_stats['views_total'] / domain_stats['views_total'].sum()
domain_stats['downloads_share'] = domain_stats['downloads_total'] / domain_stats['downloads_total'].sum()


domain_stats = domain_stats.sort_values(by='article_count', ascending=False)
domain_orgs = topics_orgs.drop(columns=['topic_en', 'subfield_en', 'field_en'])
domain_orgs = domain_orgs.drop_duplicates()
domain_orgs = domain_orgs[domain_orgs['year'] > 2012]
domain_orgs_count = domain_orgs.groupby('domain_en')['orgs_head'].nunique().reset_index(name='org_count')

domain_stats = pd.merge(domain_stats, domain_orgs_count, on='domain_en', how='left')
domain_stats['org_share'] = domain_stats['org_count'] / domain_stats['org_count'].sum()
domain_stats['views_efficiency'] = domain_stats.views_share/domain_stats.article_share

fund_domain = fund_topics.drop(columns=['topic_en', 'topic_ru', 'subfield_en', 'subfield_ru', 'field_en', 'field_ru'])
fund_domain = fund_domain.drop_duplicates()
fund_domain = fund_domain[fund_domain['year'] > 2012]

domain_fund_count = fund_domain.domain_en.value_counts().reset_index(name='fund_project_count')
domain_stats = pd.merge(domain_stats, domain_fund_count, on='domain_en', how='left')
domain_stats['fund_project_count'] = domain_stats['fund_project_count'].fillna(0)
domain_stats['fund_project_share'] = domain_stats['fund_project_count'] / domain_stats['fund_project_count'].sum()

domain_stats

Unnamed: 0,domain_en,article_count,views_total,views_avg,downloads_total,downloads_avg,article_share,views_share,downloads_share,org_count,org_share,views_efficiency,fund_project_count,fund_project_share
0,Applied Sciences,1789,3324992,1858.575741,3308527,1849.372275,0.462752,0.43071,0.429411,441,0.366279,0.930757,728,0.420081
1,Natural Sciences,1059,2246455,2121.298395,2285427,2158.09915,0.273927,0.290999,0.296624,378,0.313953,1.062326,563,0.32487
2,Economic & Social Sciences,625,1446677,2314.6832,1342668,2148.2688,0.161666,0.187398,0.174264,247,0.20515,1.159171,343,0.197923
3,Health Sciences,361,625168,1731.767313,687010,1903.074792,0.093378,0.080982,0.089166,131,0.108804,0.867252,94,0.054241
4,Arts & Humanities,32,76507,2390.84375,81172,2536.625,0.008277,0.00991,0.010535,7,0.005814,1.197311,5,0.002885
