In [1]:
import pandas as pd

papers = pd.read_csv('data/papers_topics.csv').iloc[:, 1:]
patents = pd.read_csv('data/patents_topics.csv').iloc[:, 1:]

## Subset - Unique topics

In [2]:
import re

data = papers.sample(5000).copy()

def unique_topics(series):
    t = series.str.strip().str.split(' / ').tolist()
    t = set([x for y in t for x in y])
    print(len(t), 'topics')
    return t

unique_topics(data['topics'])

245 topics


{'3d print',
 '3d printable',
 '3d printed',
 '3d printer',
 '3d printers',
 '3d printing',
 '3d prints',
 'adam',
 'ai',
 'artificial intelligence',
 'artificial intelligent',
 'attention mechanism',
 'attention mechanisms',
 'audio classification',
 'autoencoder',
 'automated reasoning',
 'autonomous car',
 'autonomous cars',
 'autonomous vehicle',
 'auv',
 'aws',
 'back propagation',
 'backpropagation',
 'bayesian inference',
 'big data',
 'boltzmann machine',
 'boosting algorithm',
 'cassandra',
 'chatbot',
 'chatbots',
 'classification',
 'cloud architecture',
 'cloud architectures',
 'cloud computer',
 'cloud computig',
 'cloud computing',
 'cloud infrastructure',
 'cloud infrastructures',
 'cloud server',
 'cloud servers',
 'cloud storage',
 'cloud technologies',
 'cloud technology',
 'cluster computing',
 'clustering',
 'cnn',
 'cnns',
 'computer vision',
 'convolutional layers',
 'convolutional neural network',
 'convolutional neural networks',
 'cyber infrastructure',
 'cyber

**Remove plurals**

In [3]:
def remove_plurals(series):
    return series.str.replace('( \w+)s ', '\g<1> ', regex=True).str.replace(' (\w+)s $', ' \g<1> ', regex=True)

data['topics'] = remove_plurals(data['topics'])
unique_topics(data['topics'])

199 topics


{'3d print',
 '3d printable',
 '3d printed',
 '3d printer',
 '3d printing',
 'adam',
 'ai',
 'artificial intelligence',
 'artificial intelligent',
 'attention mechanism',
 'audio classification',
 'autoencoder',
 'automated reasoning',
 'autonomou car',
 'autonomou cars',
 'autonomou vehicle',
 'auv',
 'aw',
 'back propagation',
 'backpropagation',
 'bayesian inference',
 'big data',
 'boltzmann machine',
 'boosting algorithm',
 'cassandra',
 'chatbot',
 'classification',
 'cloud architecture',
 'cloud computer',
 'cloud computig',
 'cloud computing',
 'cloud infrastructure',
 'cloud server',
 'cloud storage',
 'cloud technologie',
 'cloud technology',
 'cluster computing',
 'clustering',
 'cnn',
 'computer vision',
 'convolutional layer',
 'convolutional neural network',
 'cyber infrastructure',
 'cyberinfrastructure',
 'data center',
 'data centre',
 'data intensive computing',
 'data mining',
 'data science',
 'data storage',
 'database',
 'dbm',
 'deep learning',
 'deep neural netw

**Group related topics**

In [4]:
def map_topics(series, mapper):
    series = series.str.title()
    for topic, regex in mapper.items():
        series = series.str.replace(regex, topic, regex=True, flags=re.IGNORECASE)
    return series

mapper = {
    "Analysis": "analysi", " Loss ": ' los? ', "Extraction": "extrac\w+", " Modeling ": " model\w* ",
    " SaaS ": " saas? ", " IaaS ": " iaas? ", " PaaS ": " paas? ",
    "Robots": "robots?", "Robotics": "robotics?|robotsics?",
    "Classification": "classif\w+",
    "Redis ": "redi ",
    "3D Print": "3d print\w*",
    "Big": "massive|large scale",
    " AI ": " ai | artificial intellig\w+ | Machine Intelligence | Intelligent Machine ",
    " AUV ": " auv | autonomous underwater vehicle | underwater vehicle | Unmanned Underwater ",
    "Back-propagation": "back propagation|backpropagation",
    " Clouds ": " cloud \w+ | Multi Cloud ",
    "Cluster Computing ": "cluster \w+ ",
    " DBM ": " dbm ",
    " Deep Learning ": " deep learning | dl ",
    " Generative Adversarial Network ": " gans? | generative adversarial networks? | generative adversarial ",
    " GPU ": " gpu | graphics? processing units? ",
    "Hardware Acceleration": "hardware acceler\w+",
    " Machine Learning ": " ml | machine learning ",
    "Markov ": "markov \w+ ",
    " Multi-layer perceptron ": " multi layer perceptrons? | mlp | multilayer perceptron ",
    " NLP ": " natural language processing | nlp ",
    "Nearest Neighbors": "nearest neighbou?rs?",
    "Object Recognition": "object detection|object recognition|object identification",
    "Dimensionality Reduction": 'factor analysis|principal components?',
    "QPU": "quantum processing units?",
    " SVM ": " support vector | svm ",
    " UAM ": " drones? | uav | unmanned aerial vehicle | unmanned aircraft\s?\w* | uas | unmanned air vehicle ",
    " AWS ": " aw ",
    " Optimization Algorithm ": " adam | gradient \w{1,2}scent ",
    "Boosting": "Boosting Algorithm|Gradient Boosting",
    " Convolutional Networks ": " cnns? | Convolutional Layers? | Convolutional Neural Net\w* ",
    "Data Center": "data centre",
    "Deep Learning": "Deep Neural Network|dnn",
    "Text Embedding": "doc2vec|word2vec|seq2seq",
    " RNN ": " gru | lstm | rnn | recurrent neural net\w* | Gated Recurrent Unit | Long Short Term Memory ",
    " UGV ": " ugv | Unmanned Ground Vehicle | Autonomou Vehicle | Self Driving | Autonomou car ",
    " Voice Recognition ": " Voice Activity Detection | text2speech | text to speech | Speech Recognition ",
    " Search Algorithm ": " Tree Search | search trees? ",
#     "Computer Vision": "text\s?to\s?image|Image Classification|Image Recognition|Image Segmentation|Gesture Recognition|",
    " Trees ": " regression trees? | classification trees? | random forests? ",
    "Transformers & Attention": "Attention Mechanisms?|Attention layers?|transformer architectures?|transformer net\\w+",
    "Supercomputers": "Super\s?comput\w+",
    "Quantum Computing": "Quantum Comput\w+",
    "Neuromorphic Computing": "Neuromorphic Comput\w+",
    " Reinforcement Learning ": " rl | q learning | policy gradient | policy optimization ",
    " BD & Cloud Solutions ": " On Demand Computing | hadoop | spark | mysql | cassandra | neo[45]j | mongodb | hive | kafka | ec2 | elasticsearch | bigtable | hbase | oracle | microsoft sql | redis | ibm db2 | microsoft azure | google cloud | sqlite | mariadb | splunk | dynamodb ",
    " NRDBMS & NoSQL ": " nosql | nrdbm?s? | non\s?Relational Database ",
    " RDBMS & SQL ": " rdbm?s? | Relational Database | sql | mysql | sqlite ",
    " Database ": " dbm ",
    "Cyber Infrastructure": "Cyber\s?Infrastructure"
}

data['topics'] = map_topics(data['topics'], mapper)
unique_topics(data['topics'])

108 topics


{'3D Print',
 'AI',
 'AUV',
 'AWS',
 'Audio Classification',
 'Autoencoder',
 'Automated Reasoning',
 'Autonomou Cars',
 'BD & Cloud Solutions',
 'Back-propagation',
 'Bayesian Inference',
 'Big Data',
 'Boltzmann Machine',
 'Boosting',
 'Chatbot',
 'Classification',
 'Clouds',
 'Cluster Computing',
 'Clustering',
 'Computer Vision',
 'Convolutional Networks',
 'Cyber Infrastructure',
 'Data Center',
 'Data Intensive Computing',
 'Data Mining',
 'Data Science',
 'Data Storage',
 'Database',
 'Deep Learning',
 'Dimensionality Reduction',
 'Distributed Computing',
 'Docker',
 'Dynamic Programming',
 'Edge Computing',
 'Encoder Decoder',
 'Ensemble Modeling',
 'Entity Recognition',
 'Expert System',
 'Face Detection',
 'Feature Engineering',
 'Feature Extraction',
 'Fog Computing',
 'Fully Connected Layer',
 'GPU',
 'Generative Adversarial Network',
 'Genetic Algorithm',
 'Gesture Recognition',
 'Grid Computing',
 'Hardware Acceleration',
 'High Performance Computing',
 'Humanoid Robots',

In [5]:
data.head()

Unnamed: 0,topics,year,AU,publisher,research_areas,n_refs,n_citations,n_authors,n_affils,n_grants
214505,Loss Function / Pattern Recognition,2009,"Rafajlowicz, E; Krzyzak, A",NONLINEAR ANALYSIS-THEORY METHODS & APPLICATIONS,Mathematics,22,5,2,2,2
81548,AI,2015,"Broekens, J",2015 INTERNATIONAL CONFERENCE ON AFFECTIVE COM...,Computer Science; Engineering,13,1,1,1,0
215248,Information Retrieval,2009,"Geldart, J; Song, W; Li, Y",2009 IEEE 33RD INTERNATIONAL COMPUTER SOFTWARE...,Computer Science,7,0,3,2,0
144403,Regression / Machine Learning,2018,"Badii, C; Nesi, P; Paoli, I",IEEE ACCESS,Computer Science; Engineering; Telecommunications,47,5,3,1,2
176881,Pattern Recognition,2006,"Parra, V; Arrieta, AA; Fernandez-Escudero, JA;...",SENSORS AND ACTUATORS B-CHEMICAL,Chemistry; Electrochemistry; Instruments & Ins...,23,89,5,0,0


## Whole data

**Science**

In [6]:
papers2 = papers.copy()
unique_topics(papers2['topics'])

377 topics


{'3d print',
 '3d printability',
 '3d printable',
 '3d printablility',
 '3d printed',
 '3d printer',
 '3d printers',
 '3d printhead',
 '3d printing',
 '3d printings',
 '3d printinng',
 '3d printinted',
 '3d printouts',
 '3d prints',
 'adam',
 'ai',
 'artificial intelligecet',
 'artificial intelligence',
 'artificial intelligencegreenhouse',
 'artificial intelligences',
 'artificial intelligent',
 'attention layer',
 'attention layers',
 'attention mechanism',
 'attention mechanisms',
 'audio classification',
 'auto encoder',
 'autoencoder',
 'automated question answering',
 'automated reasoning',
 'autonomous car',
 'autonomous cars',
 'autonomous vehicle',
 'auv',
 'aws',
 'back propagation',
 'backpropagation',
 'bayesian inference',
 'bayesian machine learning',
 'bert',
 'big data',
 'bigtable',
 'boltzmann machine',
 'boosting algorithm',
 'boosting algorithms',
 'cassandra',
 'chatbot',
 'chatbots',
 'classification',
 'cloud architecture',
 'cloud architectures',
 'cloud client'

In [7]:
papers2['topics'] = remove_plurals(papers2['topics'])
unique_topics(papers2['topics'])

297 topics


{'3d print',
 '3d printability',
 '3d printable',
 '3d printablility',
 '3d printed',
 '3d printer',
 '3d printhead',
 '3d printing',
 '3d printinng',
 '3d printinted',
 '3d printout',
 'adam',
 'ai',
 'artificial intelligecet',
 'artificial intelligence',
 'artificial intelligencegreenhouse',
 'artificial intelligent',
 'attention layer',
 'attention mechanism',
 'audio classification',
 'auto encoder',
 'autoencoder',
 'automated question answering',
 'automated reasoning',
 'autonomou car',
 'autonomou cars',
 'autonomou vehicle',
 'auv',
 'aw',
 'back propagation',
 'backpropagation',
 'bayesian inference',
 'bayesian machine learning',
 'bert',
 'big data',
 'bigtable',
 'boltzmann machine',
 'boosting algorithm',
 'cassandra',
 'chatbot',
 'classification',
 'cloud architecture',
 'cloud client',
 'cloud computation',
 'cloud computational',
 'cloud compute',
 'cloud computed',
 'cloud computer',
 'cloud computig',
 'cloud computing',
 'cloud computingi',
 'cloud computingprocess

In [8]:
papers2['topics'] = map_topics(papers2['topics'], mapper)
unique_topics(papers2['topics'])

132 topics


{'3D Print',
 'AI',
 'AUV',
 'AWS',
 'Audio Classification',
 'Auto Encoder',
 'Autoencoder',
 'Automated Question Answering',
 'Automated Reasoning',
 'Autonomou Cars',
 'BD & Cloud Solutions',
 'Back-propagation',
 'Bayesian Inference',
 'Bayesian Machine Learning',
 'Bert',
 'Big Data',
 'Boltzmann Machine',
 'Boosting',
 'Chatbot',
 'Classification',
 'Clouds',
 'Cluster Computing',
 'Clustering',
 'Computer Vision',
 'Convolutional Networks',
 'Cyber Infrastructure',
 'Data Center',
 'Data Intensive Computing',
 'Data Mining',
 'Data Science',
 'Data Storage',
 'Database',
 'Decentralized Computing',
 'Deep Learning',
 'Deep Learningbased',
 'Deep Neural Net',
 'Deep Neural Netork',
 'Dew Computing',
 'Dimensionality Reduction',
 'Distributed Computing',
 'Distributed File System',
 'Docker',
 'Dynamic Cloud',
 'Dynamic Programming',
 'Edge Computing',
 'Encoder Decoder',
 'Ensemble Modeling',
 'Entity Recognition',
 'Expert System',
 'Face Detection',
 'Feature Engineering',
 'Fe

In [9]:
papers2.to_csv('data/papers_topics2.csv', index=False)
papers2.head()

Unnamed: 0,topics,year,AU,publisher,research_areas,n_refs,n_citations,n_authors,n_affils,n_grants
0,Monte Carlo,2018,"Rastegarfar, H; Svensson, T; Peyghambarian, N",JOURNAL OF OPTICAL COMMUNICATIONS AND NETWORKING,Computer Science; Optics; Telecommunications,46,1,3,2,1
1,Search Algorithm,2018,"Ali, M; Qaisar, S; Naeem, M; Rodrigues, JJPC; ...",TRANSACTIONS ON EMERGING TELECOMMUNICATIONS TE...,Telecommunications,42,8,5,7,2
2,Clouds,2018,"Liu, JY; Yang, QH; Simon, G; Cui, WL",IEEE TRANSACTIONS ON NETWORK AND SERVICE MANAG...,Computer Science,30,2,4,3,2
3,Clouds,2018,"Leiter, A","2018 INTERNATIONAL SYMPOSIUM ON NETWORKS, COMP...",Computer Science; Telecommunications,0,0,1,1,0
4,Edge Computing,2018,"Bruschi, R; Davoli, F; Lago, P; Lombardo, C; P...",2018 IEEE CONFERENCE ON NETWORK FUNCTION VIRTU...,Computer Science; Engineering,12,0,5,2,0


**Industry**

In [10]:
patents2 = patents.copy()
unique_topics(patents2['topics'])

225 topics


{'3d print',
 '3d printable',
 '3d printed',
 '3d printer',
 '3d printers',
 '3d printhead',
 '3d printing',
 '3d printout',
 'ai',
 'artificial intelligence',
 'artificial intelligent',
 'attention layer',
 'attention mechanism',
 'attention mechanisms',
 'audio classification',
 'auto encoder',
 'autoencoder',
 'automated reasoning',
 'autonomous car',
 'autonomous cars',
 'autonomous vehicle',
 'auv',
 'aws',
 'back propagation',
 'backpropagation',
 'bayesian inference',
 'big data',
 'boltzmann machine',
 'boosting algorithm',
 'chatbot',
 'chatbots',
 'classification',
 'cloud architecture',
 'cloud client',
 'cloud clients',
 'cloud compute',
 'cloud computer',
 'cloud computing',
 'cloud computuing',
 'cloud infrastructure',
 'cloud infrastructures',
 'cloud server',
 'cloud servers',
 'cloud storage',
 'cloud technology',
 'cluster computing',
 'clustering',
 'cnn',
 'cnns',
 'computer vision',
 'convolutional layer',
 'convolutional layers',
 'convolutional neural network',
 

In [11]:
patents2['topics'] = remove_plurals(patents2['topics'])
unique_topics(patents2['topics'])

184 topics


{'3d print',
 '3d printable',
 '3d printed',
 '3d printer',
 '3d printhead',
 '3d printing',
 '3d printout',
 'ai',
 'artificial intelligence',
 'artificial intelligent',
 'attention layer',
 'attention mechanism',
 'audio classification',
 'auto encoder',
 'autoencoder',
 'automated reasoning',
 'autonomou car',
 'autonomou vehicle',
 'auv',
 'aw',
 'back propagation',
 'backpropagation',
 'bayesian inference',
 'big data',
 'boltzmann machine',
 'boosting algorithm',
 'chatbot',
 'classification',
 'cloud architecture',
 'cloud client',
 'cloud compute',
 'cloud computer',
 'cloud computing',
 'cloud computuing',
 'cloud infrastructure',
 'cloud server',
 'cloud storage',
 'cloud technology',
 'cluster computing',
 'clustering',
 'cnn',
 'computer vision',
 'convolutional layer',
 'convolutional neural network',
 'data center',
 'data centre',
 'data mining',
 'data science',
 'data storage',
 'database',
 'dbm',
 'deep learning',
 'deep neural network',
 'distributed computing',
 'd

In [12]:
patents2['topics'] = map_topics(patents2['topics'], mapper)
unique_topics(patents2['topics'])

103 topics


{'3D Print',
 'AI',
 'AUV',
 'AWS',
 'Audio Classification',
 'Auto Encoder',
 'Autoencoder',
 'Automated Reasoning',
 'BD & Cloud Solutions',
 'Back-propagation',
 'Bayesian Inference',
 'Big Data',
 'Boltzmann Machine',
 'Boosting',
 'Chatbot',
 'Classification',
 'Clouds',
 'Cluster Computing',
 'Clustering',
 'Computer Vision',
 'Convolutional Networks',
 'Data Center',
 'Data Mining',
 'Data Science',
 'Data Storage',
 'Database',
 'Deep Learning',
 'Dimensionality Reduction',
 'Distributed Computing',
 'Distributed File System',
 'Dynamic Cloud',
 'Dynamic Programming',
 'Edge Computing',
 'Encoder Decoder',
 'Entity Recognition',
 'Expert System',
 'Face Detection',
 'Feature Engineering',
 'Feature Extraction',
 'Fog Computing',
 'Fully Connected Layer',
 'GPU',
 'Generative Adversarial Network',
 'Genetic Algorithm',
 'Gesture Recognition',
 'Hardware Acceleration',
 'High Performance Computing',
 'Humanoid Robots',
 'IaaS',
 'Image Classification',
 'Image Recognition',
 'Ima

In [13]:
patents2.to_csv('data/patents_topics2.csv', index=False)
patents2.head()

Unnamed: 0,topics,year,date,inventor,applicant,country
0,UAM / Computer Vision,2020,20200917,"'GHIGLINO NOVOA, Pablo Francisco, ', 'BARBADIL...","'ALERION TECH S L 2002ES', 'ALERION TECHNOLOGI...",US
1,Machine Learning,2020,20200903,"'WENZEL, FABIAN, ', 'BROSCH, TOM'","'KONINKLIJKE PHILIPS NV 2002NL', 'KONINKLIJKE ...",US
2,AI,2020,20200820,"'ROYTMAN, Anatoly, ', 'Naressi, Alexandre'","'ACCENTURE GLOBAL SERVICES LTD 2002IE', 'Accen...",US
3,AI,2020,20200811,"'H.<U+5B54><U+6CFD>, ', 'P.<U+8D39><U+5E0C><U+...","'SIEMENS HEALTHCARE GMBH', '<U+897F><U+95E8><U...",CN
4,AI,2020,20200806,"'Kunze, Holger, ', 'Fischer, Peter, ', 'Kreher...","'SIEMENS HEALTHCARE GMBH 2002DE', 'Siemens Hea...",US


## Data preparation

In [13]:
import pandas as pd

papers = pd.read_csv('data/papers_topics2.csv')
patents = pd.read_csv('data/patents_topics2.csv')

papers = papers[~(papers['research_areas'].isna() | papers.AU.isna())]

In [14]:
import numpy as np
import re

def count_topics(data, science=False):
    series = data["topics"]
    t = series.str.strip().str.split(' / ').tolist()
    t = pd.Series([x for y in t for x in y])
    vc, vcp = t.value_counts(), t.value_counts(normalize=True)
    d = pd.DataFrame({'count': vc, 'proportion': vcp})
    if science:
        d['citations'] = pd.Series(vc.index).\
                            apply(lambda key: data.loc[data["topics"].str.contains(key), "n_citations"].sum(skipna=True)).\
                            values
    return d

def strata_count(data, strata='year', datat='patents', string=False, rank=None):
    if string:
        col = strata
        strata = data[strata].str.split('; ').tolist()
        strata = set([x for y in strata for x in y])
        for stratum in strata:
            if stratum in rank.index: 
                ct = count_topics(data.loc[data[col].str.contains(stratum, regex=True)]) if datat=='patents' else \
                     count_topics(data.loc[data[col].str.contains(stratum, regex=True)], science=True)
                path = f"data/counts/{datat}_count_{col}_{rank[stratum] if rank is not None else 'a'}_" + \
                        re.sub('\W', '_', stratum) + ".csv"
                ct.to_csv(path, index_label="topic")
    else:
        for stratum in data[strata].unique():
            ct = count_topics(data.loc[data[strata] == stratum])if datat=='patents' else \
                 count_topics(data.loc[data[strata] == stratum], science=True)
            ct.to_csv(f"data/counts/{datat}_count_{strata}_{stratum}.csv", index_label="topic")

**Overall count**

In [17]:
count_topics(patents).to_csv('data/counts/patents_count.csv', index_label="topic")
count_topics(papers, science=True).to_csv('data/counts/papers_count.csv', index_label="topic")

**Count per year**

In [20]:
strata_count(patents, strata='year', datat='patents')
strata_count(papers, strata='year', datat='papers')

**Count per author**

In [21]:
patents.inventor = patents.inventor.str.split(", ', ").str.join("; ").str.replace("[',]", "", regex=True)
au = patents.inventor.str.split("; ").tolist()
au = pd.Series([x for y in au for x in y]).value_counts()[:50]
au = pd.Series(list(range(1, 51)), index=au.index)
pattern = '|'.join(au.index)

strata_count(patents[patents.inventor.str.contains(pattern, regex=True)], 
             strata='inventor', datat='patents', string=True, rank = au)

In [12]:
papers.AU = papers.AU.str.strip().str.replace(',', '',regex=True)
au = papers.AU.str.split('; ').tolist()
au = pd.Series([x for y in au for x in y]).value_counts()[:50]
au = pd.Series(list(range(1, 51)), index=au.index)
pattern = '|'.join(au.index)

strata_count(papers[papers.AU.str.contains(pattern, regex=True)], 
             strata='AU', datat='papers', string=True, rank = au)

**Count per Publisher (papers only)**

In [15]:
papers.publisher = papers.publisher.str.strip().str.replace('\W', ' ',regex=True)
pub = papers.publisher.str.split('; ').tolist()
pub = pd.Series([x for y in pub for x in y]).value_counts()[:50]
pub = pd.Series(list(range(1, 51)), index=pub.index)
pattern = '|'.join(pub.index)

strata_count(papers[papers.publisher.str.contains(pattern, regex=True)], 
             strata='publisher', datat='papers', string=True, rank = pub)

**Count per research area (papers only)**

In [4]:
def ra_count(data, rank, y):
    strata = data['research_areas'].str.split('\s{2,3}').tolist()
    strata = set([x for y in strata for x in y])
    ds = []
    for stratum in strata:
        d = data[data["research_areas"].str.contains(stratum)]
        t = d['topics'].str.strip().str.split(' / ').tolist()
        t = pd.Series([x for y in t for x in y])
        vc = t.value_counts()
        d = pd.DataFrame({'count': vc, 'ra': stratum, 'year': y})
        d['citations'] = pd.Series(vc.index).\
                            apply(lambda key: data.loc[data["topics"].str.contains(key), "n_citations"].sum(skipna=True)).\
                            values
        ds.append(d.reset_index())
    try:
        return pd.concat(ds)
    except:
        pass

In [14]:
papers.research_areas = papers.research_areas.str.strip().str.replace('\W', ' ',regex=True)
ra = papers.research_areas.str.split('\s{2,3}').tolist()
ra = pd.Series([x for y in ra for x in y]).value_counts()[:30]
ra = pd.Series(list(range(1, 31)), index=ra.index)
pattern = '|'.join(ra.index)

ds = []
for y in range(2000, 2022):
    d = ra_count(papers[(papers.year==y) & (papers.research_areas.str.contains(pattern, regex=True))], ra, y)
    ds.append(d)
res = pd.concat(ds).reset_index(drop=True)
res.columns = ['Topic', 'count', 'ScienceT', 'year', 'citations']
res.to_csv('data/counts/papers_RA.csv', index=False)
res

Unnamed: 0,Topic,count,ScienceT,year,citations
0,AI,5,Building Technology,2000,33238
1,Robots,3,Building Technology,2000,12399
2,Robotics,2,Building Technology,2000,6934
3,Data Mining,1,Building Technology,2000,7278
4,Voice Recognition,1,Building Technology,2000,1859
...,...,...,...,...,...
76539,Classification,1,Radiology,2021,8
76540,Big Data,1,Radiology,2021,16
76541,Classification,1,Nuclear Medicine,2021,8
76542,Big Data,1,Nuclear Medicine,2021,16


## Co-occurence matrix

In [26]:
import pandas as pd

papers = pd.read_csv('data/papers_topics2.csv')
patents = pd.read_csv('data/patents_topics2.csv')

In [27]:
import numpy as np
import itertools
from scipy.sparse import csr_matrix

def co_matrix(unique_topics, topics_list): 
    word_to_id = dict(zip(unique_topics, range(len(unique_topics))))
    documents_as_ids = [np.sort([word_to_id[w] for w in doc if w in word_to_id]).astype('uint32') for doc in topics_list]
    row_ind, col_ind = zip(*itertools.chain(*[[(i, w) for w in doc] for i, doc in enumerate(documents_as_ids)]))
    
    data = np.ones(len(row_ind), dtype='uint32')  
    max_word_id = max(itertools.chain(*documents_as_ids)) + 1
    
    docs_words_matrix = csr_matrix((data, (row_ind, col_ind)), shape=(len(documents_as_ids), max_word_id))  
    words_cooc_matrix = docs_words_matrix.T * docs_words_matrix 
    words_cooc_matrix.setdiag(0)
    
    return pd.DataFrame(words_cooc_matrix.todense(), index=unique_topics, columns=unique_topics)

**Industry**

In [28]:
topics_list = patents.topics.str.split("/").tolist()
unique_topics = set([x for y in topics_list for x in y])

cm = co_matrix(unique_topics, topics_list)
cm.to_csv("data/counts/patents_co_occ.csv", index_label="Topic")
cm.head()

Unnamed: 0,Nearest Neighbors,BD & Cloud Solutions,Monte Carlo,Database,Machine Translation,Knowledge Representation,AWS,Language Modeling,Supercomputers,Classification,...,Computer Vision,Loss Functions,Data Science,IaaS,Data Mining,GPU,Image Segmentation,UGV,Entity Recognition,Loss Function
Nearest Neighbors,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,2,0,0,0,0,0
BD & Cloud Solutions,0,0,0,8,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Monte Carlo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Database,0,8,0,0,1,1,2,1,0,17,...,4,0,0,0,20,0,4,5,0,1
Machine Translation,0,0,0,1,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
for y in patents.year.unique():
    topics_list = patents.loc[patents.year == y, "topics"].str.split("/").tolist()
    unique_topics = set([x for y in topics_list for x in y])

    cm = co_matrix(unique_topics, topics_list)
    cm.to_csv(f"data/counts/patents_co_occ_{y}.csv", index_label="Topic")

**Science**

In [30]:
topics_list = papers.topics.str.split("/").tolist()
unique_topics = set([x for y in topics_list for x in y])

cm = co_matrix(unique_topics, topics_list)
cm.to_csv("data/counts/papers_co_occ.csv", index_label="Topic")
cm.head()

Unnamed: 0,Multiprocessing,Nearest Neighbors,Autonomou Cars,BD & Cloud Solutions,Multi Processing,Industry 4 0,Monte Carlo,Database,Machine Translation,Knowledge Representation,...,Computer Vision,Loss Functions,Data Science,IaaS,Data Mining,GPU,Image Segmentation,UGV,Entity Recognition,Loss Function
Multiprocessing,0,0,0,2,1,0,0,1,0,2,...,1,0,0,0,1,2,0,0,0,0
Nearest Neighbors,0,0,0,4,0,0,3,54,0,1,...,20,1,1,0,68,0,28,3,0,1
Autonomou Cars,0,0,0,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,11,0,0
BD & Cloud Solutions,2,4,0,0,0,23,14,450,10,13,...,14,3,35,73,232,5,10,4,4,3
Multi Processing,1,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,4,3,0,0,0


In [31]:
for y in papers.year.unique():
    topics_list = papers.loc[papers.year == y, "topics"].str.split("/").tolist()
    unique_topics = set([x for y in topics_list for x in y])

    cm = co_matrix(unique_topics, topics_list)
    cm.to_csv(f"data/counts/papers_co_occ_{y}.csv", index_label="Topic")