# Final Analysis with EPO data

Here I attempt to perform a cohesive analysis of the data. All file imports are specified in the associated `read_me` document. It is noted that importing the data can take a significant amount of time.

##### Adjusting the notebook cells to allow for a wider fit

In [None]:
# Widen the notebook container so that wide tables and figures fit on screen
from IPython.display import HTML, display

wide_container_css = HTML("<style>.container { width:90% !important; }</style>")
display(wide_container_css)

#### Relevant imports

In [None]:
import networkx as nx
import random
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import IPython
from tqdm import tqdm
import pandas as pd
import matplotlib as mpl
from itertools import count
import geopandas as gpd
import pydot
from networkx.drawing.nx_pydot import to_pydot
import igraph as ig
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
from textwrap import wrap
from matplotlib.ticker import MaxNLocator

plt.rcParams["font.family"] = "Helvetica Neue" # Adjusting all of the plot fonts for legibility

%matplotlib inline # Ensuring the plots are outputted correctly in the notebook

Defining a helper function that quickly returns the description of a selected IPC code.

In [None]:
# Lookup table of IPC codes and their descriptions (tab-separated file, every column read as text).
# The separator is written as the escape '\t' so that it survives editors that convert tabs to spaces.
IPC_descriptions = pd.read_csv('/Users/joebacchus/Desktop/CASA/All_IPC.txt', sep='\t').astype(str) # --> IPC_DESCRIPTIONS.txt


def translate_ipc(IPC):
    '''
    Returns the description(s) of an inputted IPC code, to any scale of character length.

    The code is matched exactly against the 'IPC' column of `IPC_descriptions`.
    The result is a list: it is empty when the code is not listed, so callers that
    index it with [0] need the code to be present in the table.
    '''
    matching_descriptions = IPC_descriptions.loc[IPC_descriptions['IPC'] == IPC, 'Description']
    return matching_descriptions.tolist()

# Importing the RAW EPO patent and citation data

In [None]:
# All three raw EPO files are pipe-delimited text files.

# Patent applications with their inventor regions                    --> EPO_PATENTS_RAW.txt
Patents_raw_EPO = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Patent data/202001_EPO_Inventor_reg.txt',
    sep='|',
)

# IPC classification codes attached to each application              --> EPO_IPC_RAW.txt
IPCs_raw_EPO = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Patent data/202001_EPO_IPC.txt',
    sep='|',
)

# Citing / cited application pairs                                    --> EPO_CITATIONS_RAW.txt
Citations_raw_EPO = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Citation data/202001_EPO_CITATIONS.txt',
    sep='|',
)

<font color='blue'>The IPC precision value n is to be modified here and all subsequent cells are to be rerun to investigate the effects with this level.</font>

In [None]:
n = 1 # IPC precision (number of leading characters kept); adjust according to analysis

# Keep only the first n characters of every IPC code.
# Codes are cast to text first, so a missing code becomes the text 'nan' before it is cut.
ipc_as_text = IPCs_raw_EPO['IPC'].astype(str)
IPCs_raw_EPO['IPC'] = ipc_as_text.str.slice(stop=n)

Restricting the patents to the UK. Note that we perform this via the regional code as opposed to the country code, to avoid missing patents whose country is labelled 'UK' rather than 'GB'.

In [None]:
# Keep the patents whose regional code starts with 'UK'.
# na=False treats a missing regional code as "not UK"; without it the mask would contain
# NaN and could not be used for boolean selection.
is_uk_region = Patents_raw_EPO['reg_code'].str.startswith('UK', na=False)

# Only the identifiers and the region are needed from here on
Patents_raw_EPO_PRE = Patents_raw_EPO.loc[is_uk_region, ['app_nbr', 'appln_id', 'reg_code']]

In [None]:
# Keep only the join key, the priority year and the (shortened) IPC code
ipc_columns_kept = ['appln_id', 'prio_year', 'IPC']
IPCs_raw_EPO = IPCs_raw_EPO.loc[:, ipc_columns_kept]

Here we associate each patent with the list of its corresponding IPC codes.

In [None]:
# Inner join on the application id: one row per (UK patent, IPC code) pair
EPO_PAT_IPC = pd.merge(Patents_raw_EPO_PRE, IPCs_raw_EPO, on='appln_id')

In [None]:
# Drop the join key, then remove the rows that became identical as a result
patent_columns_kept = ['app_nbr', 'reg_code', 'prio_year', 'IPC']
EPO_PAT_IPC = EPO_PAT_IPC.loc[:, patent_columns_kept].drop_duplicates()

In [None]:
# Human-readable column names for the patent table
readable_patent_names = {
    'app_nbr': 'Application',
    'reg_code': 'Region',
    'prio_year': 'Patent Year',
    'IPC': 'IPC',
}
Patents_EPO = EPO_PAT_IPC.rename(columns=readable_patent_names)

In [None]:
# Only the citing application, the cited application and the citation date are used
citation_columns_kept = ['Citing_app_nbr', 'Cited_App_nbr', 'Citing_pub_date']
Citations_raw_EPO = Citations_raw_EPO.loc[:, citation_columns_kept]

In [None]:
# Reduce the citation date to its first four characters, i.e. the year rather than a specific date
citation_year = Citations_raw_EPO['Citing_pub_date'].astype(str).str.slice(stop=4)
Citations_raw_EPO['Citing_pub_date'] = citation_year

# Replace the remaining missing values by False and store every column as text
Citations_raw_EPO = Citations_raw_EPO.fillna(False).astype(str)

# Keep only citations whose cited application number starts with 'EP'
# (a preliminary reduction of the table size)
cites_ep_application = Citations_raw_EPO['Cited_App_nbr'].str.contains('^EP')
Citations_raw_EPO = Citations_raw_EPO.loc[cites_ep_application]

In [None]:
# Keep the first occurrence of every citation row and discard exact repeats
is_repeated_citation = Citations_raw_EPO.duplicated()
Citations_raw_EPO = Citations_raw_EPO.loc[~is_repeated_citation]

In [None]:
# Attach the patent characteristics (region, year, IPC) to the citing side of every citation.
# Inner join: citations whose citing application is not in the patent table are dropped.
part1 = pd.merge(
    Patents_EPO,
    Citations_raw_EPO,
    left_on='Application',
    right_on='Citing_app_nbr',
)

In [None]:
# 'Application' now duplicates 'Citing_app_nbr' (the future Source column), so it is removed
part1 = part1.drop('Application', axis=1)

In [None]:
# Label the citing side as the "Source" of each citation
source_side_names = {
    'Citing_app_nbr': 'Source',
    'Cited_App_nbr': 'Target',
    'Citing_pub_date': 'Citation Year',
    'IPC': 'Source IPC',
    'Region': 'Source Region',
    'Patent Year': 'Source Patent Year',
}
part1 = part1.rename(columns=source_side_names)

In [None]:
# Attach the patent characteristics to the cited side as well.
# Inner join: citations whose cited application is not in the patent table are dropped.
part2 = pd.merge(
    part1,
    Patents_EPO,
    left_on='Target',
    right_on='Application',
)

In [None]:
# 'Application' now duplicates the 'Target' column, so it is removed
part2 = part2.drop('Application', axis=1)

In [None]:
# Label the cited side as the "Target" of each citation
target_side_names = {
    'Region': 'Target Region',
    'Patent Year': 'Target Patent Year',
    'IPC': 'Target IPC',
}
part2 = part2.rename(columns=target_side_names)

In [None]:
# Re-order the columns: citation first, then the Source attributes, then the Target attributes
final_column_order = [
    'Source', 'Target', 'Citation Year',
    'Source IPC', 'Source Region', 'Source Patent Year',
    'Target IPC', 'Target Region', 'Target Patent Year',
]
part2 = part2.loc[:, final_column_order]

In [None]:
# Finalising the data: keep the first occurrence of every row
is_repeated_row = part2.duplicated()
Data_EPO = part2.loc[~is_repeated_row]

In [None]:
# Remove incomplete rows, then the rows whose Source or Target IPC is the placeholder 'n'
# (eliminates an error occurring at an IPC precision of n=1).
# NOTE(review): 'n' looks like the text 'nan' cut to one character; at other precisions the
# placeholder would presumably be 'na' or 'nan' and is not removed here - confirm.
Data = Data_EPO.dropna()
has_placeholder_ipc = (Data['Source IPC'] == 'n') | (Data['Target IPC'] == 'n')
Data = Data.loc[~has_placeholder_ipc]

We hence conduct the rest of our analysis with only a single comprehensive table: `Data`. On occasion we may also take advantage of the simpler structure of `Patents`. For this reason both are shown below:

### Data 

In [None]:
# Display the consolidated citation table (one row per Source -> Target citation and IPC pairing)
Data

### Patents

In [None]:
# `Patents` is referred to in the text and in later cells, but the cleaned patent table was
# built under the name `Patents_EPO`; the alias makes both names refer to the same table.
Patents = Patents_EPO
Patents

We now prepare some useful lists and dictionaries to gain some preliminary insights into the prepared data and allow more concise manipulations hereafter.

In [None]:
def _sorted_unique(columns):
    '''
    Returns the sorted list of distinct values found across the given columns of `Data`.
    '''
    return sorted(set(np.array(Data[columns]).flatten()))


Present_IPCs = _sorted_unique(['Source IPC', 'Target IPC'])                       # All IPC classes present
Present_Dates_Pat = _sorted_unique(['Source Patent Year', 'Target Patent Year'])  # All patent years present
Present_Dates_Cit = _sorted_unique(['Citation Year'])                             # All citation years present
Present_Regions = _sorted_unique(['Source Region', 'Target Region'])              # All regions present

# Each IPC class mapped to its position in the sorted list (used as a row index later on)
Present_IPCs_Keys = dict(zip(Present_IPCs, np.arange(len(Present_IPCs))))

print(
    'Number of IPCs:', len(Present_IPCs), '|',
    'Number of Dates (Patents):', len(Present_Dates_Pat), '|',
    'Number of Dates (Citations):', len(Present_Dates_Cit), '|',
    'Number of Regions:', len(Present_Regions),
)

In [None]:
# One row per distinct (patent, year, IPC) combination, taken from both sides of every citation.
# The table is built directly with flat column names. The previous version passed a nested
# list as `columns` (which creates MultiIndex columns) and flattened the result by copying it
# through the system clipboard, which needs a clipboard and overwrites its contents.
Patents_dates = pd.DataFrame({
    'Patent': list(Data['Source']) + list(Data['Target']),
    'Date': list(Data['Source Patent Year']) + list(Data['Target Patent Year']),
    'IPC': list(Data['Source IPC']) + list(Data['Target IPC']),
}).drop_duplicates().reset_index(drop=True)

# Explicit column types: identifiers and IPC codes as text, years as integers
# (the yearly selections below compare 'Date' with int(year)).
Patents_dates['Patent'] = Patents_dates['Patent'].astype(str)
Patents_dates['Date'] = Patents_dates['Date'].astype(int)
Patents_dates['IPC'] = Patents_dates['IPC'].astype(str)

### Number of patents

In [None]:
# Number of patents per IPC class (rows) and patent year (columns)
ipc_names = list(Present_IPCs_Keys)  # IPC codes in the row order of the storing array
Storing_array = np.zeros((len(ipc_names), len(Present_Dates_Pat)))

for col, year in enumerate(Present_Dates_Pat):
    # Patents dated in this year, counted per IPC class
    yearly_counts = Patents_dates.loc[Patents_dates['Date'] == int(year), 'IPC'].value_counts()
    for ipc, count in yearly_counts.items():
        Storing_array[Present_IPCs_Keys[ipc], col] = count

plt.figure(figsize=(20, 10))

# One x position per column of the storing array. Using the years that are actually present
# (instead of every integer between the first and last year) keeps x and y the same length
# even when a year is missing from the data.
Dates_plotting = np.array(Present_Dates_Pat).astype(int)
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

# IPC class with the largest count in each year
Most_Cited = [ipc_names[np.argmax(Storing_array[:, col])] for col in range(len(Present_Dates_Pat))]

for row, ipc in enumerate(ipc_names):
    plt.plot(Dates_plotting, Storing_array[row], color=Colors[row], label=f'{ipc} : {translate_ipc(ipc)[0]}')

for col, year in enumerate(Dates_plotting):
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)
    highest_point = Storing_array[:, col].max()  # Largest count of this year
    # White label shifted slightly left and down so that it sits on the marker drawn below
    plt.text(year - 0.19, highest_point + -3.8, Most_Cited[col], fontsize=12, zorder=20, color='White')
    # Marker at the highest point, in the color of the leading class
    plt.scatter(year, highest_point, color=Colors[Present_IPCs_Keys[Most_Cited[col]]], s=200, zorder=10)

leg = plt.legend(loc='upper right', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white', fontsize=12)
for line in leg.get_lines():
    line.set_linewidth(10)  # Thick legend handles so the colors are readable

plt.ylabel('Number of patents approved for designated year', fontsize=15)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT1' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

### Number of patents (Normalised stack)

In [None]:
# Number of patents per IPC class (rows) and patent year (columns)
ipc_names = list(Present_IPCs_Keys)  # IPC codes in the row order of the storing array
Storing_array = np.zeros((len(ipc_names), len(Present_Dates_Pat)))

for col, year in enumerate(Present_Dates_Pat):
    # Patents dated in this year, counted per IPC class
    yearly_counts = Patents_dates.loc[Patents_dates['Date'] == int(year), 'IPC'].value_counts()
    for ipc, count in yearly_counts.items():
        Storing_array[Present_IPCs_Keys[ipc], col] = count

# Turn every year (column) into shares that sum to 1; years without any patent stay at zero
yearly_totals = Storing_array.sum(axis=0)
has_patents = yearly_totals != 0
Storing_array[:, has_patents] = Storing_array[:, has_patents] / yearly_totals[has_patents]

plt.figure(figsize=(20, 5))

# One x position per column of the storing array. Using the years that are actually present
# (instead of every integer between the first and last year) keeps x and y the same length
# even when a year is missing from the data.
Dates_plotting = np.array(Present_Dates_Pat).astype(int)
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

# IPC class with the largest share in each year
Most_Cited = [ipc_names[np.argmax(Storing_array[:, col])] for col in range(len(Present_Dates_Pat))]

plt.stackplot(Dates_plotting, Storing_array, labels=Present_IPCs_Keys.keys(), colors=Colors)

for year in Dates_plotting:
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)

plt.ylabel('Share', fontsize=15)
# Format the ticks as percentages with a formatter, so the labels follow the ticks if the
# axis is rescaled (fixed tick labels would not)
plt.gca().yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1, decimals=0))

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT2' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

# Kept for the cross-correlation with the citation shares further down
Storing_array_PAT = Storing_array

# Number of patents correlation matrix

In [None]:
# Pairwise correlation between the yearly series of the IPC classes
# (one column per IPC class, one row per year)
matrix_pd = pd.DataFrame(Storing_array.T, columns=Present_IPCs)
matrix_corr = matrix_pd.corr()
matrix_plot = matrix_corr.to_numpy()
labels = list(matrix_corr.columns)

plt.figure(figsize=(10, 10), dpi=100)
plt.imshow(matrix_plot, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')

# Write every coefficient inside its cell
for (row, col), coefficient in np.ndenumerate(matrix_plot):
    plt.text(col, row, f'{coefficient:.2f}', ha='center', va='center', fontsize=14, color='black')

tick_positions = np.arange(len(labels))
plt.xticks(tick_positions, labels, fontsize=14)
plt.yticks(tick_positions, labels, fontsize=14)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT3' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

### Number of citations

In [None]:
# Number of citations received per IPC class (rows) and citation year (columns)
ipc_names = list(Present_IPCs_Keys)  # IPC codes in the row order of the storing array
Storing_array = np.zeros((len(ipc_names), len(Present_Dates_Cit)))

for col, year in enumerate(Present_Dates_Cit):
    # Citations made in this year, counted per (Source IPC, Target IPC) pair
    yearly_pairs = Data.loc[Data['Citation Year'] == year, ['Source IPC', 'Target IPC']].value_counts()
    Counts = [(source, target, count) for (source, target), count in yearly_pairs.items()]

    # Weighted IPC-to-IPC graph of the year
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(Counts)
    # Citations within the same IPC class are not counted; the self-loops are listed first
    # so that the graph is not modified while it is being iterated
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))

    # Weighted in-degree = number of citations received from other classes
    for ipc, received in graph.in_degree(weight='weight'):
        Storing_array[Present_IPCs_Keys[ipc], col] = received

plt.figure(figsize=(20, 10))

# One x position per column of the storing array. Using the years that are actually present
# (instead of every integer between the first and last year) keeps x and y the same length
# even when a year is missing from the data.
Dates_plotting = np.array(Present_Dates_Cit).astype(int)
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

# Most cited IPC class of each year
Most_Cited = [ipc_names[np.argmax(Storing_array[:, col])] for col in range(len(Present_Dates_Cit))]

for row, ipc in enumerate(ipc_names):
    plt.plot(Dates_plotting, Storing_array[row], color=Colors[row], label=f'{ipc} : {translate_ipc(ipc)[0]}')

for col, year in enumerate(Dates_plotting):
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)
    highest_point = Storing_array[:, col].max()  # Largest count of this year
    # White label shifted slightly left and down so that it sits on the marker drawn below
    plt.text(year - 0.19, highest_point + -8, Most_Cited[col], fontsize=12, zorder=20, color='White')
    # Marker at the highest point, in the color of the leading class
    plt.scatter(year, highest_point, color=Colors[Present_IPCs_Keys[Most_Cited[col]]], s=200, zorder=10)

leg = plt.legend(loc='upper right', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white', fontsize=12)
for line in leg.get_lines():
    line.set_linewidth(10)  # Thick legend handles so the colors are readable

plt.ylabel('Number of citations occurring for designated year', fontsize=15)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT4' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

### Number of citations (Normalised stack)

In [None]:
# Number of citations received per IPC class (rows) and citation year (columns)
ipc_names = list(Present_IPCs_Keys)  # IPC codes in the row order of the storing array
Storing_array = np.zeros((len(ipc_names), len(Present_Dates_Cit)))

for col, year in enumerate(Present_Dates_Cit):
    # Citations made in this year, counted per (Source IPC, Target IPC) pair
    yearly_pairs = Data.loc[Data['Citation Year'] == year, ['Source IPC', 'Target IPC']].value_counts()
    Counts = [(source, target, count) for (source, target), count in yearly_pairs.items()]

    # Weighted IPC-to-IPC graph of the year
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(Counts)
    # Citations within the same IPC class are not counted; the self-loops are listed first
    # so that the graph is not modified while it is being iterated
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))

    # Weighted in-degree = number of citations received from other classes
    for ipc, received in graph.in_degree(weight='weight'):
        Storing_array[Present_IPCs_Keys[ipc], col] = received

# Turn every year (column) into shares that sum to 1; years without any citation stay at zero
yearly_totals = Storing_array.sum(axis=0)
has_citations = yearly_totals != 0
Storing_array[:, has_citations] = Storing_array[:, has_citations] / yearly_totals[has_citations]

plt.figure(figsize=(20, 5))

# One x position per column of the storing array. Using the years that are actually present
# (instead of every integer between the first and last year) keeps x and y the same length
# even when a year is missing from the data.
Dates_plotting = np.array(Present_Dates_Cit).astype(int)
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

# IPC class with the largest share in each year
Most_Cited = [ipc_names[np.argmax(Storing_array[:, col])] for col in range(len(Present_Dates_Cit))]

plt.stackplot(Dates_plotting, Storing_array, labels=Present_IPCs_Keys.keys(), colors=Colors)

for year in Dates_plotting:
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)

plt.ylabel('Share', fontsize=15)
# Format the ticks as percentages with a formatter, so the labels follow the ticks if the
# axis is rescaled (fixed tick labels would not)
plt.gca().yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1, decimals=0))

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT5' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

# Kept for the cross-correlation with the patent shares further down
Storing_array_CIT = Storing_array

# Number of citations correlation matrix

In [None]:
# Pairwise correlation between the yearly series of the IPC classes
# (one column per IPC class, one row per year)
matrix_pd = pd.DataFrame(Storing_array.T, columns=Present_IPCs)
matrix_corr = matrix_pd.corr()
matrix_plot = matrix_corr.to_numpy()
labels = list(matrix_corr.columns)

plt.figure(figsize=(10, 10), dpi=100)
plt.imshow(matrix_plot, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')

# Write every coefficient inside its cell
for (row, col), coefficient in np.ndenumerate(matrix_plot):
    plt.text(col, row, f'{coefficient:.2f}', ha='center', va='center', fontsize=14, color='black')

tick_positions = np.arange(len(labels))
plt.xticks(tick_positions, labels, fontsize=14)
plt.yticks(tick_positions, labels, fontsize=14)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT6' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

# Cross-correlation

In [None]:
# Yearly share series, one column per IPC class
matrix_pd_PAT = pd.DataFrame(Storing_array_PAT.T, columns=Present_IPCs)
matrix_pd_CIT = pd.DataFrame(Storing_array_CIT.T, columns=Present_IPCs)
labels = Present_IPCs

# Correlation of every patent series (rows) with every citation series (columns).
# NOTE(review): the two tables are paired by row position, not by calendar year; this is
# only the same thing if patent years and citation years cover the same range - confirm.
ipc_total = len(Present_IPCs)
matrix_PC = np.zeros((ipc_total, ipc_total))
for row, patent_ipc in enumerate(Present_IPCs):
    patent_series = matrix_pd_PAT[patent_ipc]
    for col, citation_ipc in enumerate(Present_IPCs):
        matrix_PC[row][col] = patent_series.corr(matrix_pd_CIT[citation_ipc])

plt.figure(figsize=(10, 10), dpi=100)
plt.imshow(matrix_PC, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')

# Write every coefficient inside its cell
for (row, col), coefficient in np.ndenumerate(matrix_PC):
    plt.text(col, row, f'{coefficient:.2f}', ha='center', va='center', fontsize=14, color='black')

plt.xlabel('Citation count', fontsize=14)
plt.ylabel('Patent count', fontsize=14)
tick_positions = np.arange(len(labels))
plt.xticks(tick_positions, labels, fontsize=14)
plt.yticks(tick_positions, labels, fontsize=14)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT7' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

In [None]:
# Directed graph between IPC classes, weighted by the cross-correlation shifted from
# [-1, 1] to [0, 2] so that the weights are non-negative.
# `Weighted_Adjacency` is a constructor: it returns a new graph, so its result has to be
# assigned (calling it on an existing graph and discarding the result leaves that graph
# without edges). Directed is its default mode.
# Undefined correlations (NaN) are replaced by 0, which creates no edge.
shifted_correlation = np.nan_to_num(matrix_PC + 1)
graph = ig.Graph.Weighted_Adjacency(shifted_correlation.tolist())
graph.vs['name'] = Present_IPCs  # Vertices follow the row/column order of the matrix
graph.es['weight']

In [None]:
# plt.get_cmap is used because the plt.cm.get_cmap accessor is no longer available in
# recent Matplotlib versions
colormap = plt.get_cmap('coolwarm')

# In-degrees of the graph being drawn (computed here so that the cell does not depend on
# values left behind by another cell)
in_degrees = graph.degree(mode='in')
max_in_degrees = max(in_degrees, default=0) or 1  # "or 1" avoids a division by zero below

edge_weights = graph.es['weight']
heaviest_edge = max(edge_weights, default=0) or 1

fig, ax = plt.subplots(figsize=(5, 5))

ig.plot(
    graph,
    layout=graph.layout(layout='reingold_tilford_circular'),
    target=ax,
    vertex_size=[s * 0.015 + 0.005 for s in in_degrees],
    mark_groups=True,
    vertex_frame_width=1.0,
    vertex_frame_color='White',
    vertex_label=graph.vs["name"],
    vlabel_size=10.0,
    vertex_color=[colormap(d/max_in_degrees) for d in in_degrees],
    edge_width=[w*0.01 + 0.05 for w in edge_weights],
    vertex_label_color='White',
    edge_color=[colormap(w/heaviest_edge) for w in edge_weights],
    edge_arrow_size=[w*0.00001 + 0.001 for w in edge_weights]
)

fig.set_size_inches(20, 20)
plt.show()

# Characteristics (UNUSED)

In [None]:
# Patent-to-patent citations with the year of both patents, de-duplicated and stored as text
characteristic_columns = ['Source', 'Target', 'Source Patent Year', 'Target Patent Year']
Data_CHA = Data.loc[:, characteristic_columns].drop_duplicates().astype(str)

In [None]:
av_clust = []      # Average clustering of the network accumulated up to each year
av_clust_ind = []  # Average clustering of the network of each year taken on its own
edges_source = []  # Sources of all edges accumulated so far
edges_target = []  # Targets of all edges accumulated so far

# The cumulative network is grown year by year instead of being rebuilt from scratch
cumulative_graph = nx.DiGraph()

for year in Present_Dates_Pat:
    # Data_CHA stores every column as text, so the year is compared as text as well
    Data_CHA_u = Data_CHA.loc[Data_CHA['Source Patent Year'] == str(year)]

    edges_source_ind = list(Data_CHA_u['Source'])
    edges_target_ind = list(Data_CHA_u['Target'])
    edges_ind = list(zip(edges_source_ind, edges_target_ind))

    # Network of this year only; a year without citations has no clustering value (NaN)
    yearly_graph = nx.DiGraph(edges_ind)
    av_clust_ind.append(nx.average_clustering(yearly_graph) if len(yearly_graph) else np.nan)

    # Network of all years up to and including this one
    edges_source.extend(edges_source_ind)
    edges_target.extend(edges_target_ind)
    cumulative_graph.add_edges_from(edges_ind)
    av_clust.append(nx.average_clustering(cumulative_graph) if len(cumulative_graph) else np.nan)

# Same names as before for anything that reads them afterwards
edges = list(zip(edges_source, edges_target))
graph = cumulative_graph
    

In [None]:
plt.figure(figsize=(20,5))

# One x position per clustering value. Using the years that are actually present (instead of
# every integer between the first and last year) keeps x and y the same length even when a
# year is missing from the data.
Dates_plotting = np.array(Present_Dates_Pat).astype(int)
plt.plot(Dates_plotting, av_clust, color='black', label='Cumulative')
plt.plot(Dates_plotting, av_clust_ind, color='grey', label='Independent')

for year in Dates_plotting:
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)

leg = plt.legend(loc='upper right', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white', fontsize=12)
for line in leg.get_lines():
    line.set_linewidth(10)  # Thick legend handles so the colors are readable
plt.show()

# Specific (UNUSED)

In [None]:
# Citations made by one selected IPC class, per cited IPC class (rows) and citation year (columns)
ipc_names = list(Present_IPCs_Keys)  # IPC codes in the row order of the storing array
Storing_array = np.zeros((len(ipc_names), len(Present_Dates_Cit)))
IPC_Selected = Present_IPCs[0] # Select the IPC

for col, year in enumerate(Present_Dates_Cit):
    # Citations made in this year by the selected class, counted per (Source IPC, Target IPC) pair
    made_this_year = (Data['Citation Year'] == year) & (Data['Source IPC'] == IPC_Selected)
    yearly_pairs = Data.loc[made_this_year, ['Source IPC', 'Target IPC']].value_counts()
    Counts = [(source, target, count) for (source, target), count in yearly_pairs.items()]

    # Weighted graph of the year; citations to the selected class itself are kept here
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(Counts)

    # Weighted in-degree = number of citations received from the selected class
    for ipc, received in graph.in_degree(weight='weight'):
        Storing_array[Present_IPCs_Keys[ipc], col] = received

plt.figure(figsize=(20,10),facecolor='lightgrey')

# One x position per column of the storing array. Using the years that are actually present
# (instead of every integer between the first and last year) keeps x and y the same length
# even when a year is missing from the data.
Dates_plotting = np.array(Present_Dates_Cit).astype(int)
Colors = pl.cm.cividis(np.linspace(0,1,len(Present_IPCs))) # Colors for plotting

# Most cited IPC class of each year
Most_Cited = [ipc_names[np.argmax(Storing_array[:, col])] for col in range(len(Present_Dates_Cit))]

for row, ipc in enumerate(ipc_names):
    if ipc != IPC_Selected:
        plt.plot(Dates_plotting, Storing_array[row], color=Colors[row], label=ipc)
    else:
        # The selected class itself is drawn as a dotted blue line
        plt.plot(Dates_plotting, Storing_array[row], color='Blue', linestyle=':', label=ipc)

# The labels are written just above the overall maximum of the plot
overall_peak = max(Storing_array.flatten())
label_height = overall_peak + overall_peak/100

for col, year in enumerate(Dates_plotting):
    plt.axvline(x=year, color=Colors[Present_IPCs_Keys[Most_Cited[col]]], linewidth=0.5)
    plt.text(year-0.2, label_height, Most_Cited[col], fontsize=10)

plt.legend(frameon=False,loc='upper left', bbox_to_anchor=(1, 1), borderaxespad=2)
plt.title(f'Most prevalent citation classifications per year for class {IPC_Selected}')
plt.xlabel('Year')
plt.ylabel('Number of citations from specific year')
plt.show()

# Mapping IPC presence (UNUSED)

In [None]:
# NUTS region boundaries
shapefile = gpd.read_file("/Users/joebacchus/Desktop/CASA/NUTS Shapefiles/NUTS_RG_20M_2021_3035.shp")

# Keep the level-3 regions of the selected country ("UK" can be modified)
in_selected_country = shapefile['CNTR_CODE'] == "UK"
shapefile_UK = shapefile.loc[in_selected_country]
at_selected_level = shapefile_UK['LEVL_CODE'] == 3
shapefile_UK = shapefile_UK.loc[at_selected_level][['NUTS_ID','geometry']]
shapefile_UK = shapefile_UK.rename(columns={'NUTS_ID':'Region','geometry':'geometry'})

In [None]:
# Region codes in the row order of the shapefile
Regions_shapefile = shapefile_UK['Region'].tolist()

In [None]:
IPC_Selected = 'B01' # Select the IPC
# NOTE(review): IPC codes are cut to n characters earlier in the notebook, so this code
# presumably has to have the same length to match anything - confirm.

# Number of patent rows of the selected IPC class in every region.
# `Patents_EPO` is the patent table built above (the name `Patents` is not defined at this point).
region_counts = Patents_EPO.loc[Patents_EPO['IPC'] == IPC_Selected, 'Region'].value_counts()

# Same order as the shapefile rows; regions without the selected class get the marker value -1000
Storing_array = [int(region_counts.get(region, -1000)) for region in Regions_shapefile]
shapefile_UK['Values'] = Storing_array # Assigning to the values

# Plotting
fig, ax = plt.subplots(figsize=(5, 5))
cmap = 'Spectral_r'
# Color scale symmetric around zero, so the -1000 marker falls at (or below) the low end
norm = plt.Normalize(-shapefile_UK['Values'].max(), shapefile_UK['Values'].max())
shapefile_UK.plot(ax=ax, column='Values', cmap=cmap, linewidth=0.1, edgecolor='black', norm=norm)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
# The color bar has to be created before it can be labelled
cbar = plt.colorbar(sm, ax=ax, fraction=0.02, pad=0.02)
cbar.set_label('Values')

ax.axis("off")
plt.show()

# Graphing

### Citations unnormalised total

In [None]:
# Number of citations per (Source IPC, Target IPC) pair over the whole data set
pair_counts = Data[['Source IPC','Target IPC']].value_counts()
Counts_prelim = pair_counts.to_dict()

# Citations within the same IPC class are left out. The edges and their weights are filtered
# together so that every weight stays attached to its own edge (assigning the full list of
# counts after removing the self-loops would no longer match the remaining edges).
cross_class_counts = [(pair, int(count)) for pair, count in pair_counts.items() if pair[0] != pair[1]]

graph = ig.Graph(directed=True)
graph.add_vertices(Present_IPCs)
graph.add_edges([pair for pair, _ in cross_class_counts])
graph.es['weight'] = [count for _, count in cross_class_counts]

to_delete_ids = [v.index for v in graph.vs if v.degree() == 0] # Delete isolated nodes
graph.delete_vertices(to_delete_ids)

# Remove the vertex that stands for a missing IPC code (the text 'nan' cut to n characters),
# but only when it is present: deleting an unknown vertex name is an error.
missing_ipc_marker = 'nan'[:n]
if missing_ipc_marker in graph.vs['name']:
    graph.delete_vertices(missing_ipc_marker)

communities = graph.community_edge_betweenness(weights=graph.es['weight'])
communities = communities.as_clustering()
bridges = graph.bridges()

in_degrees = graph.degree(mode='in')
max_in_degrees = max(in_degrees)

In [None]:
# Imported here because the label outline below needs it and this cell runs before the
# notebook's own import of withStroke
from matplotlib.patheffects import withStroke

# Colormap for node and edge colors (plt.get_cmap is used because the plt.cm.get_cmap
# accessor is no longer available in recent Matplotlib versions)
colormap = plt.get_cmap('coolwarm')

fig, ax = plt.subplots(figsize=(5, 5))

# The layout is computed once and shared by the drawing and the labels, so that the labels
# are guaranteed to sit on the vertices they describe
layout = graph.layout(layout='reingold_tilford_circular')
edge_weights = graph.es['weight']
heaviest_edge = max(edge_weights)

ig.plot(
    graph,
    layout=layout,
    target=ax,
    vertex_size=[s * 0.003 + 0.02 for s in in_degrees],
    mark_groups=True,
    vertex_frame_width=1.0,
    vertex_frame_color='White',
    vlabel_size=10.0,
    vertex_color=[colormap(d/max_in_degrees) for d in in_degrees],
    edge_width=[w*0.001 + 0.05 for w in edge_weights],
    vertex_label_color='White',
    edge_color=[colormap(w/heaviest_edge) for w in edge_weights],
    edge_arrow_size=[w*0.0000005 + 0.001 for w in edge_weights]
)

# The vertex labels are drawn by hand, slightly above every vertex
for v, label, size in zip(layout, graph.vs["name"], in_degrees):
    shift = 0.002*size + 0.03  # Larger vertices push their label further up
    ax.text(v[0], v[1] + shift, label, color='black', fontsize=15, ha='center', va='center', fontweight='heavy', path_effects=[withStroke(linewidth=1, foreground='black')])

fig.set_size_inches(20, 20)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOTIPC2UNORM' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

### Citations unnormalised yearly

In [None]:
# Citations received per IPC class (rows) for every window of patent years (columns)
# NOTE(review): the columns are sized by the citation years while the loop runs over the
# patent years; this only fits if there are at least as many citation years - confirm which
# year axis is intended.
Storing_array = np.zeros((len(Present_IPCs_Keys),len(Present_Dates_Cit))) # Initialising empty storing array

# Width of the sliding window, in patent years. It is set before the loop because the loop
# range depends on it.
range_y = 1

# A window of width range_y has len - range_y + 1 positions, so the last year is included
for y in tqdm(range(len(Present_Dates_Pat) - range_y + 1)):

    Years_Selected = Present_Dates_Pat[y:y+range_y]
    Data_Y = Data.loc[Data['Source Patent Year'].isin(Years_Selected)]

    # Number of citations per (Source IPC, Target IPC) pair within the window
    Counts_prelim = dict(Data_Y[['Source IPC','Target IPC']].value_counts())

    # Citations within the same IPC class are left out. Edges and weights are filtered
    # together so that every weight stays attached to its own edge.
    cross_class_counts = [(pair, int(count)) for pair, count in Counts_prelim.items() if pair[0] != pair[1]]
    if not cross_class_counts:
        continue  # No citation between different classes in this window: the column stays at zero

    graph = ig.Graph(directed=True)
    graph.add_vertices(Present_IPCs)
    graph.add_edges([pair for pair, _ in cross_class_counts])
    graph.es['weight'] = [count for _, count in cross_class_counts]

    to_delete_ids = [v.index for v in graph.vs if v.degree() == 0] # Delete isolated nodes
    graph.delete_vertices(to_delete_ids)

    in_degrees = graph.degree(mode='in')
    max_in_degrees = max(in_degrees, default=0)

    # Weighted in-degree of every vertex = number of citations received from other classes
    # (the vertices carry no 'weight' attribute of their own; the weights live on the edges)
    in_strengths = graph.strength(mode='in', weights='weight')
    extract_information = list(zip(graph.vs['name'], in_strengths))

    # Storing all values into the storing array
    for ipc, received in extract_information:
        Storing_array[Present_IPCs_Keys[ipc]][y] = received

In [None]:
plt.figure(figsize=(20, 10))

# One x position per column of the storing array. Using the years that are actually present
# (instead of every integer between the first and last year) keeps x and y the same length
# even when a year is missing from the data.
Dates_plotting = np.array(Present_Dates_Cit).astype(int)
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

ipc_names = list(Present_IPCs_Keys)  # IPC codes in the row order of the storing array

# IPC class with the largest value in each column
Most_Cited = [ipc_names[np.argmax(Storing_array[:, col])] for col in range(len(Present_Dates_Cit))]

for row, ipc in enumerate(ipc_names):
    if str(ipc) in Most_Cited: # Only the classes that lead in at least one year are drawn
        translated = str(translate_ipc(ipc)[0])[:22] # Reducing to 22 characters
        plt.scatter(Dates_plotting, Storing_array[row], color=Colors[row], s=20, label=f'{ipc} : {translated}')

for col, year in enumerate(Dates_plotting):
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)
    highest_point = Storing_array[:, col].max()  # Largest value of this column
    # Label slightly left of and above the marker drawn below
    plt.text(year-0.4, highest_point+0.15, Most_Cited[col], fontsize=12, zorder=20)
    # Marker at the highest point, in the color of the leading class
    plt.scatter(year, highest_point, color=Colors[Present_IPCs_Keys[Most_Cited[col]]], s=200, zorder=10)

plt.legend(loc='upper left', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white')
plt.title('Ratio of citations')
plt.xlabel('Year', fontsize=15)
plt.ylabel('Importance', fontsize=15)
plt.show()

# Graphing normalised

In [None]:
from matplotlib.patheffects import withStroke

In [None]:
# Build the directed IPC -> IPC citation graph over the whole of `Data`.
# Each distinct (Source IPC, Target IPC) pair becomes one edge whose weight is
# the number of rows in which that pair occurs.
Counts_prelim = dict(Data[['Source IPC','Target IPC']].value_counts())

graph = ig.Graph(directed=True)
graph.add_vertices(Present_IPCs)
graph.add_edges(list(Counts_prelim.keys()))

# Adding weights
# The weights are attached while the edge list still matches Counts_prelim
# one-to-one. simplify() and delete_vertices() below both drop edges, so
# assigning the full count list afterwards would put counts on the wrong edges.
graph.es['weight'] = [int(count) for count in Counts_prelim.values()]
graph.simplify(combine_edges='sum') # Remove self loops, surviving edges keep their weight

to_delete_ids = [v.index for v in graph.vs if v.degree() == 0] # Delete isolated nodes
graph.delete_vertices(to_delete_ids)
if 'nan' in graph.vs['name']: # It may already have been removed as an isolated node
    graph.delete_vertices('nan')

communities = graph.community_edge_betweenness(weights=graph.es['weight'])
communities = communities.as_clustering()
bridges = graph.bridges()

in_degrees = graph.degree(mode='in')
max_in_degrees = max(in_degrees)

##### Normalising

In [None]:
# Tables
# Count-to-degree ratio (CDR) of every vertex: its in-degree in `graph`
# divided by the number of rows of `Data` in which it is the source IPC.
source_counts = Data['Source IPC'].value_counts()
IPC_counts = pd.DataFrame({'IPC': list(source_counts.index), 'Count': list(source_counts.values)})
IPC_in_degrees = pd.DataFrame({'IPC': list(graph.vs['name']), 'In Degrees': list(in_degrees)})

# The merge starts from the vertex table and is a left join, so the rows stay
# in graph.vs order and there is exactly one row per vertex. This is what makes
# the assignment to graph.vs['CDR'] below line up with the right vertices.
IPC_together = IPC_in_degrees.merge(IPC_counts, on='IPC', how='left')
IPC_together = IPC_together[['Count','In Degrees']].fillna(0).astype(int)

counts = IPC_together['Count'].to_numpy(dtype=float)
degrees = IPC_together['In Degrees'].to_numpy(dtype=float)
# A vertex that never appears as a source has no defined ratio; it is given 0
Count_Degree_Ratio = list(np.divide(degrees, counts, out=np.zeros_like(degrees), where=counts > 0))

graph.vs['CDR'] = np.array(Count_Degree_Ratio).astype(float)
max_CDR = max(graph.vs['CDR'])

In [None]:
# Draw the IPC citation graph with vertex size and colour driven by the
# count-to-degree ratio ('CDR'), and edge width/colour driven by edge weight.
# Define a colormap for node colors
colormap = plt.cm.get_cmap('coolwarm')  # Choose any colormap you like

fig, ax = plt.subplots(figsize=(5, 5))

# Computed once and shared by the drawing and by the text labels, so the
# labels are placed on exactly the coordinates that were drawn.
layout = graph.layout(layout='reingold_tilford_circular')
max_weight = max(graph.es['weight'])

ig.plot(
    graph,
    layout=layout,
    target=ax,
    vertex_size=[s * 0.03 + 0.02 for s in graph.vs['CDR']],
    mark_groups=True,
    vertex_frame_width=1.0,
    vertex_frame_color='white',
    vlabel_size=10.0,
    vertex_color=[colormap(d/max_CDR) for d in graph.vs['CDR']],
    edge_width=[w*0.001 + 0.05 for w in graph.es['weight']],
    vertex_label_color='white',
    edge_color=[colormap(w/max_weight) for w in graph.es['weight']],
    edge_arrow_size=[w*0.0000005 + 0.001 for w in graph.es['weight']]
)

for v, label, size in zip(layout, graph.vs["name"], graph.vs["CDR"]):
    shift = 0.01*size + 0.05  # Adjust the multiplier (0.01) to control the shift amount
    ax.text(v[0], v[1] + shift, label, color='black', fontsize=10, ha='center', va='center', fontweight='heavy', path_effects=[withStroke(linewidth=1, foreground='black')])

fig.set_size_inches(20, 20)

#file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOTIPC3' + ".png"
#plt.savefig(file_dir, dpi=300, transparent=False)

plt.show()

In [None]:
# Show the description(s) listed for IPC code 'H05'
translate_ipc('H05')

In [None]:
# Show the names of the vertices currently in `graph`
graph.vs['name']

# Graphing Normalised Yearly

In [None]:
# For every sliding window of `range_y` patent years, build the IPC citation
# graph of that window, compute each vertex's count-to-degree ratio (in-degree
# divided by the number of rows in which it is the source IPC) and store it in
# Storing_array[row of the IPC][window index]. The array is then log-scaled.
# NOTE(review): the columns are sized by Present_Dates_Cit while the windows are
# indexed over Present_Dates_Pat; columns no window writes to stay at epsilon - confirm.
Storing_array = np.zeros((len(Present_IPCs_Keys),len(Present_Dates_Cit))) # Initialising empty storing array

range_y = 5 # Select 5 years

for y in tqdm(range(len(Present_Dates_Pat)-range_y)):

    Years_Selected = Present_Dates_Pat[y:y+range_y]
    Data_Y = Data.loc[Data['Source Patent Year'].isin(Years_Selected)]

    # Graphing normalised

    Counts_prelim = dict(Data_Y[['Source IPC','Target IPC']].value_counts())

    graph = ig.Graph(directed=True)
    graph.add_vertices(Present_IPCs)
    graph.add_edges(list(Counts_prelim.keys()))

    # Adding weights
    # Attached before simplify() so every count sits on its own edge; assigning
    # them after edges have been dropped would misalign counts and edges.
    graph.es['weight'] = [int(count) for count in Counts_prelim.values()]
    graph.simplify(combine_edges='sum') # Remove self loops, surviving edges keep their weight
    to_delete_ids = [v.index for v in graph.vs if v.degree() == 0] # Delete isolated nodes
    graph.delete_vertices(to_delete_ids)

    if graph.vcount() == 0: # Nothing to store for a window without citations
        continue

    in_degrees = graph.degree(mode='in')
    max_in_degrees = max(in_degrees)

    ##### Normalising

    # Tables
    source_counts = Data_Y['Source IPC'].value_counts()
    IPC_counts = pd.DataFrame({'IPC': list(source_counts.index), 'Count': list(source_counts.values)})
    IPC_in_degrees = pd.DataFrame({'IPC': list(graph.vs['name']), 'In Degrees': list(in_degrees)})

    # Left join from the vertex table: rows stay in graph.vs order, one per
    # vertex, so the ratios below are assigned to the right vertices.
    IPC_together = IPC_in_degrees.merge(IPC_counts, on='IPC', how='left')
    IPC_together = IPC_together[['Count','In Degrees']].fillna(0).astype(int)

    counts = IPC_together['Count'].to_numpy(dtype=float)
    degrees = IPC_together['In Degrees'].to_numpy(dtype=float)
    # A vertex that never appears as a source in this window is given ratio 0
    Count_Degree_Ratio = list(np.divide(degrees, counts, out=np.zeros_like(degrees), where=counts > 0))

    graph.vs['CDR'] = np.array(Count_Degree_Ratio).astype(float)
    max_CDR = max(graph.vs['CDR'])

    extract_information = list(zip(list(graph.vs['name']),list(graph.vs['CDR'])))

    # Storing all values into the storing array
    for ipc_name, ratio in extract_information:
        Storing_array[Present_IPCs_Keys[ipc_name]][y] = ratio

epsilon = 0.001 # To avoid error in logarithmic plot
Storing_array[Storing_array == 0] = epsilon
Storing_array = np.log(Storing_array)

In [None]:
# Scatter the (log) citation-to-count ratio per year for every IPC class that
# holds the largest or the smallest value in at least one year, mark each
# year's largest value, and save the figure.
plt.figure(figsize=(20, 10))
citation_years = np.array(Present_Dates_Cit).astype(int)
Dates_plotting = np.arange(min(citation_years), max(citation_years) + 1) # Dates for plotting
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

# Position k in this list is paired with row k of Storing_array below
ipc_names = list(Present_IPCs_Keys.keys())

year_columns = [Storing_array.T[col] for col in range(len(Present_Dates_Cit))]
Most_Cited = [ipc_names[np.argmax(column)] for column in year_columns]
Least_Cited = [ipc_names[np.argmin(column)] for column in year_columns]

for row, ipc in enumerate(ipc_names):
    # One scatter per list the class appears in; a class present in both is drawn twice
    for leaders in (Most_Cited, Least_Cited):
        if str(ipc) in leaders: # Filtering only the most relevant
            translated = str(translate_ipc(ipc)[0])[:22] # Reducing to 22 characters
            plt.scatter(Dates_plotting, Storing_array[row], color=Colors[row], s=20, label=f'{ipc} : {translated}')

for col, year in enumerate(Dates_plotting):
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)
    highest_point = max(Storing_array[:, col])  # Largest value among all classes for this year
    leader = Most_Cited[col]

    plt.text(year-0.4, highest_point+0.15, leader, fontsize=12, zorder=20)
    # Large dot on the leading value, drawn above the small scatter points
    plt.scatter(year, highest_point, color=Colors[Present_IPCs_Keys[leader]], s=200, zorder=10)
    lowest_point = min(Storing_array[:, col])  # Smallest value for this year (not drawn)

#plt.legend(loc='upper left', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white') 
#plt.title('Ratio of citations')
#plt.xlabel('Year', fontsize=15)
plt.axhline(y=0, color='grey', linestyle='-', linewidth=1)
plt.ylabel('Citation to count ratio (Logarithmic)', fontsize=15)
plt.ylim(-5,3)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'PLOT_5_YEARLY' + ".png"
plt.savefig(file_dir, dpi=300, transparent=False)
#plt.yscale("log");
plt.show()

# Diversity

In [None]:
# Build the directed IPC -> IPC citation graph over the whole of `Data`.
# Each distinct (Source IPC, Target IPC) pair becomes one edge whose weight is
# the number of rows in which that pair occurs.
Counts_prelim = dict(Data[['Source IPC','Target IPC']].value_counts())

graph = ig.Graph(directed=True)
graph.add_vertices(Present_IPCs)
graph.add_edges(list(Counts_prelim.keys()))

# Adding weights
# The weights are attached while the edge list still matches Counts_prelim
# one-to-one. simplify() and delete_vertices() below both drop edges, so
# assigning the full count list afterwards would put counts on the wrong edges.
graph.es['weight'] = [int(count) for count in Counts_prelim.values()]
graph.simplify(combine_edges='sum') # Remove self loops, surviving edges keep their weight

to_delete_ids = [v.index for v in graph.vs if v.degree() == 0] # Delete isolated nodes
graph.delete_vertices(to_delete_ids)
if 'nan' in graph.vs['name']: # It may already have been removed as an isolated node
    graph.delete_vertices('nan')

communities = graph.community_edge_betweenness(weights=graph.es['weight'])
communities = communities.as_clustering()
bridges = graph.bridges()

in_degrees = graph.degree(mode='in')
max_in_degrees = max(in_degrees)

##### Diversity

In [None]:
def get_div_mes(IPC_chosen):
    '''
    Diversity measure of the IPC classes cited by `IPC_chosen`.

    Returns a tuple `(measure, diversity_spread)`.
    `diversity_spread` maps IPC classes to the natural log of the number of
    rows of `Data` in which `IPC_chosen` cites them (0 for classes that are
    not counted), ordered from the largest value to the smallest.
    `measure` is `(abs(1/(gradient+1)) - 1) * 100`, where `gradient` is taken
    from a straight-line fit through those ordered values.
    '''
    spread = dict(zip(list(Present_IPCs), np.zeros(len(Present_IPCs))))

    # NOTE(review): [1:] discards the single most frequent target, presumably
    # the class citing itself - confirm that it is always ranked first.
    target_counts = Data[Data['Source IPC'] == IPC_chosen]['Target IPC'].value_counts()[1:]
    for target, n_citations in dict(target_counts).items():
        spread[target] = np.log(n_citations)

    diversity_spread = dict(sorted(spread.items(), key=lambda item: item[1], reverse=True)) # The plot of diversity

    ranked_values = list(diversity_spread.values())
    fitted_line = np.poly1d(np.polyfit(np.arange(len(ranked_values)), ranked_values, 1)) # Best fit line
    best_fit_grad = list(fitted_line)[0]

    return (abs(1/(best_fit_grad+1))-1)*100, diversity_spread

In [None]:
# Diversity measure of every vertex, collected in graph.vs order and stored
# on the graph as the 'div' vertex attribute.
diversity_measures = []

for IPC_chosen in list(graph.vs['name']):
    measure, _ = get_div_mes(IPC_chosen)
    diversity_measures.append(measure)

graph.vs['div'] = diversity_measures
max_div = max(graph.vs['div'])

In [None]:
# (vertex name, diversity measure) pairs, in graph.vs order
associations_div_mes = list(zip(graph.vs['name'], diversity_measures))

In [None]:
# Draw the IPC citation graph with vertex size and colour driven by the
# diversity measure ('div') and edge width/colour driven by edge weight.
colormap = plt.cm.get_cmap('coolwarm')  # Choose any colormap you like

fig, ax = plt.subplots(figsize=(5, 5))

div_values = graph.vs['div']
edge_weights = graph.es['weight']
heaviest_edge = max(edge_weights)

vertex_sizes = [value * 0.1 + 0.05 for value in div_values]
vertex_colors = [colormap(value/max_div) for value in div_values]
edge_widths = [weight*0.005 + 0.01 for weight in edge_weights]
edge_colors = [colormap(weight/heaviest_edge) for weight in edge_weights]
arrow_sizes = [weight*0.00001 + 0.001 for weight in edge_weights]

ig.plot(
    graph,
    layout=graph.layout(layout='reingold_tilford_circular'),
    target=ax,
    vertex_size=vertex_sizes,
    mark_groups=True,
    vertex_frame_width=1.0,
    vertex_frame_color='White',
    vertex_label=graph.vs["name"],
    vlabel_size=10.0,
    vertex_color=vertex_colors,
    edge_width=edge_widths,
    vertex_label_color='White',
    edge_color=edge_colors,
    edge_arrow_size=arrow_sizes
)

fig.set_size_inches(20, 20)
plt.show()

In [None]:
# Plot the ranked diversity spread of one example IPC class as labelled
# vertical stems and save it on a transparent background.
IPC_s = 'F04'
# Evaluated once: every call to get_div_mes re-filters `Data` and refits the line
measure, spread = get_div_mes(IPC_s)
diversity_measure = np.array(measure)
plt.figure(figsize=(20,5))

func_x = np.arange(len(spread))
func_y = list(spread.values())
names = list(spread.keys())
plt.scatter(func_x,func_y, color='white')

for x, y, name in zip(func_x, func_y, names):
    plt.vlines(x, 0, y, linestyle='-', colors='white', alpha=0.5)
    plt.text(x-0.3, y+0.2, name, rotation=90,color='white')

plt.axis('off')
#plt.legend(frameon=False,fontsize=12)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'plotsEXAMPLEIPCDIV2' + ".png"
plt.savefig(file_dir, dpi=300, transparent=True)

plt.show()

In [None]:
# Print the `topx` IPC classes with the largest diversity measure, each with
# its measure and description.
topx=10
ranked_div = sorted(dict(zip(graph.vs['name'],graph.vs['div'])).items(), key=lambda x:x[1], reverse=True)
top_div = dict(ranked_div[:topx])
top_div_IPCs = list(top_div.keys())
top_div_values = list(top_div.values())

div_descending = sorted(graph.vs['div'],reverse=True)
for i, ipc in enumerate(top_div_IPCs):
    print(div_descending[i])
    print(ipc)
    print(list(translate_ipc(ipc)))

In [None]:
# Print the `bottomx` IPC classes with the smallest diversity measure and their descriptions.
bottomx=10
ranked_div = sorted(dict(zip(graph.vs['name'],graph.vs['div'])).items(), key=lambda x:x[1], reverse=True)
bottom_div = dict(ranked_div[-bottomx:])
bottom_div_IPCs = list(bottom_div.keys())
bottom_div_values = list(bottom_div.values())

for ipc in bottom_div_IPCs:
    description = translate_ipc(ipc)
    print(ipc,description)

# Evolution of patents

In [None]:
# Count the distinct (Application, Patent Year) rows for every year in
# Present_Dates_Pat, keeping both the per-year count and the running total.
Patents_ind = (Patents[['Application','Patent Year']].drop_duplicates()).astype(str)
year_selected = 2001
Patents_cumu = 0
Patents_cumu_ar = []
Patents_increase = 0
Patents_increase_ar = []

for patent_year in Present_Dates_Pat:
    # Years are compared as text because Patents_ind was cast to str above
    in_year = Patents_ind['Patent Year'] == str(patent_year)
    Patents_increase = len(Patents_ind.loc[in_year])
    Patents_cumu += Patents_increase
    Patents_increase_ar.append(Patents_increase)
    Patents_cumu_ar.append(Patents_cumu)

In [None]:
# Line plot of the number of patents counted for each year (Patents_increase_ar).
plt.figure(figsize=(20, 5))
patent_years = np.array(Present_Dates_Pat).astype(int)
Dates_plotting = np.arange(min(patent_years), max(patent_years) + 1) # Dates for plotting
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

#plt.plot(Dates_plotting,Patents_cumu_ar,color='black')
plt.plot(Dates_plotting,Patents_increase_ar,color='black')

#plt.legend(loc='upper right', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white') 
plt.xlabel('Year', fontsize=15)
# The plotted series counts patents per year, not citations
plt.ylabel('Number of patents for designated year', fontsize=15)
plt.show()

# Normalised share of patents

In [None]:
# For every year, store the weighted in-degree of each IPC class in that
# year's citation graph (self citations removed), rescale every year's column
# to sum to 1 and draw the result as a stacked area plot.
Storing_array = np.zeros((len(Present_IPCs_Keys),len(Present_Dates_Pat))) # Initialising empty storing array

for i, year in enumerate(Present_Dates_Pat):

    # Creating IPC Citation relationships
    Data_select = Data.loc[Data['Citation Year'] == year] # Rows whose citation year is this year
    Counts_prelim = dict(Data_select[['Source IPC','Target IPC']].value_counts()) # Converting multiple citation counts into weights
    Counts = [(source, target, weight) for (source, target), weight in Counts_prelim.items()] # Formatted correctly

    # Creating graph
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(Counts)
    graph.remove_edges_from(nx.selfloop_edges(graph)) # NOT ACCOUNTING FOR SELF CITATIONS
    extract_information = list(graph.in_degree(weight='weight')) # Accounting for the counts

    # Storing all values into the storing array
    for ipc, weighted_in_degree in extract_information:
        Storing_array[Present_IPCs_Keys[ipc]][i] = weighted_in_degree

# Each `column` is a view into Storing_array, so the in-place write rescales the array itself
for column in Storing_array.T:
    column_total = np.sum(column)
    if column_total != 0:
        column[:] = column/column_total

plt.figure(figsize=(20, 5))
patent_years = np.array(Present_Dates_Pat).astype(int)
Dates_plotting = np.arange(min(patent_years), max(patent_years) + 1) # Dates for plotting
Colors = pl.cm.coolwarm(np.linspace(0, 1, len(Present_IPCs))) # Colors for plotting

ipc_names = list(Present_IPCs_Keys.keys())
Most_Cited = [ipc_names[np.argmax(Storing_array.T[col])] for col in range(len(Present_Dates_Pat))]

plt.stackplot(Dates_plotting,Storing_array,labels=Present_IPCs_Keys.keys(),colors = Colors)

for year in Dates_plotting:
    plt.axvline(x=year, color='black', linestyle=':', linewidth=0.5)
    #highest_point = 2  # Get the highest point for this instance
    #plt.text(Dates_plotting[k], 1+0.01, Most_Cited[k], fontsize=12, zorder=20, color='Black')  # Adjust the y coordinate to move the text above the highest point and set zorder to 10
    #plt.scatter(Dates_plotting[k], highest_point, color=Colors[Present_IPCs_Keys[Most_Cited[k]]], s=200, zorder=10)  # Add a point at the highest point and set zorder to 10

plt.legend(loc='upper right', borderaxespad=0.1, fancybox=True, facecolor='white', frameon=True, framealpha=1, edgecolor='white') 
#plt.xlabel('Year', fontsize=15)
#plt.ylabel('Number of citations occurring for designated year', fontsize=15)
plt.axis('off')
plt.show()

# Fix from EPO

In [None]:
Data = Data.astype(str)

In [None]:
Data['Citation Year'] = Data['Citation Year'].astype(int)
Data['Source Patent Year'] = Data['Source Patent Year'].astype(int)
Data['Target Year'] = Data['Target Patent Year'].astype(int)

In [None]:
Citations_unfiltered = Data[['Source','Target','Citation Year']]

In [None]:
all_unique_EPO_patents = list(set(list(Data['Source'])+list(Data['Target'])))

In [None]:
Patents_for_EPO = pd.concat([Data[['Source','Source IPC','Source Region','Source Patent Year']], Data[['Target','Target IPC','Target Region','Target Patent Year']].rename(columns={'Target':'Source','Target IPC':'Source IPC','Target Region':'Source Region','Target Patent Year':'Source Patent Year'})], axis=0)
Patents_for_EPO = Patents_for_EPO.drop_duplicates()

In [None]:
Patents_for_EPO = Patents_for_EPO.rename(columns={'Source':'Application','Region':'Region','Source Patent Year':'Patent Year','Source IPC':'IPC'})

In [None]:
Patents = Patents_for_EPO

# 3D Plotting

In [None]:
# Remove every space from the IPC codes of both tables
for frame, ipc_column in ((Data, 'Source IPC'), (Data, 'Target IPC'), (Patents, 'IPC')):
    frame[ipc_column] = frame[ipc_column].str.replace(" ", "")

In [None]:
#Citations_unfiltered = Citations_raw.rename(columns={'source':'Source','target':'Target','time':'Citation Year'})

In [None]:
# Undirected patent-level graph: one edge per (Source, Target) citation pair
graph = nx.Graph()
citation_pairs = zip(Citations_unfiltered['Source'], Citations_unfiltered['Target'])
graph.add_edges_from(citation_pairs)

In [None]:
# Attach a 'date' and an 'IPC' attribute to every node, then keep the largest
# connected component as the working `graph`.
all_patents = list(graph.nodes())

# Every node starts at 1970; nodes that appear as a citation source are then
# overwritten with a citation year (the last one listed for that source).
dates_for_patents = dict(zip(Citations_unfiltered['Source'], Citations_unfiltered['Citation Year']))
nx.set_node_attributes(graph, 1970, name="date")
nx.set_node_attributes(graph, dates_for_patents, name="date")

# All distinct IPC codes of an application joined into one ', '-separated string
IPC_for_pat = Patents[['Application','IPC']].drop_duplicates()
IPC_for_pat = IPC_for_pat.groupby('Application')['IPC'].apply(', '.join).reset_index()
IPC_codes = dict(zip(IPC_for_pat['Application'], IPC_for_pat['IPC']))
nx.set_node_attributes(graph, ' ', name="IPC")
nx.set_node_attributes(graph, IPC_codes, name="IPC")

graph_orig = graph
ccs = sorted(nx.connected_components(graph_orig), key=len, reverse=True) # Largest component first
largest_ccs = graph_orig.subgraph(ccs[0])
graph = largest_ccs

In [None]:
# 3D position of every node: the 2D Kamada-Kawai layout supplies the first and
# third coordinates, and the node's 'date' attribute supplies the second.
pos_spring = nx.kamada_kawai_layout(graph)
node_dates = nx.get_node_attributes(graph, 'date')
# Looked up by node rather than by list position, and built once instead of
# rebuilding the node, layout and date lists for every node.
pos = {node: (pos_spring[node][0], node_dates[node], pos_spring[node][1]) for node in tqdm(graph.nodes())}
nx.set_node_attributes(graph, pos, name="pos")

In [None]:
# Number of nodes in the current `graph`
len(graph.nodes())

In [None]:
def network_plot_3D(G, angle, save):
    '''
    Draw `G` as a 3D scatter of nodes joined by their edges.

    Each node is placed at its 'pos' attribute, coloured by the second 'pos'
    coordinate (normalised between the smallest and largest 'date' in `G`) and
    sized by its degree. Each edge is coloured by the second coordinate of its
    second endpoint; coordinates below 1970 are drawn at 1970 and the edge is
    made fainter.

    Parameters
    ----------
    G : networkx graph whose nodes all carry 'pos' (3 values) and 'date' attributes.
    angle : azimuth of the camera, passed on as `ax.view_init(30, angle)`.
    save : any value other than `False` writes the figure to the hard-coded
        PNG path and closes it before `plt.show()` is reached.
    '''

    # extracting info
    pos = nx.get_node_attributes(G, 'pos')
    dates = list(nx.get_node_attributes(G, 'date').values())

    # Invariants of both drawing loops, computed once instead of per node/edge
    first_date = min(dates)
    last_date = max(dates)
    date_norm = mpl.colors.Normalize(vmin=first_date, vmax=last_date)

    with plt.style.context(('ggplot')):

        fig = plt.figure(figsize=(20,15))
        # add_subplot is used because Figure.gca() no longer accepts keyword
        # arguments in recent matplotlib releases
        ax = fig.add_subplot(111, projection='3d', facecolor='white')

        for key, (xi, yi, zi) in pos.items(): # nodes
            relative_output = (10+10*G.degree(key)) # Marker size grows with the node degree
            ax.scatter(xi, yi, zi, c=yi, norm=date_norm, cmap = 'coolwarm', s=relative_output, edgecolors='k', alpha=0.9, zorder=10)

            # Dotted drop line from the node down to z = -1
            ax.plot(np.array((xi,xi)), np.array((yi,yi)), np.array((-1,zi)), c='white', alpha=0.5, linewidth=0.5, linestyle=':')

        c_selection = mpl.cm.coolwarm(np.linspace(0,1,int(abs(last_date-first_date))+1)) # Edge Colors

        for first_node, second_node in G.edges(): # edges
            x = np.array((pos[first_node][0], pos[second_node][0]))
            z = np.array((pos[first_node][2], pos[second_node][2]))
            y_first = pos[first_node][1]
            y_second = pos[second_node][1]

            # Colour is picked from the unclamped coordinate of the second endpoint
            edge_color = c_selection[int(abs(y_second-first_date))]

            # Each edge is drawn exactly once. Endpoints below 1970 are clamped
            # to 1970 and the edge is drawn fainter.
            clamped = False
            if y_first < 1970:
                print('occuring 1')
                y_first = 1970
                clamped = True
            if y_second < 1970:
                print('occuring 2')
                y_second = 1970
                clamped = True

            ax.plot(x, np.array((y_first, y_second)), z, c=edge_color, alpha=0.1 if clamped else 0.5, linewidth=0.5)

    # viewpoint
    #ax.axes.set_ylim3d(bottom=1970, top=2017) 
    ax.axes.set_zlim3d(bottom=-1, top=1) 

    # ax.xaxis / yaxis / zaxis are the same objects the older w_xaxis aliases pointed at
    ax.xaxis.set_pane_color((0.33, 0.33, 0.33, 1.0))
    ax.yaxis.set_pane_color((0.33, 0.33, 0.33, 1.0))
    ax.zaxis.set_pane_color((0.33, 0.33, 0.33, 1.0))

    ax.xaxis._axinfo["grid"]['color'] = mpl.colors.to_rgba('lightgrey', alpha=0.3)
    ax.yaxis._axinfo["grid"]['color'] = mpl.colors.to_rgba('lightgrey', alpha=0.3)
    ax.zaxis._axinfo["grid"]['color'] = mpl.colors.to_rgba('lightgrey', alpha=0.3)

    ax.yaxis.set_major_locator(MaxNLocator(integer=True, nbins=30))

    ax.set_xticks([])
    ax.set_zticks([])
    ax.set_xticklabels([])
    ax.set_zticklabels([])

    ax.set_box_aspect([1,3,1])
    ax.view_init(30, angle)

    if save is not False:
        file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots"+'3DPLOT1'+".png"
        plt.savefig(file_dir, dpi=300)
        plt.close("all")
        print('Done.')

    plt.show()

    return

In [None]:
# Draw the current `graph` with the camera azimuth at 155, without saving it to disk
network_plot_3D(graph, 155, False)

# Visible clusters

In [None]:
# One independent copy per connected component of the full patent graph,
# ordered from the component with the most nodes to the one with the fewest.
components = nx.connected_components(graph_orig)
S = [graph_orig.subgraph(nodes).copy() for nodes in components]
S_sorted = sorted(S, key=len, reverse=True)

In [None]:
# Histogram of the node 'date' values for each of the (up to) six largest
# connected components, one component per panel of a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(21, 14))
ax = axes.flatten()  # Flatten the 2D array of axes for easy access

# zip() stops at the shorter input, so having fewer than six components simply
# leaves the remaining panels empty instead of raising IndexError.
for panel, cluster in zip(ax, S_sorted):
    graph = nx.Graph(cluster)
    dates_for_cluster = list(nx.get_node_attributes(graph, 'date').values())
    min_date_value = min(dates_for_cluster)
    max_date_value = max(dates_for_cluster)

    # One bin per year of the covered range, and never fewer than one bin
    n_bins = max(1, len(np.arange(min_date_value, max_date_value)))
    panel.hist(dates_for_cluster, bins=n_bins, color='black')
    panel.set_ylabel('Count')
    panel.set_ylim(0, 50)
    panel.set_xlim(1979, 2020)
    panel.xaxis.set_major_locator(MaxNLocator(integer=True))
    panel.yaxis.set_major_locator(MaxNLocator(integer=True))
    panel.locator_params(axis='x', nbins=20)
    panel.locator_params(axis='y', nbins=20)
    panel.grid(True, which='both', linestyle='-', linewidth=0.5, color='lightgrey', alpha=0.6)

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots"+'barplots'+".png"
plt.tight_layout()
plt.savefig(file_dir, dpi=300)
plt.show()

In [None]:
# Create a 2x3 grid of subplots: one Kamada-Kawai drawing per connected
# component (up to six, largest first), with nodes coloured by their date.
colormap = plt.cm.get_cmap('coolwarm_r')
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(42, 28))

# 'date' value carried by nodes that never received a citation year
placeholder_date = 1970

# Slicing keeps the loop safe when there are fewer than six components
for i, cluster in enumerate(S_sorted[:6]):
    panel = axes[i // 3, i % 3]
    graph = nx.Graph(cluster)
    graph.remove_edges_from(nx.selfloop_edges(graph))

    pos_spring = nx.kamada_kawai_layout(graph)
    date_label_pos = {node: (x, y + 0.05) for node, (x, y) in pos_spring.items()}
    ipc_label_pos = {node: (x, y - 0.05) for node, (x, y) in pos_spring.items()}

    date_labels = nx.get_node_attributes(graph, 'date')
    dates = list(date_labels.values())
    # The placeholder is removed explicitly: a set has no defined order, so
    # slicing off "the first element" of it would drop an arbitrary date.
    dates_arranged = sorted(set(dates) - {placeholder_date})

    min_date_value = min(dates_arranged)
    max_date_value = max(dates_arranged)
    min_date_nodes = [node for node, date in date_labels.items() if date < min_date_value]
    emphasised = set(min_date_nodes)

    ipc_labels = nx.get_node_attributes(graph, 'IPC')
    ipcs = list(ipc_labels.values())

    # A component whose remaining dates are all equal would otherwise divide by zero
    date_span = (max_date_value - min_date_value) or 1
    normalized_dates = [(date - min_date_value) / date_span for date in dates]
    node_colors = [colormap(norm_date) for norm_date in normalized_dates]

    nx.draw(graph,
            pos=pos_spring, 
            with_labels=False,
            node_color=node_colors,
            cmap=colormap, 
            edge_color='lightgrey',
            node_size=[30 if node in emphasised else 10 for node in graph.nodes()],
            width=1,
            arrowsize=10,
            font_color="black",
            font_size=5,
            ax=panel)

    #nx.draw_networkx_labels(graph, date_label_pos, date_labels, font_size=10, font_color='black', ax=axes[i // 3, i % 3])
    #nx.draw_networkx_labels(graph, ipc_label_pos, ipc_labels, font_size=10, font_color='black', ax=axes[i // 3, i % 3])

    avg_clustering_coefficient = nx.average_clustering(graph)
    avg_clustering_coefficient = 0 # The override keeps the annotation below switched off
    if avg_clustering_coefficient > 0:
        panel.text(0.8, 0.9, f'Average Clustering: {avg_clustering_coefficient:.3f}', 
                   transform=panel.transAxes,
                   fontsize=10, ha='right', va='center', bbox=dict(facecolor='white', alpha=0))

    # Add the smallest and largest dates at the top of the plot
    panel.text(0.9, 1, f'{min_date_value} – {max_date_value}', 
               transform=panel.transAxes,
               fontsize=10, ha='center', va='center')

    panel.set_aspect('equal')

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots"+'multiplotsDATED'+".png"
plt.tight_layout()
plt.savefig(file_dir, dpi=300, transparent=True)
plt.show()

In [None]:
# Create a 2x3 grid of subplots: one Kamada-Kawai drawing per connected
# component (up to six, largest first), with the nodes found in
# `biotech_patents_EPO` highlighted in yellow and all others in light grey.
colormap = plt.cm.get_cmap('coolwarm_r')
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))

# 'date' value carried by nodes that never received a citation year
placeholder_date = 1970

# Slicing keeps the loop safe when there are fewer than six components
for i, cluster in enumerate(S_sorted[:6]):
    panel = axes[i // 3, i % 3]
    graph = nx.Graph(cluster)
    graph.remove_edges_from(nx.selfloop_edges(graph))

    pos_spring = nx.kamada_kawai_layout(graph)
    date_label_pos = {node: (x, y + 0.05) for node, (x, y) in pos_spring.items()}
    ipc_label_pos = {node: (x, y - 0.05) for node, (x, y) in pos_spring.items()}

    date_labels = nx.get_node_attributes(graph, 'date')
    dates = list(date_labels.values())
    # The placeholder is removed explicitly: a set has no defined order, so
    # slicing off "the first element" of it would drop an arbitrary date.
    dates_arranged = sorted(set(dates) - {placeholder_date})

    min_date_value = min(dates_arranged)
    max_date_value = max(dates_arranged)
    min_date_nodes = [node for node, date in date_labels.items() if date < min_date_value]
    emphasised = set(min_date_nodes)

    ipc_labels = nx.get_node_attributes(graph, 'IPC')
    ipcs = list(ipc_labels.values())

    # A component whose remaining dates are all equal would otherwise divide by zero
    date_span = (max_date_value - min_date_value) or 1
    normalized_dates = [(date - min_date_value) / date_span for date in dates]
    # Only the biotech highlight is used for the node colours in this figure
    node_colors = [(1,1,0,1) if node in biotech_patents_EPO else 'lightgrey' for node in graph.nodes()]

    nx.draw(graph,
            pos=pos_spring, 
            with_labels=False,
            node_color=node_colors,
            cmap=colormap, 
            edge_color='lightgrey',
            node_size=[40 if node in emphasised else 20 for node in graph.nodes()],
            width=1.5,
            arrowsize=10,
            font_color="white",
            font_size=5,
            ax=panel)

    #nx.draw_networkx_labels(graph, date_label_pos, date_labels, font_size=10, font_color='black', ax=axes[i // 3, i % 3])
    #nx.draw_networkx_labels(graph, ipc_label_pos, ipc_labels, font_size=10, font_color='black', ax=axes[i // 3, i % 3])

    avg_clustering_coefficient = nx.average_clustering(graph)
    avg_clustering_coefficient = 0 # The override keeps the annotation below switched off
    if avg_clustering_coefficient > 0:
        panel.text(0.8, 0.9, f'Average Clustering: {avg_clustering_coefficient:.3f}', 
                   transform=panel.transAxes,
                   fontsize=10, ha='right', va='center', bbox=dict(facecolor='white', alpha=0))

    # Add the smallest and largest dates at the top of the plot
    panel.text(0.9, 1, f'{min_date_value} – {max_date_value}', 
               transform=panel.transAxes,
               fontsize=10, ha='center', va='center', color='white')

    panel.set_aspect('equal')

file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots"+'multiplotsALT3'+".png"
plt.tight_layout()
plt.savefig(file_dir, dpi=300, transparent=True)
plt.show()

In [None]:
# Highlight if biotech! reduce IPC number

In [None]:
# IPC code prefixes used to flag a patent as biotechnology.
# The contiguous sub-group runs are generated instead of being typed out.
# NOTE(review): the 'G01N33..' entries carry no '/', unlike the other sub-group
# entries such as 'G01N27/327' - confirm against the IPC format used in the data.
biotech_classes = [
    'A01H1',
    'A01H4',
    'A01K67',
    'A61K48',
    'C12M',
    'C12N',
    'C12P',
    'C12Q',
    'G01N27/327',
    'C07K4',
    'C07K14',
    'C07K16',
    'C07K17',
    'C07K19',
    'C40B10',
    'G01N3353',
    'G01N3354',
    'G01N3355',
    'G01N3357',
    'G01N3368',
    'G01N3374',
    'G01N3376',
    'G01N3378',
    'G01N3388',
    'G01N3392',
    'C07G11',
    'C07G13',
    'C07G15',
    'C02F3/34',
    'A61K38',
    'A61K39',
    'C40B50/06',
    *[f'A61K35/{subgroup}' for subgroup in range(12, 80)],      # A61K35/12 ... A61K35/79
    *[f'C40B40/{subgroup:02d}' for subgroup in range(2, 9)],    # C40B40/02 ... C40B40/08
    *[f'G06F19/{subgroup}' for subgroup in range(10, 19)],      # G06F19/10 ... G06F19/18
    *[f'G06F19/{subgroup}' for subgroup in range(20, 25)],      # G06F19/20 ... G06F19/24
]


In [None]:
#biotech_classes = ["A01H001", "A01H004", "A01K067", "A01K035/12", "A01K035/13", "A01K035/14","A01K035/15", "A01K035/16", "A01K035/17", "A01K035/18", "A01K035/19", "A01K035/20", "A01K035/21", "A01K035/22", "A01K035/23", "A01K035/24", "A01K035/25", "A01K035/26", "A01K035/27", "A01K035/28", "A01K035/29", "A01K035/30", "A01K035/31", "A01K305/32", "A01K035/33", "A01K035/34", "A01K035/35", "A01K035/36", "A01K035/37", "A01K035/38", "A01K035/39", "A01K035/40", "A01K035/41", "A01K035/42", "A01K035/43", "A01K035/44", "A01K035/45", "A01K035/46", "A01K035/47", "A01K035/48", "A01K035/49", "A01K035/50", "A01K035/51", "A01K035/52", "A01K035/53", "A01K035/54", "A01K035/55", "A01K035/56", "A01K035/57", "A01K035/58", "A01K035/59", "A01K035/60", "A01K035/61", "A01K035/62", "A01K035/63", "A01K035/64", "A01K035/65", "A01K035/66", "A01K035/67", "A01K035/68", "A01K035/69", "A01K035/70", "A01K035/71", "A01K035/72", "A01K035/73", "A01K035/74", "A01K035/75", "A01K035/76", "A01K035/77", "A01K035/78", "A01K035/79", "A61K38", "A61K039", "A16K048", "C02F003/34", "C07G011", "C07G013", "C07G015", "C07K004", "C07K014", "C07K016", "C07K017", "C07K019", "C12M", "C12N", "C12P", "C12Q", "C40B010", "C40B040/02", "C40B040/03","C40B040/04", "C40B040/05", "C40B040/06", "C40B040/07", "C40B040/08", "C40B050/06", "G01N027/327", "G01N033/53", "G01N033/54", "G01N033/55", "G01N033/57", "G01N033/68", "G01N033/74", "G01N033/76", "G01N033/78", "G01N033/88", "G01N033/92", "G06F019/10","G06F019/11", "G06F019/12", "G06F019/13", "G06F019/14", "G06F019/15", "G06F019/16", "G06F019/17", "G06F019/18", "G06F019/20", "G06F019/21", "G06F019/22", "G06F019/23", "G06F019/24"]

The following section is particularly messy and many imports are repeated.

# New EPO VERSION

In [None]:
# Collect every row of EPO_IPCs whose IPC code starts with one of the biotech
# class prefixes, in the order the prefixes are listed.
# NOTE(review): within this section EPO_IPCs is only read from disk in a later
# cell ("EPO VERSION"); it must already be loaded when this cell runs.
biotech_matches = []
for ipc_prefix in tqdm(biotech_classes):
    # Literal prefix test (no regex); na=False treats missing IPC values as
    # "no match" and yields a plain boolean mask.
    is_match = EPO_IPCs['IPC'].str.startswith(ipc_prefix, na=False)
    biotech_matches.append(EPO_IPCs.loc[is_match])

# Concatenate once at the end rather than re-copying the growing frame on every
# iteration; an empty class list still yields an empty DataFrame.
biotech_pd = (
    pd.concat(biotech_matches, ignore_index=True)
    if biotech_matches
    else pd.DataFrame()
)

#### EPO VERSION

In [None]:
# Read the pipe-delimited EPO IPC file and keep only the application id and
# IPC code columns.
EPO_IPCs = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Patent data/202001_EPO_IPC.txt',
    sep='|',
)[['appln_id', 'IPC']]

In [None]:
# Read the pipe-delimited EPO inventor file (the same file that is loaded as
# Patents_raw_EPO at the top of the notebook).
EPO_PATENTS = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Patent data/202001_EPO_Inventor_reg.txt',
    sep='|',
)

In [None]:
# Collect every row of EPO_IPCs whose IPC code starts with one of the biotech
# class prefixes, in the order the prefixes are listed.
biotech_matches = []
for ipc_prefix in tqdm(biotech_classes):
    # Literal prefix test (no regex); na=False treats missing IPC values as
    # "no match" and yields a plain boolean mask.
    is_match = EPO_IPCs['IPC'].str.startswith(ipc_prefix, na=False)
    biotech_matches.append(EPO_IPCs.loc[is_match])

# Concatenate once at the end rather than re-copying the growing frame on every
# iteration; an empty class list still yields an empty DataFrame.
biotech_pd = (
    pd.concat(biotech_matches, ignore_index=True)
    if biotech_matches
    else pd.DataFrame()
)

In [None]:
# Distinct application numbers of EPO patents that have at least one biotech
# IPC row (joined to the patent table on the shared 'appln_id' column).
biotech_patents_EPO = list(
    set(EPO_PATENTS.merge(biotech_pd, on='appln_id')['app_nbr'])
)

#### PCT VERSION (REDUNDANT!)

In [None]:
# Read the pipe-delimited PCT IPC file.
PCT_IPCs = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Patent data/202001_PCT_IPC.txt',
    sep='|',
)

In [None]:
# Read the pipe-delimited EPO_PCT file, which is merged with the PCT IPC table
# in the next cell.
EPO_PCT = pd.read_csv(
    '/Users/joebacchus/Desktop/CASA/Data Original/Patent data/202001_EPO_PCT.txt',
    sep='|',
)

In [None]:
# Join the two tables on whatever columns they share (pandas' default for
# merge), then keep one row per distinct (app_nbr, IPC) pair.
PCT_IPC_USE = (
    EPO_PCT.merge(PCT_IPCs)
    .loc[:, ['app_nbr', 'IPC']]
    .drop_duplicates()
)

In [None]:
# Collect every row of PCT_IPC_USE whose IPC code starts with one of the
# biotech class prefixes, in the order the prefixes are listed.
# Note that this rebinds `biotech_pd`, replacing the EPO-based table.
biotech_matches = []
for ipc_prefix in tqdm(biotech_classes):
    # Literal prefix test (no regex); na=False treats missing IPC values as
    # "no match" and yields a plain boolean mask.
    is_match = PCT_IPC_USE['IPC'].str.startswith(ipc_prefix, na=False)
    biotech_matches.append(PCT_IPC_USE.loc[is_match])

# Concatenate once at the end rather than re-copying the growing frame on every
# iteration; an empty class list still yields an empty DataFrame.
biotech_pd = (
    pd.concat(biotech_matches, ignore_index=True)
    if biotech_matches
    else pd.DataFrame()
)

In [None]:
# Distinct application numbers that have at least one biotech IPC row in the
# PCT-based table built in the previous cell.
biotech_patents_PCT = list(set(biotech_pd['app_nbr']))

In [None]:
# Display how many distinct application numbers were flagged via the PCT IPC codes.
len(biotech_patents_PCT)

In [None]:
# Display how many distinct application numbers were flagged via the EPO IPC codes.
len(biotech_patents_EPO)

In [None]:
# Plot the graph S_sorted[3] with a Kamada-Kawai layout. Nodes are coloured by
# their 'date' attribute (normalised so 0 = earliest, 1 = latest) and the
# earliest node(s) are drawn larger. The figure is saved as a transparent PNG.
#
# NOTE(review): the biotech markers prepared below (`nodeedge_colors`,
# `node_labels`) are NOT rendered at present: `with_labels` is False and no
# `edgecolors` argument is passed to nx.draw. Pass `edgecolors=nodeedge_colors`
# and/or set `with_labels=True` if the biotech highlight should be visible.

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9.
colormap = plt.get_cmap('coolwarm_r')
fig = plt.figure(figsize=(20, 20))

graph = S_sorted[3] # Select the biotech one
graph = nx.Graph(graph)  # work on an undirected copy

pos_spring = nx.kamada_kawai_layout(graph)
# Offset label positions (slightly above / below each node) for optional labels.
date_label_pos = {node: (x, y + 0.02) for node, (x, y) in pos_spring.items()}
ipc_label_pos = {node: (x, y - 0.05) for node, (x, y) in pos_spring.items()}

# Take the node -> attribute mappings directly from networkx instead of zipping
# the attribute values against graph.nodes(), which would pair values with the
# wrong nodes if any node lacked the attribute.
date_labels = nx.get_node_attributes(graph, 'date')
dates = list(date_labels.values())
dates_arranged = sorted(dates)

min_date_value = min(dates_arranged)
max_date_value = max(dates_arranged)
min_date_nodes = [node for node, date in date_labels.items() if date == min_date_value]

ipc_labels = nx.get_node_attributes(graph, 'IPC')
ipcs = list(ipc_labels.values())

# Scale dates to [0, 1] for the colormap; if every node shares one date the
# span is zero, so fall back to 0.0 rather than dividing by zero.
if max_date_value == min_date_value:
    normalized_dates = [0.0 for _ in dates]
else:
    date_span = max_date_value - min_date_value
    normalized_dates = [(date - min_date_value) / date_span for date in dates]
node_colors = [colormap(norm_date) for norm_date in normalized_dates]

# Build the lookup sets once: testing membership against the original lists
# would rescan them for every node.
biotech_nodes = set(biotech_patents_EPO)
earliest_nodes = set(min_date_nodes)

nodeedge_colors = [(0.33, 0.33, 0.33, 1) if node in biotech_nodes else 'lightgrey' for node in graph.nodes()]
# networkx expects `labels` to be a dict keyed by node, not a list.
node_labels = {node: '+' if node in biotech_nodes else '' for node in graph.nodes()}

nx.draw(graph,
        pos=pos_spring,
        with_labels=False,
        node_color=node_colors,
        cmap=colormap,
        edge_color='lightgrey',
        node_size=[80 if node in earliest_nodes else 40 for node in graph.nodes()],
        width=2,
        arrowsize=10,
        labels=node_labels,
        font_color="black",
        font_size=5)

plt.gca().set_aspect('equal')

# The average-clustering annotation is switched off for this figure. Set the
# flag to True to compute the coefficient and draw it (only when it is > 0).
show_avg_clustering = False
avg_clustering_coefficient = nx.average_clustering(graph) if show_avg_clustering else 0
if avg_clustering_coefficient > 0:
    plt.text(0.8, 0.9, f'Average Clustering: {avg_clustering_coefficient:.3f}',
             transform=plt.gca().transAxes,
             fontsize=10, ha='right', va='center', bbox=dict(facecolor='white', alpha=0))

# NOTE(review): there is no '/' between "plots" and the file name, so this
# resolves to ".../PLOTS4LATEX/plotsplotsBIOTECH.png" - confirm that is intended.
file_dir = "/Users/joebacchus/Desktop/PLOTS4LATEX/plots" + 'plotsBIOTECH' + ".png"
plt.savefig(file_dir, dpi=300, transparent=True)
plt.show()