## Libraries

In [2]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
import spacy
import plotly.graph_objects as go
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import numpy as np

## Processing

In [3]:
# Make sure the output directory for the HTML heatmaps exists.
# exist_ok=True avoids the separate "exists?" check (and its race window).
os.makedirs('Similarity_html', exist_ok=True)

# Load the two classification tables from their Excel workbooks.
file_path = 'data/CIP_2023.7.xlsx'
data_cip = pd.read_excel(file_path)

file_path = 'data/JACS3.0_2023.7.xlsx'
data_jacs = pd.read_excel(file_path)

# spaCy pipeline used for lemmatisation in preprocess_text.
nlp = spacy.load('en_core_web_sm')

# English stop-word set used for filtering in preprocess_text.
# On a fresh environment the NLTK corpus may not be installed yet, in which
# case stopwords.words raises LookupError; fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one definition string before it is embedded.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, the result is run through the module-level
    spaCy pipeline ``nlp``, and the lemmas of all tokens whose text is not in
    the module-level ``stop_words`` set are joined with single spaces.

    Parameters
    ----------
    text : str or scalar
        Raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as empty text; other non-string scalars are converted with
        ``str`` first.

    Returns
    -------
    str
        The space-joined lemmas, or ``''`` for a missing value.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells are read as NaN, which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    doc = nlp(text)
    words = [token.lemma_ for token in doc if token.text not in stop_words]
    return ' '.join(words)

# Clean the free-text definitions of both tables. The cleaned text overwrites
# the 'Class definition' column, and a plain-list copy is kept for encoding.
definition_column = 'Class definition'

data_cip[definition_column] = data_cip[definition_column].apply(preprocess_text)
data_jacs[definition_column] = data_jacs[definition_column].apply(preprocess_text)

docs_cip = data_cip[definition_column].tolist()
docs_jacs = data_jacs[definition_column].tolist()

# Encode every definition with one shared sentence-embedding model
# (CIP first, then JACS).
sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings_cip, embeddings_jacs = (
    sentence_model.encode(docs) for docs in (docs_cip, docs_jacs)
)

## Biology

In [5]:
# Positional, half-open row ranges used for the Biology comparison.
cip_start, cip_end = 728, 820
jacs_start, jacs_end = 87, 223
cip_rows = slice(cip_start, cip_end)
jacs_rows = slice(jacs_start, jacs_end)

# Cosine similarity: one row per selected CIP entry, one column per JACS entry.
similarity_matrix = cosine_similarity(embeddings_cip[cip_rows], embeddings_jacs[jacs_rows])

# Column 0 of each table is used as the code, column 1 as the name.
cip_codes = data_cip.iloc[cip_rows, 0].tolist()
cip_names = data_cip.iloc[cip_rows, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_rows, 0].tolist()
jacs_names = data_jacs.iloc[jacs_rows, 1].tolist()

# Hover payload, laid out like the matrix: customdata[row][col] is
# [CIP code, CIP name, JACS code, JACS name].
customdata = [
    [[cip_code, cip_name, jacs_code, jacs_name]
     for jacs_code, jacs_name in zip(jacs_codes, jacs_names)]
    for cip_code, cip_name in zip(cip_codes, cip_names)
]

# Blue -> cream -> red colour ramp as (position, colour) stops.
colorscale = [
    [stop, colour] for stop, colour in (
        (0, 'rgb(30,70,110)'),
        (0.10, 'rgb(55,103,149)'),
        (0.20, 'rgb(82,143,173)'),
        (0.30, 'rgb(114,188,213)'),
        (0.38, 'rgb(170,220,224)'),
        (0.52, 'rgb(255,230,183)'),
        (0.64, 'rgb(255,208,111)'),
        (0.78, 'rgb(247,170,88)'),
        (0.88, 'rgb(239,138,71)'),
        (1, 'rgb(231,98,84)'),
    )
]

# One hover line per field; <extra></extra> hides the secondary hover box.
hover_template = '<br>'.join([
    'CIP Code: %{customdata[0]}',
    'CIP Name: %{customdata[1]}',
    'JACS Code: %{customdata[2]}',
    'JACS Name: %{customdata[3]}',
    'Similarity: %{z:.3f}<extra></extra>',
])

# Interactive heatmap with a fixed colour range of [-0.2, 1].
heatmap = go.Heatmap(
    z=similarity_matrix,
    customdata=customdata,
    colorscale=colorscale,
    zmin=-0.2,
    zmax=1,
    hoverongaps=False,
    hovertemplate=hover_template,
)
fig = go.Figure(data=heatmap)

# Transparent background, black text, tick labels hidden on both axes.
layout_options = {
    'title': 'Similarity Matrix (Biology)',
    'xaxis_title': 'JACS',
    'yaxis_title': 'CIP',
    'width': 900,
    'height': 500,
    'xaxis': {'showticklabels': False},
    'yaxis': {'showticklabels': False},
    'margin': dict(t=30, l=60, r=30, b=30),
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'font': dict(color='black'),
    'title_font_color': 'black',
}
fig.update_layout(**layout_options)

# Export the figure as an HTML file.
fig.write_html('Similarity_html/Biology_similarity.html')

## Computer_Science

In [6]:
# Positional, half-open row ranges used for the Computer Science comparison.
cip_start, cip_end = 231, 265
jacs_start, jacs_end = 584, 635
cip_rows = slice(cip_start, cip_end)
jacs_rows = slice(jacs_start, jacs_end)

# Cosine similarity: one row per selected CIP entry, one column per JACS entry.
similarity_matrix = cosine_similarity(embeddings_cip[cip_rows], embeddings_jacs[jacs_rows])

# Column 0 of each table is used as the code, column 1 as the name.
cip_codes = data_cip.iloc[cip_rows, 0].tolist()
cip_names = data_cip.iloc[cip_rows, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_rows, 0].tolist()
jacs_names = data_jacs.iloc[jacs_rows, 1].tolist()

# Hover payload, laid out like the matrix: customdata[row][col] is
# [CIP code, CIP name, JACS code, JACS name].
customdata = [
    [[cip_code, cip_name, jacs_code, jacs_name]
     for jacs_code, jacs_name in zip(jacs_codes, jacs_names)]
    for cip_code, cip_name in zip(cip_codes, cip_names)
]

# Blue -> cream -> red colour ramp as (position, colour) stops.
colorscale = [
    [stop, colour] for stop, colour in (
        (0, 'rgb(30,70,110)'),
        (0.10, 'rgb(55,103,149)'),
        (0.20, 'rgb(82,143,173)'),
        (0.30, 'rgb(114,188,213)'),
        (0.38, 'rgb(170,220,224)'),
        (0.52, 'rgb(255,230,183)'),
        (0.64, 'rgb(255,208,111)'),
        (0.78, 'rgb(247,170,88)'),
        (0.88, 'rgb(239,138,71)'),
        (1, 'rgb(231,98,84)'),
    )
]

# One hover line per field; <extra></extra> hides the secondary hover box.
hover_template = '<br>'.join([
    'CIP Code: %{customdata[0]}',
    'CIP Name: %{customdata[1]}',
    'JACS Code: %{customdata[2]}',
    'JACS Name: %{customdata[3]}',
    'Similarity: %{z:.3f}<extra></extra>',
])

# Interactive heatmap with a fixed colour range of [-0.2, 1].
heatmap = go.Heatmap(
    z=similarity_matrix,
    customdata=customdata,
    colorscale=colorscale,
    zmin=-0.2,
    zmax=1,
    hoverongaps=False,
    hovertemplate=hover_template,
)
fig = go.Figure(data=heatmap)

# Transparent background, black text, tick labels hidden on both axes.
layout_options = {
    'title': 'Similarity Matrix (Computer_Science)',
    'xaxis_title': 'JACS',
    'yaxis_title': 'CIP',
    'width': 900,
    'height': 500,
    'xaxis': {'showticklabels': False},
    'yaxis': {'showticklabels': False},
    'margin': dict(t=30, l=60, r=30, b=30),
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'font': dict(color='black'),
    'title_font_color': 'black',
}
fig.update_layout(**layout_options)

# Export the figure as an HTML file.
fig.write_html('Similarity_html/Computer_Science_similarity.html')

## Engineering_Science

In [7]:
# Positional, half-open row ranges used for the Engineering Science comparison.
cip_start, cip_end = 409, 468
jacs_start, jacs_end = 460, 583
cip_rows = slice(cip_start, cip_end)
jacs_rows = slice(jacs_start, jacs_end)

# Cosine similarity: one row per selected CIP entry, one column per JACS entry.
similarity_matrix = cosine_similarity(embeddings_cip[cip_rows], embeddings_jacs[jacs_rows])

# Column 0 of each table is used as the code, column 1 as the name.
cip_codes = data_cip.iloc[cip_rows, 0].tolist()
cip_names = data_cip.iloc[cip_rows, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_rows, 0].tolist()
jacs_names = data_jacs.iloc[jacs_rows, 1].tolist()

# Hover payload, laid out like the matrix: customdata[row][col] is
# [CIP code, CIP name, JACS code, JACS name].
customdata = [
    [[cip_code, cip_name, jacs_code, jacs_name]
     for jacs_code, jacs_name in zip(jacs_codes, jacs_names)]
    for cip_code, cip_name in zip(cip_codes, cip_names)
]

# Blue -> cream -> red colour ramp as (position, colour) stops.
colorscale = [
    [stop, colour] for stop, colour in (
        (0, 'rgb(30,70,110)'),
        (0.10, 'rgb(55,103,149)'),
        (0.20, 'rgb(82,143,173)'),
        (0.30, 'rgb(114,188,213)'),
        (0.38, 'rgb(170,220,224)'),
        (0.52, 'rgb(255,230,183)'),
        (0.64, 'rgb(255,208,111)'),
        (0.78, 'rgb(247,170,88)'),
        (0.88, 'rgb(239,138,71)'),
        (1, 'rgb(231,98,84)'),
    )
]

# One hover line per field; <extra></extra> hides the secondary hover box.
hover_template = '<br>'.join([
    'CIP Code: %{customdata[0]}',
    'CIP Name: %{customdata[1]}',
    'JACS Code: %{customdata[2]}',
    'JACS Name: %{customdata[3]}',
    'Similarity: %{z:.3f}<extra></extra>',
])

# Interactive heatmap with a fixed colour range of [-0.2, 1].
heatmap = go.Heatmap(
    z=similarity_matrix,
    customdata=customdata,
    colorscale=colorscale,
    zmin=-0.2,
    zmax=1,
    hoverongaps=False,
    hovertemplate=hover_template,
)
fig = go.Figure(data=heatmap)

# Transparent background, black text, tick labels hidden on both axes.
layout_options = {
    'title': 'Similarity Matrix (Engineering_Science)',
    'xaxis_title': 'JACS',
    'yaxis_title': 'CIP',
    'width': 900,
    'height': 500,
    'xaxis': {'showticklabels': False},
    'yaxis': {'showticklabels': False},
    'margin': dict(t=30, l=60, r=30, b=30),
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'font': dict(color='black'),
    'title_font_color': 'black',
}
fig.update_layout(**layout_options)

# Export the figure as an HTML file.
fig.write_html('Similarity_html/Engineering_Science_similarity.html')

## Math

In [8]:
# Positional, half-open row ranges used for the Math comparison.
cip_start, cip_end = 820, 837
jacs_start, jacs_end = 439, 459
cip_rows = slice(cip_start, cip_end)
jacs_rows = slice(jacs_start, jacs_end)

# Cosine similarity: one row per selected CIP entry, one column per JACS entry.
similarity_matrix = cosine_similarity(embeddings_cip[cip_rows], embeddings_jacs[jacs_rows])

# Column 0 of each table is used as the code, column 1 as the name.
cip_codes = data_cip.iloc[cip_rows, 0].tolist()
cip_names = data_cip.iloc[cip_rows, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_rows, 0].tolist()
jacs_names = data_jacs.iloc[jacs_rows, 1].tolist()

# Hover payload, laid out like the matrix: customdata[row][col] is
# [CIP code, CIP name, JACS code, JACS name].
customdata = [
    [[cip_code, cip_name, jacs_code, jacs_name]
     for jacs_code, jacs_name in zip(jacs_codes, jacs_names)]
    for cip_code, cip_name in zip(cip_codes, cip_names)
]

# Blue -> cream -> red colour ramp as (position, colour) stops.
colorscale = [
    [stop, colour] for stop, colour in (
        (0, 'rgb(30,70,110)'),
        (0.10, 'rgb(55,103,149)'),
        (0.20, 'rgb(82,143,173)'),
        (0.30, 'rgb(114,188,213)'),
        (0.38, 'rgb(170,220,224)'),
        (0.52, 'rgb(255,230,183)'),
        (0.64, 'rgb(255,208,111)'),
        (0.78, 'rgb(247,170,88)'),
        (0.88, 'rgb(239,138,71)'),
        (1, 'rgb(231,98,84)'),
    )
]

# One hover line per field; <extra></extra> hides the secondary hover box.
hover_template = '<br>'.join([
    'CIP Code: %{customdata[0]}',
    'CIP Name: %{customdata[1]}',
    'JACS Code: %{customdata[2]}',
    'JACS Name: %{customdata[3]}',
    'Similarity: %{z:.3f}<extra></extra>',
])

# Interactive heatmap with a fixed colour range of [-0.2, 1].
heatmap = go.Heatmap(
    z=similarity_matrix,
    customdata=customdata,
    colorscale=colorscale,
    zmin=-0.2,
    zmax=1,
    hoverongaps=False,
    hovertemplate=hover_template,
)
fig = go.Figure(data=heatmap)

# Transparent background, black text, tick labels hidden on both axes.
layout_options = {
    'title': 'Similarity Matrix (Math)',
    'xaxis_title': 'JACS',
    'yaxis_title': 'CIP',
    'width': 900,
    'height': 500,
    'xaxis': {'showticklabels': False},
    'yaxis': {'showticklabels': False},
    'margin': dict(t=30, l=60, r=30, b=30),
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'font': dict(color='black'),
    'title_font_color': 'black',
}
fig.update_layout(**layout_options)

# Export the figure as an HTML file.
fig.write_html('Similarity_html/Math_similarity.html')

## Pedagogy

In [9]:
# Positional, half-open row ranges used for the Pedagogy comparison.
jacs_start=1517
jacs_end=1550
cip_start=300
cip_end=408

# The CIP-by-JACS similarity matrix is transposed here, so each row of the
# heatmap is a JACS entry and each column is a CIP entry.
similarity_matrix = cosine_similarity(embeddings_cip[cip_start:cip_end], embeddings_jacs[jacs_start:jacs_end]).T

# Get codes and names (column 0 is used as the code, column 1 as the name)
cip_codes = data_cip.iloc[cip_start:cip_end, 0].tolist()
cip_names = data_cip.iloc[cip_start:cip_end, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_start:jacs_end, 0].tolist()
jacs_names = data_jacs.iloc[jacs_start:jacs_end, 1].tolist()

# Create interactive heatmap with Plotly
fig = go.Figure(data=go.Heatmap(
    z=similarity_matrix,
    colorscale=[
        [0, 'rgb(30,70,110)'],
        [0.10, 'rgb(55,103,149)'],
        [0.20, 'rgb(82,143,173)'],
        [0.30, 'rgb(114,188,213)'],
        [0.38, 'rgb(170,220,224)'],
        [0.52,'rgb(255,230,183)'],
        [0.64, 'rgb(255,208,111)'],
        [0.78, 'rgb(247,170,88)'],
        [0.88, 'rgb(239,138,71)'],
        [1, 'rgb(231,98,84)']
    ],
    # This cell uses a colour range of [0, 0.8].
    zmin=0,
    zmax=0.8,
    hoverongaps=False,
    hovertemplate=
    'CIP Code: %{customdata[0]}<br>' +
    'CIP Name: %{customdata[1]}<br>' +
    'JACS Code: %{customdata[2]}<br>' +
    'JACS Name: %{customdata[3]}<br>' +
    'Similarity: %{z:.3f}<extra></extra>'
))

# Create custom data for hover information.
# The nesting follows the transposed matrix (outer index = JACS row, inner
# index = CIP column), while each entry keeps the
# [CIP code, CIP name, JACS code, JACS name] order that the hover template's
# labels index into, so every label shows the value it names.
customdata = [[[cip_codes[j], cip_names[j], jacs_codes[i], jacs_names[i]]
              for j in range(len(cip_codes))]
              for i in range(len(jacs_codes))]
fig.data[0].customdata = customdata

# Update layout with transparent background and black text
fig.update_layout(
    title='Similarity Matrix (Pedagogy)',
    xaxis_title='CIP',
    yaxis_title='JACS',
    width=900,
    height=500,
    xaxis={'showticklabels': False},
    yaxis={'showticklabels': False},
    margin=dict(t=30, l=60, r=30, b=30),
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot area
    font=dict(color='black'),       # Black text for all labels
    title_font_color='black'        # Black text for title
)

# Save as HTML
fig.write_html('Similarity_html/Pedagogy_similarity.html')

## Physics

In [10]:
# Positional, half-open row ranges used for the Physics comparison.
cip_start, cip_end = 1029, 1073
jacs_start, jacs_end = 325, 438
cip_rows = slice(cip_start, cip_end)
jacs_rows = slice(jacs_start, jacs_end)

# Cosine similarity: one row per selected CIP entry, one column per JACS entry.
similarity_matrix = cosine_similarity(embeddings_cip[cip_rows], embeddings_jacs[jacs_rows])

# Column 0 of each table is used as the code, column 1 as the name.
cip_codes = data_cip.iloc[cip_rows, 0].tolist()
cip_names = data_cip.iloc[cip_rows, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_rows, 0].tolist()
jacs_names = data_jacs.iloc[jacs_rows, 1].tolist()

# Hover payload, laid out like the matrix: customdata[row][col] is
# [CIP code, CIP name, JACS code, JACS name].
customdata = [
    [[cip_code, cip_name, jacs_code, jacs_name]
     for jacs_code, jacs_name in zip(jacs_codes, jacs_names)]
    for cip_code, cip_name in zip(cip_codes, cip_names)
]

# Blue -> cream -> red colour ramp as (position, colour) stops.
colorscale = [
    [stop, colour] for stop, colour in (
        (0, 'rgb(30,70,110)'),
        (0.10, 'rgb(55,103,149)'),
        (0.20, 'rgb(82,143,173)'),
        (0.30, 'rgb(114,188,213)'),
        (0.38, 'rgb(170,220,224)'),
        (0.52, 'rgb(255,230,183)'),
        (0.64, 'rgb(255,208,111)'),
        (0.78, 'rgb(247,170,88)'),
        (0.88, 'rgb(239,138,71)'),
        (1, 'rgb(231,98,84)'),
    )
]

# One hover line per field; <extra></extra> hides the secondary hover box.
hover_template = '<br>'.join([
    'CIP Code: %{customdata[0]}',
    'CIP Name: %{customdata[1]}',
    'JACS Code: %{customdata[2]}',
    'JACS Name: %{customdata[3]}',
    'Similarity: %{z:.3f}<extra></extra>',
])

# Interactive heatmap with a fixed colour range of [-0.2, 1].
heatmap = go.Heatmap(
    z=similarity_matrix,
    customdata=customdata,
    colorscale=colorscale,
    zmin=-0.2,
    zmax=1,
    hoverongaps=False,
    hovertemplate=hover_template,
)
fig = go.Figure(data=heatmap)

# Transparent background, black text, tick labels hidden on both axes.
layout_options = {
    'title': 'Similarity Matrix (Physics)',
    'xaxis_title': 'JACS',
    'yaxis_title': 'CIP',
    'width': 900,
    'height': 500,
    'xaxis': {'showticklabels': False},
    'yaxis': {'showticklabels': False},
    'margin': dict(t=30, l=60, r=30, b=30),
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'font': dict(color='black'),
    'title_font_color': 'black',
}
fig.update_layout(**layout_options)

# Export the figure as an HTML file.
fig.write_html('Similarity_html/Physics_similarity.html')

## All

In [4]:
# Similarity heatmap over the full row ranges of both tables.
# Positional, half-open row ranges; slicing silently stops at the end of the
# data if a table has fewer rows than the upper bound.
jacs_start = 0
jacs_end = 1550
cip_start = 0
cip_end = 2117

# JACS embeddings are passed first, so each row of the matrix is a JACS entry
# and each column is a CIP entry.
similarity_matrix = cosine_similarity(embeddings_jacs[jacs_start:jacs_end], embeddings_cip[cip_start:cip_end])
# Round similarity values to 3 decimal places (same precision the hover text shows)
similarity_matrix = np.round(similarity_matrix, decimals=3)

# Get codes and names (column 0 is used as the code, column 1 as the name)
cip_codes = data_cip.iloc[cip_start:cip_end, 0].tolist()
cip_names = data_cip.iloc[cip_start:cip_end, 1].tolist()
jacs_codes = data_jacs.iloc[jacs_start:jacs_end, 0].tolist()
jacs_names = data_jacs.iloc[jacs_start:jacs_end, 1].tolist()

# Create interactive heatmap with Plotly
fig = go.Figure(data=go.Heatmap(
    z=similarity_matrix,
    colorscale=[
        [0, 'rgb(30,70,110)'],
        [0.10, 'rgb(55,103,149)'],
        [0.20, 'rgb(82,143,173)'],
        [0.30, 'rgb(114,188,213)'],
        [0.38, 'rgb(170,220,224)'],
        [0.52,'rgb(255,230,183)'],
        [0.64, 'rgb(255,208,111)'],
        [0.78, 'rgb(247,170,88)'],
        [0.88, 'rgb(239,138,71)'],
        [1, 'rgb(231,98,84)']
    ],
    zmin=-0.2,
    zmax=1,
    hoverongaps=False,
    hovertemplate=
    'CIP Code: %{customdata[0]}<br>' +
    'CIP Name: %{customdata[1]}<br>' +
    'JACS Code: %{customdata[2]}<br>' +
    'JACS Name: %{customdata[3]}<br>' +
    'Similarity: %{z:.3f}<extra></extra>'
))

# Create custom data as a numpy array.
# Its first two axes must match the matrix (outer index = JACS row, inner
# index = CIP column); each entry keeps the
# [CIP code, CIP name, JACS code, JACS name] order the hover template expects.
customdata = np.array([[[cip_codes[j], cip_names[j], jacs_codes[i], jacs_names[i]]
                       for j in range(len(cip_codes))]
                      for i in range(len(jacs_codes))])
fig.data[0].customdata = customdata

# Update layout with transparent background and black text.
# Axis titles follow the matrix orientation: columns (x) are CIP, rows (y) are JACS.
fig.update_layout(
    title='Similarity Matrix',
    xaxis_title='CIP',
    yaxis_title='JACS',
    width=2250,
    height=1200,
    xaxis={'showticklabels': False},
    yaxis={'showticklabels': False},
    margin=dict(t=30, l=60, r=30, b=30),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='black'),
    title_font_color='black'
)

# Save as an HTML fragment (full_html=False) that loads plotly.js from the CDN
fig.write_html(
    'Similarity_html/All_similarity.html',
    include_plotlyjs='cdn',
    full_html=False,
    include_mathjax=False,
    post_script=None,
    validate=False
)
