In [12]:
import pandas as pd

# Path (relative to the working directory) of the pickled DataFrame that holds the
# course data, including an "embedding" column with one vector per course.
DATAFRAME_PICKLE_PATH = "data/course_data_embedded.pkl"
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
course_df = pd.read_pickle(DATAFRAME_PICKLE_PATH)
# Show the first two rows as a quick sanity check of the loaded columns.
course_df.head(2)

Unnamed: 0,emne_id,emne_navn,emne_navn_eng,studiepoeng,emne_utbytte,emne_innhold,faculty_code,embedding
0,BE-002,Forkurs i bedriftsøkonomi for EMBA-studenter,Forkurs i bedriftsøkonomi for EMBA-studenter,0.0,"[Etter fullført emne skal studenten:, [ha gene...",[Emnet tilbys som et valgfritt forkurs til stu...,HH,"[-0.007871167734265327, -0.09496525675058365, ..."
1,BE-111,Innføring i finansregnskap,Introduction to Financial Accounting,7.5,[Etter å ha fullført dette emnet skal studente...,[Emnet gir en grunnleggende innføring i finans...,HH,"[0.005303630139678717, -0.07340927422046661, -..."


In [None]:
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

# Lookup table translating the short faculty codes stored in the course data
# into the English faculty names shown in the plot legend and hover text.
FACULTY_CODE_TO_NAME = dict(
    (
        ("TR", "Engineering and Science"),
        ("HI", "Health and Sport Science"),
        ("SV", "Social Sciences"),
        ("LU", "Teacher Education Unit"),
        ("KU", "Fine Arts"),
        ("HH", "School of Business and Law"),
        ("HP", "Humanities and Education"),
    )
)

# Build a function to plot an interactive t-SNE visualization
def plot_interactive_tsne(
    embeddings,
    faculty_codes,
    course_names,
    title="UiA Course Embeddings Interactive Plot",
    perplexity=30,
    random_state=41,
):
    """Project embeddings to 2D with t-SNE and return an interactive scatter plot.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_features)
        One high-dimensional vector per course.
    faculty_codes : sequence of length n_samples
        Label used to color each point and shown in the hover text.
    course_names : sequence of length n_samples
        Text shown in the hover text for each point.
    title : str
        Figure title.
    perplexity : int or float
        Requested t-SNE perplexity. It is lowered to ``n_samples - 1`` when the
        data set is too small for the requested value, because scikit-learn
        requires the perplexity to be smaller than the number of samples.
    random_state : int
        Seed passed to ``TSNE`` so repeated runs use the same initialization.

    Returns
    -------
    The figure object created by ``plotly.express.scatter``.

    Raises
    ------
    ValueError
        If fewer than two embeddings are supplied.
    """
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 embeddings to build a plot, got {n_samples}"
        )

    # Keep the requested perplexity whenever it is valid; only shrink it for
    # small inputs that would otherwise make TSNE reject the parameter.
    effective_perplexity = min(perplexity, n_samples - 1)

    # The embeddings are high-dimensional, we need to reduce them to 2D for visualization
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=random_state)
    tsne_results = tsne.fit_transform(embeddings)

    df = pd.DataFrame({
        "x": tsne_results[:, 0],
        "y": tsne_results[:, 1],
        "Faculty": faculty_codes,
        "Course": course_names
    })

    # Create interactive scatter plot, color coded by faculty
    fig = px.scatter(
        df,
        x="x",
        y="y",
        color="Faculty",
        hover_data=["Course", "Faculty"],
        title=title,
        color_discrete_sequence=px.colors.qualitative.Set3,
        labels={
            "x": "",
            "y": ""
        } # Hide axis labels, in a t-SNE diagram these are essentially arbitrary
    )

    # Customize the layout: centered title, legend heading, fixed figure size and a plain template
    fig.update_layout(
        title={
            "y":0.95,
            "x":0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": 20}
        },
        legend_title_text="Faculty Color Codes",
        width=1000,
        height=700,
        template="simple_white",
        hovermode="closest",
        # Similarly to the labels, there's no reason to show the axes in a t-SNE diagram
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, showline=False, ticks=""),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, showline=False, ticks=""),
    )

    # Customize hover template; customdata follows the order of hover_data above
    # (index 0 is the course, index 1 is the faculty)
    fig.update_traces(
        hovertemplate="<br>".join([
            "<b>Course:</b> %{customdata[0]}",
            "<b>Faculty:</b> %{customdata[1]}",
            "<extra></extra>"
        ]),
        marker=dict(size=8)
    )

    return fig

# How we choose to represent course names in the plot
def build_course_string(row):
    """Return the plot label for one course: "<emne_id> - <emne_navn_eng>".

    `row` is anything indexable by the keys "emne_id" and "emne_navn_eng",
    for example a DataFrame row.
    """
    course_id = row["emne_id"]
    english_name = row["emne_navn_eng"]
    return "{} - {}".format(course_id, english_name)

# Assemble the three parallel inputs for the plot, one entry per row of course_df.
# Stack the per-course embedding vectors into a single array.
course_embeddings = np.stack(course_df["embedding"])
# Translate each faculty code into its readable name (an unknown code raises KeyError).
course_faculties = (
    course_df["faculty_code"]
    .apply(lambda code: FACULTY_CODE_TO_NAME[code])
    .values
)
# Build the hover label for every course, row by row.
course_names = course_df.apply(build_course_string, axis=1).values

fig = plot_interactive_tsne(
    course_embeddings,
    course_faculties,
    course_names,
)
fig.show()


In [None]:
# Saving the plot to HTML handily retains the interactivity via JavaScript.
# Simply open the resulting file in any browser (except Firefox, at the time of writing).
fig.write_html("data/course_similarity.html")