1. Write a script that calls the OpenAlex API with the query “climate change mitigation”. We focus on
the search outcomes for different time periods: 1850-1900; 1900-1920; 1921-1940; 1941-1960;
1961-1980; 1981-2000; 2001-2012; 2013-2024. For each time period, select the 30 results that have the
highest citation counts. For each result, create a metadata record that includes the title of the paper, its
abstract, the corresponding topic in the OpenAlex category, and the list of authors and their institutions. For
each time period, create a dataframe gathering all thirty records.

In [None]:
import requests
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import matplotlib.pyplot as plt
import glob
from collections import Counter
from itertools import cycle
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import random

# OpenAlex "works" endpoint and the search phrase sent with every request.
BASE_URL = "https://api.openalex.org/works"
query = "climate change mitigation"
# (start_year, end_year) publication windows; both years are included in the
# date filter built by fetch_openalex_data.
# NOTE(review): 1900 belongs to both of the first two windows, so a work
# published in 1900 can be returned for either period - confirm this is intended.
time_periods = [
    (1850, 1900),
    (1900, 1920),
    (1921, 1940),
    (1941, 1960),
    (1961, 1980),
    (1981, 2000),
    (2001, 2012),
    (2013, 2024)
]

# Pause inserted after each page request inside fetch_openalex_data.
# NOTE(review): the pause is per call, so several concurrent fetches can
# together exceed MAX_REQUESTS_PER_SECOND.
MAX_REQUESTS_PER_SECOND = 10
REQUEST_DELAY = 1 / MAX_REQUESTS_PER_SECOND

def fetch_openalex_data(start_year, end_year, query, limit=30):
    """
    Fetch the most-cited OpenAlex works matching a search phrase in a year range.

    Results are requested sorted by citation count (descending) and paged
    until `limit` works have been collected or the API runs out of results.

    Args:
        start_year: First publication year (inclusive, from January 1st).
        end_year: Last publication year (inclusive, to December 31st).
        query: Search phrase passed to the API's `search` parameter.
        limit: Maximum number of works to return.

    Returns:
        A list of raw work dictionaries as returned by the API, at most
        `limit` long. Network or HTTP errors are reported with print() and
        whatever was collected so far is returned.
    """
    collected_results = []
    page = 1
    # Ask only for what is needed, up to the 200-per-page size used before.
    per_page = max(1, min(200, limit))

    while len(collected_results) < limit:
        # Passing params lets requests URL-encode the values; the search
        # phrase contains spaces, which must not be pasted raw into the URL.
        params = {
            "search": query,
            "filter": (f"from_publication_date:{start_year}-01-01,"
                       f"to_publication_date:{end_year}-12-31"),
            "sort": "cited_by_count:desc",
            "per-page": per_page,
            "page": page,
        }

        try:
            # A timeout keeps a stalled connection from hanging a worker forever.
            response = requests.get(BASE_URL, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error: {exc} for period {start_year}-{end_year}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code} for period {start_year}-{end_year}")
            break

        results = response.json().get("results", [])
        if not results:
            print(f"No more results for period {start_year}-{end_year}. Collected {len(collected_results)} articles.")
            break

        collected_results.extend(results[:limit - len(collected_results)])
        if len(results) < per_page:
            # A short page is the last page; skip the pointless extra request.
            break
        page += 1
        time.sleep(REQUEST_DELAY)

    print(f"Fetched {len(collected_results)} articles for period {start_year}-{end_year}")
    return collected_results[:limit]

def reconstruct_abstract(abstract_inverted_index):
    """
    Rebuild abstract text from an inverted index.

    Args:
        abstract_inverted_index: Mapping of word -> list of integer positions
            at which the word occurs, or None / empty when there is no abstract.

    Returns:
        The words joined by single spaces in position order. Positions that
        no word claims are skipped. Returns "" when the index is missing,
        empty, or contains no positions at all.
    """
    if not abstract_inverted_index:
        return ""

    # position -> word; a dict tolerates gaps and words whose position list
    # is empty or None (taking max() of an empty list would raise).
    words_by_position = {}
    for word, positions in abstract_inverted_index.items():
        for position in positions or ():
            words_by_position[position] = word

    ordered_words = (words_by_position[position] for position in sorted(words_by_position))
    # Drop empty-string tokens so they do not produce double spaces.
    return " ".join(word for word in ordered_words if word)

def _display_name(entry):
    """Return entry["display_name"], or "" when the entry or the name is missing/None."""
    return (entry or {}).get("display_name") or ""


def process_metadata(results, limit=30):
    """
    Turn raw API work records into a fixed-size metadata DataFrame.

    Args:
        results: Iterable of work dictionaries as returned by the API.
        limit: Number of rows the returned DataFrame must have (default 30).

    Returns:
        A DataFrame with columns Title, Abstract, Category, Authors and
        Institutions. Works without a title are dropped. Category is the
        display name of the first listed concept; Institutions holds the
        first institution of each authorship ("" when none). When fewer than
        `limit` works remain, blank rows are appended so the frame always has
        `limit` rows - downstream statistics therefore see those blank rows.
    """
    data = []
    for item in results:
        title = item.get("title") or ""
        if not title:
            continue

        # `or` fallbacks also cover keys that are present but set to null.
        concepts = item.get("concepts") or []
        authorships = item.get("authorships") or []

        institutions = []
        for authorship in authorships:
            affiliated = authorship.get("institutions") or []
            institutions.append(_display_name(affiliated[0]) if affiliated else "")

        data.append({
            "Title": title,
            "Abstract": reconstruct_abstract(item.get("abstract_inverted_index") or {}),
            "Category": _display_name(concepts[0]) if concepts else "",
            "Authors": [_display_name(authorship.get("author")) for authorship in authorships],
            "Institutions": institutions,
        })

    df = pd.DataFrame(data[:limit])
    missing_rows = limit - len(df)
    if missing_rows > 0:
        empty_rows = pd.DataFrame([
            {"Title": "", "Abstract": "", "Category": "", "Authors": [], "Institutions": []}
            for _ in range(missing_rows)
        ])
        df = pd.concat([df, empty_rows], ignore_index=True)

    return df

def fetch_and_process_period(period):
    """
    Fetch one time period from the API and convert it to a metadata DataFrame.

    `period` is a (start_year, end_year) pair. Returns the processed
    DataFrame, or an empty DataFrame when nothing was retrieved.
    """
    first_year, last_year = period
    print(f"Fetching data for period: {first_year}-{last_year}")

    works = fetch_openalex_data(first_year, last_year, query)
    if not works:
        print(f"No data retrieved for period: {first_year}-{last_year}")
        return pd.DataFrame()

    return process_metadata(works)
# Fetch every period concurrently. Futures complete in arbitrary order, so the
# frames are collected first and then stored chronologically: later cells walk
# all_dataframes in insertion order to draw period-to-period paths.
fetched_frames = {}

with ThreadPoolExecutor(max_workers=8) as executor:
    future_to_period = {executor.submit(fetch_and_process_period, period): period for period in time_periods}

    for future in as_completed(future_to_period):
        period = future_to_period[future]
        start_year, end_year = period
        period_key = f"{start_year}-{end_year}"

        try:
            fetched_frames[period_key] = future.result()
            print(f"Data for period {period_key} saved.")
        except Exception as e:
            # Keep going: one failed period must not discard the others.
            print(f"An error occurred for period {period_key}: {e}")

all_dataframes = {}
for start_year, end_year in time_periods:
    period_key = f"{start_year}-{end_year}"
    if period_key in fetched_frames:
        all_dataframes[period_key] = fetched_frames[period_key]

# Persist each non-empty period so later cells can reload the data from disk.
for period, df in all_dataframes.items():
    if not df.empty:
        df.to_csv(f"climate_change_mitigation_{period}.csv", index=False)
        print(f"Saved data for period {period} to CSV.")


In [None]:
def calculate_abstract_availability(dataframes):
    """
    Compute, per period, the percentage of rows that carry an abstract.

    Args:
        dataframes: Mapping of period label -> DataFrame. A frame without an
            "Abstract" column (e.g. an empty DataFrame) counts as having no
            abstracts instead of raising KeyError.

    Returns:
        Mapping of period label -> percentage (rounded to 2 decimals) of rows
        whose abstract is neither missing (NaN/None) nor the empty string;
        0 for a frame with no rows. One summary line is printed per period.
    """
    availability_data = {}
    for period, df in dataframes.items():
        total_entries = len(df)
        if "Abstract" in df.columns:
            abstracts = df["Abstract"]
            # NaN appears when frames are reloaded from CSV; treat it as missing.
            available_abstracts = int((abstracts.notna() & (abstracts != "")).sum())
        else:
            available_abstracts = 0

        if total_entries > 0:
            percent_available = round((available_abstracts / total_entries) * 100, 2)
        else:
            percent_available = 0

        availability_data[period] = percent_available
        print(f"Period: {period}, Total Entries: {total_entries}, Available Abstracts: {available_abstracts}, Percentage: {percent_available}%")

    return availability_data

abstract_availability = calculate_abstract_availability(all_dataframes)
# Order the period labels chronologically by their starting year.
sorted_periods = sorted(abstract_availability.keys(), key=lambda x: int(x.split('-')[0]))
# Dedicated names: reusing `time_periods` here would replace the list of
# (start, end) tuples that the fetching cell needs when it is re-run.
availability_percentages = [abstract_availability[period] for period in sorted_periods]

plt.figure(figsize=(16, 9))
plt.grid(True, linestyle='--', alpha=0.5)
plt.plot(sorted_periods, availability_percentages, 'o-', color='royalblue', linewidth=2, markersize=8, label="Abstract Availability")

# Print each value just above its marker.
for period, percentage in zip(sorted_periods, availability_percentages):
    plt.text(period, percentage + 1, f"{percentage:.1f}%", ha='center', va='bottom', fontsize=12, color='darkblue')

plt.title("Percentage of Abstracts Available per Period", fontsize=16)
plt.xlabel("Time Period", fontsize=14)
plt.ylabel("Percentage of Available Abstracts", fontsize=14)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
# Derive the y-range from the data so low values are never cut off.
y_lower = max(0, min(availability_percentages, default=0) - 5)
y_upper = max(availability_percentages, default=100) + 5
plt.ylim(y_lower, y_upper)
plt.savefig("abstract_availability_chart.svg", dpi=300, bbox_inches='tight', format='svg')
plt.show()


In [None]:
# Collect the word count of every abstract stored in the per-period CSV files
# of the current directory, then plot their distribution as a histogram.
folder_path = './'
all_files = glob.glob(f"{folder_path}climate_change_mitigation_*.csv")
abstract_lengths = []

for filename in all_files:
    df = pd.read_csv(filename)
    if 'Abstract' not in df.columns:
        continue
    # Missing abstracts are read back as NaN and are left out of the counts.
    present_abstracts = df['Abstract'].dropna()
    abstract_lengths.extend(len(text.split()) for text in present_abstracts)

plt.figure(figsize=(16, 9))
plt.hist(abstract_lengths, bins=30, edgecolor='black')
plt.title('Distribution of Abstract Lengths Across All CSV Files (Same Directory)')
plt.xlabel('Abstract Length (words)')
plt.ylabel('Frequency')
plt.grid(axis='y')
plt.savefig("abstract_length_distribution.svg", dpi=300, bbox_inches='tight', format='svg')
plt.show()

2. Write a script that examines the categories associated with the documents of the same time period and
outputs the percentage of each category among the thirty outcomes. Then, for each category that
appears in more than one time period, draw a plot that shows the evolution of that category across
the different time periods (trace all categories on the same plot).

In [None]:
def calculate_category_distribution(df):
    """
    Compute the percentage share of each category within one period.

    Args:
        df: DataFrame for a single period; categories are read from its
            "Category" column.

    Returns:
        A Series indexed by category whose values are percentages summing to
        100. Blank categories ("") are counted like any other value. An empty
        Series is returned when the frame has no "Category" column (e.g. an
        empty DataFrame) instead of raising KeyError.
    """
    if "Category" not in df.columns:
        return pd.Series(dtype=float)
    return df["Category"].value_counts(normalize=True) * 100

# Category shares per period (frames without a Category column are skipped).
category_distributions = {}

for period, df in all_dataframes.items():
    if "Category" not in df.columns:
        print(f"No category data for period {period}; skipping.")
        continue
    category_distributions[period] = df["Category"].value_counts(normalize=True) * 100

sorted_periods = sorted(category_distributions.keys(), key=lambda x: int(x.split('-')[0]))

# Count in how many periods each category occurs; keep those seen more than
# once. Sorting makes the line order, markers and styles stable between runs.
category_counter = Counter(
    category
    for distribution in category_distributions.values()
    for category in set(distribution.index)
)
multi_period_categories = sorted(category for category, count in category_counter.items() if count > 1)

for period in sorted_periods:
    distribution = category_distributions[period]
    print(f"\nPeriod: {period}")
    print(distribution, "\n")

plt.figure(figsize=(16, 9))
plt.grid(True, linestyle='--', alpha=0.5)

markers = cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'X', 'P'])
styles = cycle(['-', '--', '-.', ':'])

for category in multi_period_categories:
    # Local name: the global `time_periods` / `percentages` are left untouched.
    category_shares = [category_distributions[period].get(category, 0) for period in sorted_periods]

    plt.plot(
        sorted_periods,
        category_shares,
        linestyle=next(styles),
        marker=next(markers),
        # An empty label would be silently omitted from the legend.
        label=category or "(no category)",
        linewidth=1.5
    )
    for period, share in zip(sorted_periods, category_shares):
        if share > 0:
            plt.annotate(
                f"{share:.1f}%",
                (period, share),
                textcoords="offset points",
                xytext=(0, 5),
                ha='center',
                fontsize=10
            )

plt.title("Evolution of Categories Over Time", fontsize=16)
plt.xlabel("Time Period", fontsize=14)
plt.ylabel("Percentage of Articles", fontsize=14)
plt.legend(title="Categories", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='medium', title_fontsize='medium')
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.savefig("category_evolution_over_time.svg", dpi=300, bbox_inches='tight', format='svg')
plt.show()


3. Now we want to explore the titles of the documents. For this purpose, suggest a script that gathers
all titles of the same time period, then uses DistilBERT to generate an embedding vector, then uses
t-SNE (see the Kaggle notebook “Visualizing Word Vectors with t-SNE”) to project the embedding into a 2D space,
so that all titles of the same time period are represented as a single 2D point. Finally, draw a plot
showing the evolution of the titles across the various time periods.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained uncased DistilBERT tokenizer and encoder; the encoder's
# weights are moved to `device`.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

def get_batch_embeddings(texts):
    """
    Generate one DistilBERT embedding per text for a batch of texts.

    Args:
        texts: List of strings; each is truncated to 512 tokens and the
            batch is padded to its longest member.

    Returns:
        A NumPy array with one row per text: the mean of the last hidden
        states over that text's real tokens. Padding positions are excluded
        via the attention mask, so a text's embedding does not depend on
        which other texts share its batch.
    """
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; the extra axis broadcasts over features.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).cpu().numpy()

def calculate_average_embedding(df):
    """
    Calculate the average embedding of all titles within a period.

    Args:
        df: DataFrame for one period with a "Title" column.

    Returns:
        A 1-D NumPy array: the mean of the per-title embeddings.

    Raises:
        ValueError: If the frame contains no non-blank title.
    """
    batch_size = 16
    # Blank titles come from the padding rows of short periods; embedding
    # them would pull the period mean towards the empty-text vector.
    titles = [
        title for title in df["Title"].dropna().tolist()
        if isinstance(title, str) and title.strip()
    ]
    if not titles:
        raise ValueError("No non-empty titles to embed for this period.")

    embeddings = [
        get_batch_embeddings(titles[start:start + batch_size])
        for start in range(0, len(titles), batch_size)
    ]
    return np.mean(np.vstack(embeddings), axis=0)

period_embeddings = {}

for period, df in all_dataframes.items():
    # Skip periods with no usable title instead of failing inside the encoder.
    if "Title" not in df.columns or not df["Title"].fillna("").astype(str).str.strip().ne("").any():
        print(f"No titles available for period {period}; skipping.")
        continue
    print(f"Calculating embeddings for period: {period}")
    avg_embedding = calculate_average_embedding(df)
    period_embeddings[period] = avg_embedding

# Chronological order matters: the dashed path below joins consecutive rows.
period_labels = sorted(period_embeddings.keys(), key=lambda x: int(x.split('-')[0]))
embedding_matrix = np.vstack([period_embeddings[label] for label in period_labels])

# t-SNE needs perplexity < number of samples (one sample per period here).
tsne = TSNE(n_components=2, init='random', random_state=42, perplexity=min(3, len(period_labels) - 1))
tsne_results = tsne.fit_transform(embedding_matrix)

plt.figure(figsize=(16, 9))

for i, period in enumerate(period_labels):
    plt.scatter(tsne_results[i, 0], tsne_results[i, 1], label=period, s=100)
    plt.text(tsne_results[i, 0] + 0.1, tsne_results[i, 1] + 0.1, period, fontsize=9)

# Join each period to the next one to show the trajectory over time.
for i in range(len(tsne_results) - 1):
    plt.plot([tsne_results[i, 0], tsne_results[i + 1, 0]],
             [tsne_results[i, 1], tsne_results[i + 1, 1]], 'k--', alpha=0.5)

plt.title("Evolution of Title Embeddings Across Time Periods")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend(title="Time Periods", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.savefig("title_embedding_evolution.svg", dpi=300, bbox_inches='tight', format='svg')
plt.show()

4. Repeat the process in 3), this time using all abstracts of the documents falling under the same time
period.

In [None]:
# Same setup statements as in the title-embedding cell; executing them again
# reloads the tokenizer and the encoder.
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

def get_batch_embeddings(texts):
    """
    Generate one DistilBERT embedding per text for a batch of texts.

    Args:
        texts: List of strings; each is truncated to 512 tokens and the
            batch is padded to its longest member.

    Returns:
        A NumPy array with one row per text: the mean of the last hidden
        states over that text's real tokens. Padding positions are excluded
        via the attention mask, so a text's embedding does not depend on
        which other texts share its batch.
    """
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; the extra axis broadcasts over features.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).cpu().numpy()

def calculate_average_embedding_abstracts(df):
    """
    Calculate the average embedding of all abstracts within a period.

    Args:
        df: DataFrame for one period with an "Abstract" column.

    Returns:
        A 1-D NumPy array: the mean of the per-abstract embeddings.

    Raises:
        ValueError: If the frame contains no non-blank abstract.
    """
    batch_size = 8
    # Blank abstracts (padding rows, works without an abstract) carry no
    # content; embedding them would skew the period mean.
    abstracts = [
        abstract for abstract in df["Abstract"].dropna().tolist()
        if isinstance(abstract, str) and abstract.strip()
    ]
    if not abstracts:
        raise ValueError("No non-empty abstracts to embed for this period.")

    embeddings = [
        get_batch_embeddings(abstracts[start:start + batch_size])
        for start in range(0, len(abstracts), batch_size)
    ]
    return np.mean(np.vstack(embeddings), axis=0)

period_embeddings_abstracts = {}

for period, df in all_dataframes.items():
    # Skip periods with no usable abstract instead of failing inside the encoder.
    if "Abstract" not in df.columns or not df["Abstract"].fillna("").astype(str).str.strip().ne("").any():
        print(f"No abstracts available for period {period}; skipping.")
        continue
    print(f"Calculating embeddings for abstracts in period: {period}")
    avg_embedding = calculate_average_embedding_abstracts(df)
    period_embeddings_abstracts[period] = avg_embedding

# Chronological order matters: the dashed path below joins consecutive rows.
period_labels_abstracts = sorted(period_embeddings_abstracts.keys(), key=lambda x: int(x.split('-')[0]))
embedding_matrix_abstracts = np.vstack([period_embeddings_abstracts[label] for label in period_labels_abstracts])
# t-SNE needs perplexity < number of samples (one sample per period here).
tsne_abstracts = TSNE(n_components=2, init='random', random_state=42,
                      perplexity=min(5, len(period_labels_abstracts) - 1))
tsne_results_abstracts = tsne_abstracts.fit_transform(embedding_matrix_abstracts)

plt.figure(figsize=(16, 9))

for i, period in enumerate(period_labels_abstracts):
    plt.scatter(tsne_results_abstracts[i, 0], tsne_results_abstracts[i, 1], label=period, s=100)
    plt.text(tsne_results_abstracts[i, 0] + 0.1, tsne_results_abstracts[i, 1] + 0.1, period, fontsize=9)

# Join each period to the next one to show the trajectory over time.
for i in range(len(tsne_results_abstracts) - 1):
    plt.plot([tsne_results_abstracts[i, 0], tsne_results_abstracts[i + 1, 0]],
             [tsne_results_abstracts[i, 1], tsne_results_abstracts[i + 1, 1]], 'k--', alpha=0.5)

plt.title("Evolution of Abstract Embeddings Across Time Periods")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend(title="Time Periods", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
# Saved like every other figure in this notebook.
plt.savefig("abstract_embedding_evolution.svg", dpi=300, bbox_inches='tight', format='svg')
plt.show()


5. Now we want to understand the diversity of the documents falling in the same period. For this
purpose, use the DistilBERT embedding of each abstract, followed by a t-SNE decomposition for a 2D
representation, and draw a plot for each time period highlighting the distribution of abstracts in the
2D plot (each abstract corresponds to a single point). At the same time, draw the distribution of
categories over the same time period. Comment on whether there are analogies between the embedding-based
representation and the category-based representation.

In [None]:
def get_batch_embeddings(texts):
    """
    Generate one DistilBERT embedding per text for a batch of texts.

    Args:
        texts: List of strings; each is truncated to 512 tokens and the
            batch is padded to its longest member.

    Returns:
        A NumPy array with one row per text: the mean of the last hidden
        states over that text's real tokens. Padding positions are excluded
        via the attention mask, so a text's embedding does not depend on
        which other texts share its batch.
    """
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; the extra axis broadcasts over features.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).cpu().numpy()

def get_abstract_embeddings(df):
    """
    Embed every abstract of a period and return the matching categories.

    Args:
        df: DataFrame with "Abstract" and "Category" columns.

    Returns:
        (embeddings, categories): a 2-D array with one row per non-missing
        abstract, and an array holding the category of exactly those rows,
        in the same order. Empty-string abstracts are still embedded; filter
        the frame beforehand to leave them out.

    Raises:
        ValueError: If the frame has no non-missing abstract.
    """
    batch_size = 8
    # Select abstracts and categories with the same mask so row i of the
    # embeddings always belongs to categories[i], even when some abstracts
    # are missing in the middle of the frame.
    has_abstract = df["Abstract"].notna()
    abstracts = df.loc[has_abstract, "Abstract"].tolist()
    categories = df.loc[has_abstract, "Category"].values
    if not abstracts:
        raise ValueError("No abstracts to embed for this period.")

    embeddings = np.vstack([
        get_batch_embeddings(abstracts[start:start + batch_size])
        for start in range(0, len(abstracts), batch_size)
    ])
    return embeddings, categories

for period, df in all_dataframes.items():
    print(f"Processing period: {period}")

    if not {"Abstract", "Category"}.issubset(df.columns):
        print(f"No data for period {period}; skipping.")
        continue

    # Keep only rows with real abstract text: blank padding rows would show
    # up as a cluster of identical points and distort the category shares.
    has_abstract = df["Abstract"].fillna("").astype(str).str.strip().ne("")
    period_df = df[has_abstract]
    if len(period_df) < 3:
        print(f"Too few abstracts ({len(period_df)}) for a 2D projection in period {period}; skipping.")
        continue

    embeddings, categories = get_abstract_embeddings(period_df)
    n_samples, n_features = embeddings.shape
    # PCA cannot keep more components than samples or features, and t-SNE
    # needs perplexity < number of samples.
    pca = PCA(n_components=min(20, n_samples, n_features), random_state=42)
    reduced_embeddings = pca.fit_transform(embeddings)
    tsne = TSNE(n_components=2, init='pca', random_state=42, perplexity=min(5, n_samples - 1))
    tsne_results = tsne.fit_transform(reduced_embeddings)

    category_counts = Counter(categories)
    total_abstracts = len(categories)
    category_percentages = {cat: (count / total_abstracts) * 100 for cat, count in category_counts.items()}

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 14), gridspec_kw={'height_ratios': [3, 1]})
    # One color per category, shared by the scatter plot and the bar chart.
    unique_categories = list(category_counts.keys())
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_categories)))
    color_map = {cat: colors[i] for i, cat in enumerate(unique_categories)}

    for i, (x, y) in enumerate(tsne_results):
        category = categories[i]
        ax1.scatter(x, y, color=color_map[category], s=50, alpha=0.7)

    # Empty scatters exist only to create one legend entry per category.
    for category, color in color_map.items():
        ax1.scatter([], [], color=color, label=category, s=50)

    ax1.set_title(f"Abstract Embedding Distribution for {period}")
    ax1.set_xlabel("t-SNE Dimension 1")
    ax1.set_ylabel("t-SNE Dimension 2")
    ax1.legend(title="Categories", bbox_to_anchor=(1.05, 1), loc='upper left')
    ax1.grid(True)
    ax2.bar(
        category_percentages.keys(),
        category_percentages.values(),
        color=[color_map[cat] for cat in category_percentages.keys()],
        alpha=0.7
    )
    ax2.set_title(f"Category Distribution for {period}")
    ax2.set_ylabel("Percentage of Abstracts")
    ax2.set_xticks(range(len(category_percentages)))
    ax2.set_xticklabels(category_percentages.keys(), rotation=45, ha="right")

    plt.savefig(f"abstract_embedding_distribution_{period}.svg", dpi=300, bbox_inches='tight', format='svg')
    plt.show()


6. We want to assess the compatibility of the category results output by OpenAlex with an abstract
topic-based analysis. For this purpose, given the set of all abstracts falling in the same time period, with the
corresponding embedding as computed in Task 4), create a DistilBERT embedding vector for each
category title as described in OpenAlex, and then write a program that computes the cosine
similarity between each category title embedding and the abstracts embedding, so that the category that
yields the highest cosine similarity score is assigned to the abstracts. Comment on whether the compatibility
is feasible when taking the overall set of abstracts.

In [None]:
def get_embedding(text):
    """
    Embed one text with DistilBERT and return the result as a flat vector.

    The text is tokenized (truncated to 512 tokens), run through the encoder
    without gradient tracking, and the last hidden states are averaged over
    the token axis.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy().flatten()

def get_category_embeddings(unique_categories):
    """
    Map every category title to its DistilBERT embedding.

    Returns a dict keyed by category title, in the order the titles are given.
    """
    return {title: get_embedding(title) for title in unique_categories}

def assign_category_to_abstracts(abstract_embeddings, category_embeddings):
    """
    Pick, for every abstract embedding, the category with the highest cosine similarity.

    Returns a list of (category_name, similarity) tuples, one per abstract,
    in the order of `abstract_embeddings`.
    """
    labels = list(category_embeddings)
    label_matrix = np.array(list(category_embeddings.values()))

    matches = []
    for vector in abstract_embeddings:
        scores = cosine_similarity([vector], label_matrix)[0]
        winner = np.argmax(scores)
        matches.append((labels[winner], scores[winner]))
    return matches

compatibility_results = {}

for period, df in all_dataframes.items():
    print(f"\nProcessing period: {period}")

    # Only rows with both abstract text and a category can be compared.
    # Blank padding rows would otherwise be embedded as empty text and
    # matched against the blank category, inflating the score.
    if {"Abstract", "Category"}.issubset(df.columns):
        has_abstract = df["Abstract"].fillna("").astype(str).str.strip().ne("")
        has_category = df["Category"].fillna("").astype(str).str.strip().ne("")
        valid_df = df[has_abstract & has_category]
    else:
        valid_df = df.iloc[0:0]

    if valid_df.empty:
        print(f"No abstracts with a category for period {period}; compatibility set to 0.")
        compatibility_results[period] = 0.0
        continue

    abstract_embeddings, actual_categories = get_abstract_embeddings(valid_df)

    # Candidate labels: every non-blank category seen in this period.
    unique_categories = [
        category for category in df["Category"].dropna().unique()
        if isinstance(category, str) and category.strip()
    ]
    category_embeddings = get_category_embeddings(unique_categories)
    assigned_categories = assign_category_to_abstracts(abstract_embeddings, category_embeddings)
    compatible_count = sum(
        1 for actual_cat, (assigned_cat, _) in zip(actual_categories, assigned_categories)
        if actual_cat == assigned_cat
    )
    compatibility_ratio = compatible_count / len(abstract_embeddings) * 100
    compatibility_results[period] = compatibility_ratio

    print(f"Compatibility of OpenAlex categories with topic-based assignment for period {period}:")
    print(f"Percentage of abstracts where assigned category matches OpenAlex category: {compatibility_ratio:.2f}%")
    print("Examples of assigned categories with highest cosine similarity:")
    for i in range(min(5, len(assigned_categories))):
        print(f"Abstract {i+1}: Actual Category = {actual_categories[i]}, Assigned Category = {assigned_categories[i][0]}, Similarity Score = {assigned_categories[i][1]:.4f}")
    print("\n" + "="*60)

print("\nSummary of Compatibility Ratios Across Periods:")
for period, ratio in compatibility_results.items():
    print(f"{period}: {ratio:.2f}%")

7. Repeat 6) when considering individual abstracts instead of the concatenated list of all abstracts. 

In [None]:
def get_embedding(text):
    """
    Return the DistilBERT embedding of a single text as a 1-D array.

    Tokenizes the text (at most 512 tokens), runs the encoder with gradients
    disabled and averages the last hidden states over the token axis.
    """
    model_inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        encoder_output = model(**model_inputs)
    token_mean = encoder_output.last_hidden_state.mean(dim=1)
    as_array = token_mean.cpu().numpy()
    return as_array.flatten()

def get_category_embeddings(unique_categories):
    """
    Build a {category title: embedding} dictionary for the given titles.

    Each title is embedded on its own with get_embedding; key order follows
    the input order.
    """
    embeddings_by_title = {}
    for title in unique_categories:
        embeddings_by_title[title] = get_embedding(title)
    return embeddings_by_title

def assign_category_to_individual_abstract(abstract_embedding, category_embeddings):
    """
    Find the category whose embedding is most cosine-similar to one abstract.

    Returns a (category_name, similarity) tuple for the best-scoring category.
    """
    labels = list(category_embeddings)
    label_matrix = np.array(list(category_embeddings.values()))
    scores = cosine_similarity([abstract_embedding], label_matrix)[0]
    top = np.argmax(scores)
    return labels[top], scores[top]

compatibility_results_individual = {}

for period, df in all_dataframes.items():
    print(f"\nProcessing period: {period}")
    abstract_embeddings = []
    actual_categories = []
    unique_categories = []

    if {"Abstract", "Category"}.issubset(df.columns):
        for _, row in df.iterrows():
            abstract, category = row["Abstract"], row["Category"]
            # Skip rows without real abstract text or without a category:
            # a blank abstract would trivially match the blank category.
            if (isinstance(abstract, str) and abstract.strip()
                    and isinstance(category, str) and category.strip()):
                abstract_embeddings.append(get_embedding(abstract))
                actual_categories.append(category)

        # Candidate labels: every non-blank category seen in this period.
        unique_categories = [
            category for category in df["Category"].dropna().unique()
            if isinstance(category, str) and category.strip()
        ]

    category_embeddings = get_category_embeddings(unique_categories)
    assigned_categories = [
        assign_category_to_individual_abstract(abstract_embedding, category_embeddings)
        for abstract_embedding in abstract_embeddings
    ]

    compatible_count = sum(
        1 for actual_cat, (assigned_cat, _) in zip(actual_categories, assigned_categories)
        if actual_cat == assigned_cat
    )
    compatibility_ratio = compatible_count / len(abstract_embeddings) * 100 if abstract_embeddings else 0
    compatibility_results_individual[period] = compatibility_ratio

    print(f"Compatibility of OpenAlex categories with topic-based assignment for period {period} (individual abstracts):")
    print(f"Percentage of abstracts where assigned category matches OpenAlex category: {compatibility_ratio:.2f}%")
    print("Examples of assigned categories with highest cosine similarity:")
    # Sort actual and assigned categories together so every printed line
    # describes one and the same abstract.
    sorted_examples = sorted(
        zip(actual_categories, assigned_categories),
        key=lambda pair: pair[1][1],
        reverse=True
    )
    for i, (actual_category, (assigned_category, similarity_score)) in enumerate(sorted_examples[:5]):
        print(f"Abstract {i+1}: Actual Category = {actual_category}, Assigned Category = {assigned_category}, Similarity Score = {similarity_score:.4f}")
    print("\n" + "="*60)

print("\nSummary of Compatibility Ratios Across Periods (Individual Abstracts):")
for period, ratio in compatibility_results_individual.items():
    print(f"{period}: {ratio:.2f}%")

8. Repeat 6) and 7) using word2vec instead of DistilBERT embeddings. Summarize the
findings in a table highlighting the performance of the various embeddings and data processing pipelines.

In [None]:
# Fix the random generators so repeated runs train the same model.
random.seed(11)
np.random.seed(11)

# One whitespace-tokenized "sentence" per abstract across all periods; frames
# without an Abstract column (e.g. empty frames) contribute nothing.
corpus = [
    abstract.split()
    for df in all_dataframes.values()
    if "Abstract" in df.columns
    for abstract in df["Abstract"].dropna()
]
# workers=1: with several worker threads the training order varies between
# runs, so the fixed seed alone would not give reproducible vectors.
word2vec_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2, workers=1, seed=42)

def get_word2vec_embedding(text, model):
    """
    Average the Word2Vec vectors of the whitespace-separated words of `text`.

    Words missing from the model vocabulary are ignored; when no word is
    known, a zero vector of length `model.vector_size` is returned.
    """
    known_vectors = []
    for token in text.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def get_category_embeddings_word2vec(unique_categories, model):
    """
    Map every category title to its averaged Word2Vec embedding.

    Returns a dict keyed by category title, in the order the titles are given.
    """
    return {title: get_word2vec_embedding(title, model) for title in unique_categories}

def assign_category(abstract_embedding, category_embeddings):
    """
    Return the category most cosine-similar to the given embedding.

    The result is a (category_name, similarity) tuple for the best match.
    """
    labels = list(category_embeddings)
    label_matrix = np.array(list(category_embeddings.values()))
    scores = cosine_similarity([abstract_embedding], label_matrix)[0]
    top = np.argmax(scores)
    return labels[top], scores[top]

# Rows of the Word2Vec part of the summary table, appended per period.
word2vec_results = []

def process_period_word2vec(period, df):
    """
    Compute Word2Vec-based category compatibility for one period.

    Two rows are appended to the module-level `word2vec_results` list:

    * "Concatenated": all abstracts of the period are joined into one text,
      the most similar category is assigned to it, and the score is the
      percentage of the period's abstracts whose OpenAlex category equals
      that single period-level category.
    * "Individual": each abstract is assigned its own most similar category;
      the score is the percentage of abstracts for which it equals the
      OpenAlex category.

    Only rows with both abstract text and a category take part. When no such
    row exists, only the "Individual" row is appended, with a score of 0.

    Args:
        period: Period label stored in the result rows.
        df: DataFrame of the period with "Abstract" and "Category" columns.
    """
    print(f"Processing period: {period} with Word2Vec")

    labelled_abstracts = []
    if {"Abstract", "Category"}.issubset(df.columns):
        labelled_abstracts = [
            (abstract, category)
            for abstract, category in zip(df["Abstract"], df["Category"])
            if isinstance(abstract, str) and abstract.strip()
            and isinstance(category, str) and category.strip()
        ]

    if not labelled_abstracts:
        word2vec_results.append({
            "Period": period,
            "Embedding": "Word2Vec",
            "Method": "Individual",
            "Compatibility (%)": 0
        })
        return

    # Candidate labels in first-seen order, embedded once for both methods.
    unique_categories = list(dict.fromkeys(category for _, category in labelled_abstracts))
    category_embeddings = get_category_embeddings_word2vec(unique_categories, word2vec_model)
    total = len(labelled_abstracts)

    # Concatenated: one category for the whole period. Comparing it with the
    # candidate list itself would always give exactly one hit, so it is
    # compared with the category of every abstract instead.
    concatenated_abstracts = " ".join(abstract for abstract, _ in labelled_abstracts)
    concat_embedding = get_word2vec_embedding(concatenated_abstracts, word2vec_model)
    period_category, _ = assign_category(concat_embedding, category_embeddings)
    concat_matches = sum(1 for _, category in labelled_abstracts if category == period_category)
    word2vec_results.append({
        "Period": period,
        "Embedding": "Word2Vec",
        "Method": "Concatenated",
        "Compatibility (%)": concat_matches / total * 100
    })

    # Individual: one category per abstract.
    individual_matches = 0
    for abstract, category in labelled_abstracts:
        abstract_embedding = get_word2vec_embedding(abstract, word2vec_model)
        assigned_category, _ = assign_category(abstract_embedding, category_embeddings)
        if assigned_category == category:
            individual_matches += 1
    word2vec_results.append({
        "Period": period,
        "Embedding": "Word2Vec",
        "Method": "Individual",
        "Compatibility (%)": individual_matches / total * 100
    })
for period, df in all_dataframes.items():
    process_period_word2vec(period, df)

# Bring the DistilBERT results of the earlier cells into the same row format.
distilbert_concat_results = [{"Period": period, "Embedding": "DistilBERT", "Method": "Concatenated", "Compatibility (%)": ratio} for period, ratio in compatibility_results.items()]
distilbert_individual_results = [{"Period": period, "Embedding": "DistilBERT", "Method": "Individual", "Compatibility (%)": ratio} for period, ratio in compatibility_results_individual.items()]

word2vec_df = pd.DataFrame(word2vec_results)
distilbert_concat_df = pd.DataFrame(distilbert_concat_results)
distilbert_individual_df = pd.DataFrame(distilbert_individual_results)
combined_df = pd.concat([word2vec_df, distilbert_concat_df, distilbert_individual_df], ignore_index=True)

print("\nSummary Table of Compatibility Ratios for DistilBERT vs Word2Vec:")
print(combined_df)
avg_compatibility = combined_df.groupby(["Embedding", "Method"])["Compatibility (%)"].mean()
print("\nAverage Compatibility Ratios for Each Embedding and Method:")
print(avg_compatibility)
# One row per period, one column per (embedding, method) combination.
results_df_pivot = combined_df.pivot(index="Period", columns=["Embedding", "Method"], values="Compatibility (%)")

# DataFrame.plot opens its own figure; creating another one with plt.figure()
# beforehand would only leave an empty extra figure on screen.
results_df_pivot.plot(kind="bar", figsize=(12, 8), width=0.8)
plt.title("Compatibility Ratios for DistilBERT vs Word2Vec by Period and Method")
plt.ylabel("Compatibility (%)")
plt.xlabel("Period")
plt.xticks(rotation=45)
plt.legend(title="Embedding & Method")
plt.savefig("compatibility_ratios_distilbert_vs_word2vec.svg", dpi=300, bbox_inches='tight', format='svg')
plt.show()
