In [3]:
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, acf, pacf, ccf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from typing import List, Tuple


In [6]:
def similarity_cosine(vec1, vec2):
    cosine_similarity_arr = []
    for v1,v2 in zip(vec1, vec2):
        cosine_similarity = np.dot(v1, v2)/(np.linalg.norm(v1)* np.linalg.norm(v2))
        cosine_similarity_arr.append(cosine_similarity)
    return np.array(cosine_similarity_arr)

def keep_words_with_underscore(input_string):
    # Define a regular expression pattern to match words with underscores
    pattern = r'\b\w*_[\w_]*\b'

    # Use re.findall to extract words that match the pattern
    matching_words = re.findall(pattern, input_string)

    # Join the matching words to form the final string
    result = ' '.join(matching_words)

    return result


def update_co_occurrences(word_year_list,word_co_occurrences):
    # Iterate through the words in the list
    word_list, year = word_year_list
    
    for word in word_list:
        # If the word is not already in the dictionary, add it with an empty list
        if word not in word_co_occurrences:
            word_co_occurrences[word] = {}
        
        # Add words from the list to the co-occurrence list for the current word
        for other_word in word_list:
            # if other_word != word and other_word not in word_co_occurrences[word]:
            #     word_co_occurrences[word].append(other_word)
            if other_word != word and other_word not in word_co_occurrences[word]:
                word_co_occurrences[word][other_word] = [year] 
            
            elif other_word != word and other_word in word_co_occurrences[word]:
                # word_co_occurrences[word][other_word][0] +=1
                word_co_occurrences[word][other_word].append(year)

In [7]:
concept_filtered_arr = np.load("files/overlapping_filtered_concepts.npy")
ngram_abstracts = np.load("files/ngram_abstracts.npy", mmap_mode="r")
saved_year_arr = np.load("files/year_arr.npy", mmap_mode="r")

print("Concepts which were tracked",concept_filtered_arr.shape)
print("Abstracts",ngram_abstracts.shape)
print("Year associated to abstract",saved_year_arr.shape)

phys_filtered_concept_dict = {k:1 for k in concept_filtered_arr}
ocurr_arr = []
for abstract, year in zip(ngram_abstracts, saved_year_arr):
    temp = keep_words_with_underscore(abstract)
    if temp.count(" ") > 0:
        temp = temp.split(" ") 
        temp = [s for s in temp if s in phys_filtered_concept_dict]
        ocurr_arr.append([list(filter(("_").__ne__, temp)),year])
                        
word_co_occurrences = {}

for word_list in tqdm(ocurr_arr):
    update_co_occurrences(word_list,word_co_occurrences)


Concepts which were tracked (12770,)
Abstracts (157821,)
Year associated to abstract (157821,)


100%|██████████| 152310/152310 [00:06<00:00, 21960.71it/s]


In [13]:
# class TimeSeriesDataset(Dataset):
#     def __init__(self, data, word_co_occurrences, year_arr, c_inx_arr, input_window_size = 5, output_window_size = 3, offset_to_current_year = 1):
#         self.train_window_data = data[:,-input_window_size-output_window_size-offset_to_current_year:-output_window_size-offset_to_current_year]
#         if offset_to_current_year == 0:
#             # self.label_window_data = data[:,-output_window_size:]
#             self.label_year_range = year_arr[-output_window_size:]
#         else: 
#             # self.label_window_data = data[:,-output_window_size-offset_to_current_year:-offset_to_current_year]
#             self.label_year_range = year_arr[-output_window_size-offset_to_current_year:-offset_to_current_year]

        
#         self.co_occur_concept_pair_arr = get_co_occur_concept_pair_after_year_arr(word_co_occurrences, first_occ_year=self.label_year_range[0], final_occ_year=self.label_year_range[-1])
#         self.c_inx_arr = c_inx_arr
#         self.input_window_size = input_window_size 
#         self.output_window_size = output_window_size 
#         self.offset_to_current_year = offset_to_current_year 

#     def __len__(self):
#         return 64*500

#     def __getitem__(self, idx):
        
#         if np.random.rand() < 0.5:
#             return self._get_positive_sample()
#         else:
#             return self._get_negative_sample()
    

#     def _get_positive_sample(self):
#         while True:
#             sampled_pairs = np.random.choice(len(self.co_occur_concept_pair_arr), size=1)
#             c_pair = self.co_occur_concept_pair_arr[sampled_pairs][0]
#             inx_0 = np.where(self.c_inx_arr == c_pair[0])[0]
#             inx_1 = np.where(self.c_inx_arr == c_pair[1])[0]
#             if inx_0.size > 0 and inx_1.size > 0:
#                 break
#         enc_0 = self.train_window_data[inx_0][0]
#         enc_1 = self.train_window_data[inx_1][0]
#         enc_01 = np.concatenate((enc_0, enc_1), axis=-1)
#         return torch.from_numpy(enc_01), torch.ones(1), torch.from_numpy(np.array([inx_0,inx_1]))

#     def _get_negative_sample(self):
#         while True:
#             sampled_pair = np.random.choice(self.train_window_data.shape[0], size=2)
#             if self.c_inx_arr[sampled_pair[1]] not in word_co_occurrences[self.c_inx_arr[sampled_pair[0]]]:
#                 break
#         inx_0 = np.where(self.c_inx_arr == self.c_inx_arr[sampled_pair[0]])[0]
#         inx_1 = np.where(self.c_inx_arr == self.c_inx_arr[sampled_pair[1]])[0]
#         enc_0 = self.train_window_data[inx_0][0]
#         enc_1 = self.train_window_data[inx_1][0]
#         enc_01 = np.concatenate((enc_0, enc_1), axis=-1)
#         return torch.from_numpy(enc_01), torch.zeros(1), torch.from_numpy(np.array([inx_0,inx_1]))
    
#     def _check_indexing(self):
#         if self.offset_to_current_year != 0 :
#             print(f"... {np.unique(saved_year_arr)[-self.input_window_size-self.output_window_size-self.offset_to_current_year-3:-self.input_window_size-self.output_window_size-self.offset_to_current_year]}",f" -> Training Window {np.unique(saved_year_arr)[-self.input_window_size-self.output_window_size-self.offset_to_current_year:-self.output_window_size-self.offset_to_current_year]}",f" <- {np.unique(saved_year_arr)[-self.output_window_size-self.offset_to_current_year:]}")
#             print(f"... {np.unique(saved_year_arr)[-self.output_window_size-self.offset_to_current_year-3:-self.output_window_size-self.offset_to_current_year]}",f" -> Label Window {np.unique(saved_year_arr)[-self.output_window_size-self.offset_to_current_year:-self.offset_to_current_year]}",f" <- {np.unique(saved_year_arr)[-self.offset_to_current_year:]}")
#         else:
#             print(f"... {np.unique(saved_year_arr)[-self.input_window_size-self.output_window_size-3:-self.input_window_size-self.output_window_size]}",f" -> Training Window {np.unique(saved_year_arr)[-self.input_window_size-self.output_window_size:-self.output_window_size]}",f" <- {np.unique(saved_year_arr)[-self.output_window_size:]}")
#             print(f"... {np.unique(saved_year_arr)[-self.output_window_size-3:-self.output_window_size]}",f" -> Label Window {np.unique(saved_year_arr)[-self.output_window_size:]}",f" <- {[]}")

class NovelSeriesDataset(Dataset):
    def __init__(self, data: np.ndarray, c_inx_arr: np.ndarray, input_window_size: int = 5):
        """
        Dataset for novel series data.

        Args:
            data (np.ndarray): The input data.
            c_inx_arr (np.ndarray): Array of concept indices.
            input_window_size (int, optional): Size of the input window. Defaults to 5.
        """
        self.train_window_data = data[:, -input_window_size:]
        self.c_inx_arr = c_inx_arr
        self.input_window_size = input_window_size

    def __len__(self) -> int:
        return 64 * 500

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        while True:
            sampled_pair = np.random.choice(self.train_window_data.shape[0], size=2)
            if self.c_inx_arr[sampled_pair[1]] not in word_co_occurrences[self.c_inx_arr[sampled_pair[0]]]:
                break
        inx_0 = np.where(self.c_inx_arr == self.c_inx_arr[sampled_pair[0]])[0]
        inx_1 = np.where(self.c_inx_arr == self.c_inx_arr[sampled_pair[1]])[0]
        enc_0 = self.train_window_data[inx_0][0]
        enc_1 = self.train_window_data[inx_1][0]
        enc_01 = np.concatenate((enc_0, enc_1), axis=-1)
        return torch.from_numpy(enc_01), torch.from_numpy(np.array([inx_0, inx_1]))

    def _check_indexing(self):
        print(f"Training Window: {np.unique(saved_year_arr)[-self.input_window_size:]}")

class TimeSeriesDataset(Dataset):
    def __init__(self, data: np.ndarray, word_co_occurrences: dict, year_arr: np.ndarray, c_inx_arr: np.ndarray, 
                 input_window_size: int = 5, output_window_size: int = 3, offset_to_current_year: int = 1):
        """
        Dataset for time series data.

        Args:
            data (np.ndarray): The input data.
            word_co_occurrences (dict): Dictionary of word co-occurrences.
            year_arr (np.ndarray): Array of years.
            c_inx_arr (np.ndarray): Array of concept indices.
            input_window_size (int, optional): Size of the input window. Defaults to 5.
            output_window_size (int, optional): Size of the output window. Defaults to 3.
            offset_to_current_year (int, optional): Offset to the current year. Defaults to 1.
        """
        self.train_window_data = data[:, -input_window_size-output_window_size-offset_to_current_year:-output_window_size-offset_to_current_year]
        self.label_year_range = (year_arr[-output_window_size:] if offset_to_current_year == 0 
                                 else year_arr[-output_window_size-offset_to_current_year:-offset_to_current_year])

        self.co_occur_concept_pair_arr = self.get_co_occur_concept_pair_after_year_arr(
            word_co_occurrences, self.label_year_range[0], self.label_year_range[-1])
        self.c_inx_arr = c_inx_arr
        self.input_window_size = input_window_size
        self.output_window_size = output_window_size
        self.offset_to_current_year = offset_to_current_year

    def __len__(self) -> int:
        return 64 * 5000

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        return (self._get_positive_sample() if np.random.rand() < 0.5 else self._get_negative_sample())

    def _get_positive_sample(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        while True:
            sampled_pairs = np.random.choice(len(self.co_occur_concept_pair_arr), size=1)
            c_pair = self.co_occur_concept_pair_arr[sampled_pairs][0]
            inx_0 = np.where(self.c_inx_arr == c_pair[0])[0]
            inx_1 = np.where(self.c_inx_arr == c_pair[1])[0]
            if inx_0.size > 0 and inx_1.size > 0:
                break
        enc_0 = self.train_window_data[inx_0][0]
        enc_1 = self.train_window_data[inx_1][0]
        enc_01 = np.concatenate((enc_0, enc_1), axis=-1)
        return torch.from_numpy(enc_01), torch.ones(1), torch.from_numpy(np.array([inx_0, inx_1]))

    def _get_negative_sample(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        while True:
            sampled_pair = np.random.choice(self.train_window_data.shape[0], size=2)
            if self.c_inx_arr[sampled_pair[1]] not in word_co_occurrences[self.c_inx_arr[sampled_pair[0]]]:
                break
        inx_0 = np.where(self.c_inx_arr == self.c_inx_arr[sampled_pair[0]])[0]
        inx_1 = np.where(self.c_inx_arr == self.c_inx_arr[sampled_pair[1]])[0]
        enc_0 = self.train_window_data[inx_0][0]
        enc_1 = self.train_window_data[inx_1][0]
        enc_01 = np.concatenate((enc_0, enc_1), axis=-1)
        return torch.from_numpy(enc_01), torch.zeros(1), torch.from_numpy(np.array([inx_0, inx_1]))

    def _check_indexing(self):
        print(f"Training Window: {self._get_years_range(-self.input_window_size-self.output_window_size-self.offset_to_current_year, -self.output_window_size-self.offset_to_current_year)}")
        print(f"Label Window: {self._get_years_range(-self.output_window_size-self.offset_to_current_year, -self.offset_to_current_year)}")

    def _get_years_range(self, start: int, end: int) -> np.ndarray:
        # return np.unique(saved_year_arr)[start:end]
    
        return (np.unique(saved_year_arr)[start:] if end == -0 
                                 else np.unique(saved_year_arr)[start:end])

    @staticmethod
    def get_co_occur_concept_pair_after_year_arr(word_co_occurrences: dict, first_occ_year: int, final_occ_year: int) -> np.ndarray:
        
        co_occur_concept_pair_arr = []
        for concept, v in word_co_occurrences.items():
            for co_concept, years in v.items():
                if np.min(years) >= first_occ_year and np.max(years)<=final_occ_year:
                    co_occur_concept_pair_arr.append([concept,co_concept])
        return np.array(co_occur_concept_pair_arr)

        



# Example usage:
num_samples_per_class = 32
num_features = 128
seq_length = 5
batch_size = 128

encoding_dat = np.load("c_encoding_arr.npy")
c_inx_arr = np.load("c_inx_arr.npy")
print("Representation Vectors for tracked concepts",encoding_dat.shape)
print("Concept associted with representation", c_inx_arr.shape)
scaler = RobustScaler()
reshaped_data = encoding_dat.reshape(-1, encoding_dat.shape[-1])  # Shape: (10000*31, 128)
normalized_data = scaler.fit_transform(reshaped_data)
encoding_data = normalized_data.reshape(encoding_dat.shape)

dataset = TimeSeriesDataset(data=encoding_data, word_co_occurrences=word_co_occurrences, year_arr=np.unique(saved_year_arr), 
                            c_inx_arr=c_inx_arr, input_window_size = 10, output_window_size = 3, offset_to_current_year = 3)
dataset._check_indexing()
print()
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

testing_dataset = TimeSeriesDataset(data=encoding_data, word_co_occurrences=word_co_occurrences, year_arr=np.unique(saved_year_arr), 
                            c_inx_arr=c_inx_arr, input_window_size = 10, output_window_size = 3, offset_to_current_year = 0)
testing_dataset._check_indexing()
testing_dataloader = DataLoader(testing_dataset, batch_size=batch_size, shuffle=True)

print()
novel_dataset = NovelSeriesDataset(data=encoding_data, c_inx_arr=c_inx_arr, input_window_size = 10)
novel_dataset._check_indexing()
novel_dataloader = DataLoader(novel_dataset, batch_size=batch_size, shuffle=True)



Representation Vectors for tracked concepts (12368, 31, 128)
Concept associted with representation (12368,)
Training Window: [2009 2010 2011 2012 2013 2014 2015 2016 2017 2018]
Label Window: [2019 2020 2021]

Training Window: [2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]
Label Window: [2022 2023 2024]

Training Window: [2015 2016 2017 2018 2019 2020 2021 2022 2023 2024]


In [31]:
# class MLP(nn.Module):
#     def __init__(self, input_dim):
#         super(MLP, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 256)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 64)
#         self.fc4 = nn.Linear(64, 1)
        
#         self.sigmoid = nn.Sigmoid()
    
#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
#         x = self.relu(x)
#         x = self.fc3(x)
#         x = self.relu(x)
#         x = self.fc4(x)
#         x = self.sigmoid(x)
#         return x
    
# # Define the model, loss function, optimizer, and scheduler
# input_dim = dataset.train_window_data.shape[1] * dataset.train_window_data.shape[2] * 2  # Flattened size * 2 for concatenated pairs
# model = MLP(input_dim=input_dim)

# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# scheduler = CosineAnnealingLR(optimizer, T_max=10)

# # Training the model with early stopping
# num_epochs = 50
# patience = 5
# best_val_loss = float('inf')
# early_stopping_counter = 0

# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     correct_train = 0
#     total_train = 0
    
#     for data, labels, _ in train_loader:
#         data = data.view(data.size(0), -1).float()  # Flatten the input data
#         labels = labels.float()
        
#         optimizer.zero_grad()
#         outputs = model(data)
        
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
        
#         running_loss += loss.item()
#         predicted = (outputs > 0.5).float()
#         total_train += labels.size(0)
#         correct_train += (predicted == labels).sum().item()
    
#     scheduler.step()
#     train_accuracy = 100 * correct_train / total_train
#     train_loss = running_loss / len(train_loader)

#     # Validation phase
#     model.eval()
#     running_val_loss = 0.0
#     correct_val = 0
#     total_val = 0
    
#     with torch.no_grad():
#         for data, labels, _ in val_loader:
#             data = data.view(data.size(0), -1).float()  # Flatten the input data
#             labels = labels.float()
#             outputs = model(data)
#             loss = criterion(outputs, labels)
#             running_val_loss += loss.item()
#             predicted = (outputs > 0.5).float()
#             total_val += labels.size(0)
#             correct_val += (predicted == labels).sum().item()
    
#     val_loss = running_val_loss / len(val_loader)
#     val_accuracy = 100 * correct_val / total_val

#     print(f'Epoch [{epoch+1}/{num_epochs}], '
#           f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
#           f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

#     # Early stopping
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         early_stopping_counter = 0
#         torch.save(model.state_dict(), 'best_model.pth')
#     else:
#         early_stopping_counter += 1
#         if early_stopping_counter >= patience:
#             print("Early stopping triggered")
#             break

Epoch [1/50], Train Loss: 0.6096, Train Accuracy: 65.98%, Val Loss: 0.5685, Val Accuracy: 71.39%
Epoch [2/50], Train Loss: 0.5494, Train Accuracy: 73.30%, Val Loss: 0.5295, Val Accuracy: 75.03%
Epoch [3/50], Train Loss: 0.5318, Train Accuracy: 74.49%, Val Loss: 0.5194, Val Accuracy: 74.83%
Epoch [4/50], Train Loss: 0.5242, Train Accuracy: 74.78%, Val Loss: 0.5320, Val Accuracy: 74.64%
Epoch [5/50], Train Loss: 0.5143, Train Accuracy: 75.38%, Val Loss: 0.5162, Val Accuracy: 75.20%
Epoch [6/50], Train Loss: 0.5014, Train Accuracy: 76.34%, Val Loss: 0.5112, Val Accuracy: 75.38%
Epoch [7/50], Train Loss: 0.4929, Train Accuracy: 76.71%, Val Loss: 0.5055, Val Accuracy: 75.23%
Epoch [8/50], Train Loss: 0.4963, Train Accuracy: 76.38%, Val Loss: 0.4894, Val Accuracy: 76.72%
Epoch [9/50], Train Loss: 0.4968, Train Accuracy: 76.28%, Val Loss: 0.4849, Val Accuracy: 77.61%
Epoch [10/50], Train Loss: 0.4969, Train Accuracy: 76.13%, Val Loss: 0.4873, Val Accuracy: 77.05%
Epoch [11/50], Train Loss: 0.

In [14]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.optim.lr_scheduler import CosineAnnealingLR
# from torch.utils.data import DataLoader, random_split
# import numpy as np
# from sklearn.preprocessing import RobustScaler
import logging

# Define the MLP model with enhancements
class MLP(nn.Module):
    def __init__(self, input_dim: int):
        """
        Initialize the MLP model.
        
        Args:
            input_dim (int): Dimension of the input features.
        """
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the model.
        
        Args:
            x (torch.Tensor): Input tensor.
        
        Returns:
            torch.Tensor: Output tensor.
        """
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x

# Define the training and validation functions
def train_one_epoch(model, train_loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for data, labels, _ in train_loader:
        data = data.view(data.size(0), -1).float()  # Flatten the input data
        labels = labels.float()
        
        optimizer.zero_grad()
        outputs = model(data)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        predicted = (outputs > 0.5).float()
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
    
    train_accuracy = 100 * correct_train / total_train
    train_loss = running_loss / len(train_loader)
    return train_loss, train_accuracy

def validate_one_epoch(model, val_loader, criterion):
    model.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val = 0
    
    with torch.no_grad():
        for data, labels, _ in val_loader:
            data = data.view(data.size(0), -1).float()  # Flatten the input data
            labels = labels.float()
            outputs = model(data)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            predicted = (outputs > 0.5).float()
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()
    
    val_loss = running_val_loss / len(val_loader)
    val_accuracy = 100 * correct_val / total_val
    return val_loss, val_accuracy

# Training loop with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=50, patience=5):
    best_val_loss = float('inf')
    early_stopping_counter = 0

    for epoch in range(num_epochs):
        train_loss, train_accuracy = train_one_epoch(model, train_loader, criterion, optimizer)
        val_loss, val_accuracy = validate_one_epoch(model, val_loader, criterion)

        scheduler.step()
        
        logging.info(f'Epoch [{epoch+1}/{num_epochs}], '
                     f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
                     f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                logging.info("Early stopping triggered")
                break

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Example usage
input_dim = dataset.train_window_data.shape[1] * dataset.train_window_data.shape[2] * 2  # Flattened size * 2 for concatenated pairs
model = MLP(input_dim=input_dim)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = CosineAnnealingLR(optimizer, T_max=10)

train_model(model, train_loader, val_loader, criterion, optimizer, scheduler)

2024-06-13 21:32:35,466 - INFO - Epoch [1/50], Train Loss: 0.5369, Train Accuracy: 73.99%, Val Loss: 0.5135, Val Accuracy: 75.62%
2024-06-13 21:33:22,465 - INFO - Epoch [2/50], Train Loss: 0.5196, Train Accuracy: 75.04%, Val Loss: 0.4962, Val Accuracy: 76.69%
2024-06-13 21:34:11,741 - INFO - Epoch [3/50], Train Loss: 0.5103, Train Accuracy: 75.69%, Val Loss: 0.4892, Val Accuracy: 76.90%
2024-06-13 21:35:00,300 - INFO - Epoch [4/50], Train Loss: 0.5044, Train Accuracy: 75.98%, Val Loss: 0.4827, Val Accuracy: 77.31%
2024-06-13 21:35:54,531 - INFO - Epoch [5/50], Train Loss: 0.4963, Train Accuracy: 76.61%, Val Loss: 0.4773, Val Accuracy: 77.58%
2024-06-13 21:36:44,152 - INFO - Epoch [6/50], Train Loss: 0.4913, Train Accuracy: 76.86%, Val Loss: 0.4718, Val Accuracy: 77.92%
2024-06-13 21:37:34,850 - INFO - Epoch [7/50], Train Loss: 0.4899, Train Accuracy: 76.84%, Val Loss: 0.4689, Val Accuracy: 78.18%
2024-06-13 21:38:24,698 - INFO - Epoch [8/50], Train Loss: 0.4872, Train Accuracy: 77.17%,

In [27]:
import torch
from sklearn.linear_model import LinearRegression
import numpy as np
import logging

# Define the cosine similarity function
# def similarity_cosine(vec1, vec2):
#     """
#     Compute the cosine similarity between two vectors.

#     Args:
#         vec1 (numpy.ndarray): First vector.
#         vec2 (numpy.ndarray): Second vector.

#     Returns:
#         float: Cosine similarity between vec1 and vec2.
#     """
#     cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
#     return cos_sim
def similarity_cosine(vec1, vec2):
    cosine_similarity_arr = []
    for v1,v2 in zip(vec1, vec2):
        cosine_similarity = np.dot(v1, v2)/(np.linalg.norm(v1)* np.linalg.norm(v2))
        cosine_similarity_arr.append(cosine_similarity)
    return np.array(cosine_similarity_arr)

def load_best_model(model, path='best_model.pth'):
    """
    Load the best model from the given path.

    Args:
        model (torch.nn.Module): The model to load the state dictionary into.
        path (str): Path to the saved model state dictionary.
    """
    model.load_state_dict(torch.load(path))

def evaluate_model(model, dataloader, encoding_dat, c_inx_arr):
    """
    Evaluate the model on the provided dataloader.

    Args:
        model (torch.nn.Module): The trained model.
        dataloader (torch.utils.data.DataLoader): DataLoader for the dataset.
        encoding_dat (numpy.ndarray): Encoded data.
        c_inx_arr (numpy.ndarray): Concept index array.

    Returns:
        float: Validation accuracy.
        numpy.ndarray: Indices of the samples.
        numpy.ndarray: Model outputs.
        numpy.ndarray: Ground truth labels.
        numpy.ndarray: Correct predictions.
    """
    model.eval()
    correct_val = 0
    total_val = 0

    indices = []
    outputs_list = []
    correct_indices = []
    labels_list = []

    with torch.no_grad():
        for data, labels, inx in dataloader:
            data = data.view(data.size(0), -1).float()  # Flatten the input data
            labels = labels.float()
            outputs = model(data)
            predicted = (outputs > 0.5).float()
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()
            
            # Collect indices, outputs, labels, and correct predictions
            indices.extend(inx.cpu().numpy())
            outputs_list.extend(outputs.cpu().numpy())
            labels_list.extend(labels.cpu().numpy())
            correct_indices.extend((predicted == labels).cpu().numpy())
    
    # Convert lists to numpy arrays
    indices = np.array(indices)
    outputs_list = np.array(outputs_list).flatten()
    labels_list = np.array(labels_list).flatten()
    correct_indices = np.array(correct_indices).flatten()

    val_accuracy = 100 * correct_val / total_val
    return val_accuracy, indices, outputs_list, labels_list, correct_indices

def test_model(model, dataloader):
    """
    Evaluate the model on the provided dataloader.

    Args:
        model (torch.nn.Module): The trained model.
        dataloader (torch.utils.data.DataLoader): DataLoader for the dataset.
        encoding_dat (numpy.ndarray): Encoded data.
        c_inx_arr (numpy.ndarray): Concept index array.

    Returns:
        float: Validation accuracy.
        numpy.ndarray: Indices of the samples.
        numpy.ndarray: Model outputs.
        numpy.ndarray: Ground truth labels.
        numpy.ndarray: Correct predictions.
    """
    model.eval()

    indices = []
    outputs_list = []  
    predicted_list = []  

    with torch.no_grad():
        for data, inx in dataloader:
            data = data.view(data.size(0), -1).float()  # Flatten the input data
            
            outputs = model(data)
            predicted = (outputs > 0.5).float()
            
            # Collect indices, outputs, labels, and correct predictions
            indices.extend(inx.cpu().numpy())
            outputs_list.extend(outputs.cpu().numpy())
            predicted_list.extend(predicted.cpu().numpy())
            
    # Convert lists to numpy arrays for sorting
    indices = np.array(indices)
    outputs_list = np.array(outputs_list).flatten()
    predicted_list = np.array(predicted_list).flatten()
    
    return indices, outputs_list, predicted_list

def analyze_novel_predictions(indices, outputs_list, predicted_list, encoding_dat, c_inx_arr):
    """
    Analyze the predictions and print correct predictions for co-occurrence and non-co-occurrence.

    Args:
        indices (numpy.ndarray): Indices of the samples.
        outputs_list (numpy.ndarray): Model outputs.
        labels_list (numpy.ndarray): Ground truth labels.
        correct_indices (numpy.ndarray): Correct predictions.
        encoding_dat (numpy.ndarray): Encoded data.
        c_inx_arr (numpy.ndarray): Concept index array.
    """
    sorted_indices = np.argsort(outputs_list)

    correct_0 = []
    correct_1 = []

    for i in sorted_indices:
        if predicted_list[i]:
            correct_1.append(indices[i])
        else:
            correct_0.append(indices[i])

    print("Correct predictions to have no co-occurrence:")
    print_correct_predictions(correct_0, encoding_dat, c_inx_arr, "no co-occurrence")

    print("\nCorrect predictions to have co-occurrence:")
    print_correct_predictions(correct_1, encoding_dat, c_inx_arr, "co-occurrence")

def analyze_predictions(indices, outputs_list, labels_list, correct_indices, encoding_dat, c_inx_arr):
    """
    Analyze the predictions and print correct predictions for co-occurrence and non-co-occurrence.

    Args:
        indices (numpy.ndarray): Indices of the samples.
        outputs_list (numpy.ndarray): Model outputs.
        labels_list (numpy.ndarray): Ground truth labels.
        correct_indices (numpy.ndarray): Correct predictions.
        encoding_dat (numpy.ndarray): Encoded data.
        c_inx_arr (numpy.ndarray): Concept index array.
    """
    sorted_indices = np.argsort(outputs_list)

    correct_0 = []
    correct_1 = []

    for i in sorted_indices:
        if correct_indices[i]:
            if labels_list[i] == 0:
                correct_0.append(indices[i])
            else:
                correct_1.append(indices[i])

    print("Correct predictions to have no co-occurrence:")
    print_correct_predictions(correct_0, encoding_dat, c_inx_arr, "no co-occurrence")

    print("\nCorrect predictions to have co-occurrence:")
    print_correct_predictions(correct_1, encoding_dat, c_inx_arr, "co-occurrence")

def print_correct_predictions(correct_list, encoding_dat, c_inx_arr, label):
    """
    Print the correct predictions with their slopes.

    Args:
        correct_list (list): List of correct predictions.
        encoding_dat (numpy.ndarray): Encoded data.
        c_inx_arr (numpy.ndarray): Concept index array.
        label (str): Label for the type of correct prediction (co-occurrence or no co-occurrence).
    """
    x = np.arange(31).reshape(-1, 1)
    lin_model = LinearRegression()

    for cnt, idx in enumerate(correct_list):
        sim = similarity_cosine(encoding_dat[idx[0]][0], encoding_dat[idx[1]][0])
        lin_model.fit(x, sim.reshape(-1, 1))
        slope = lin_model.coef_[0][0]
        print(c_inx_arr[idx[0]], c_inx_arr[idx[1]], np.round(slope, 3))
        if cnt == 5:
            break

In [20]:
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the best model
load_best_model(model)

# Final evaluation on the validation set
val_accuracy, indices, outputs_list, labels_list, correct_indices = evaluate_model(model, testing_dataloader, encoding_dat, c_inx_arr)

# Analyze predictions
analyze_predictions(indices, outputs_list, labels_list, correct_indices, encoding_dat, c_inx_arr)

# Print final validation accuracy
logging.info(f"\nValidation Accuracy: {val_accuracy:.2f}%")

2024-06-14 00:18:30,801 - INFO - 
Validation Accuracy: 76.20%


Correct predictions to have no co-occurrence:
['average_phonon_number'] ['quantum_pcp_conjecture'] -0.004
['memory_function'] ['boolean_algebra'] 0.003
['mechanical_resonance'] ['boolean_algebra'] -0.0
['network_coding'] ['electric_polarization'] -0.003
['linear_constraint'] ['ultracold_rubidium_atom'] -0.009
['expander_graph'] ['spin_wave_mode'] 0.003

Correct predictions to have co-occurrence:
['spectral_gap'] ['chaos_theory'] 0.004
['robust_quantum_information_processing'] ['schrodinger_equation'] -0.005
['classical_field_theory'] ['inner_product_space'] 0.003
['gauss_law'] ['governing_equation'] -0.003
['fermi_hubbard_model'] ['block_encoding'] -0.002
['fermi_hubbard_model'] ['block_encoding'] -0.002


In [29]:
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the best model
load_best_model(model)

# Final evaluation on the validation set
indices, outputs_list, predicted_list = test_model(model, novel_dataloader)

# Analyze predictions
analyze_novel_predictions(indices, outputs_list, predicted_list, encoding_dat, c_inx_arr)

# Print final validation accuracy
logging.info(f"\nValidation Accuracy: {val_accuracy:.2f}%")

2024-06-14 00:20:23,045 - INFO - 
Validation Accuracy: 76.20%


Correct predictions to have no co-occurrence:
['deterministic_computation'] ['single_cesium_atom'] -0.003
['orthogonal_array'] ['infrared_wavelength'] -0.003
['deterministic_computation'] ['signal_light'] -0.005
['quantum_homomorphic_encryption'] ['esr_measurement'] -0.008
['collective_atomic_excitation'] ['complexity_function'] -0.004
['structural_similarity'] ['molecular_bond'] -0.006

Correct predictions to have co-occurrence:
['superfluid_pairing'] ['metastable_helium_atom'] -0.003
['quantum_information_encoded'] ['single_phase'] -0.007
['quantum_engineering'] ['discrete_rotational_symmetry'] 0.003
['multivariate_polynomial'] ['volume_element'] 0.006
['linear_velocity'] ['aspect_ratio'] 0.011
['spectral_decomposition'] ['antiferromagnetic_interaction'] -0.006


In [17]:
# # Load the best model
# model.load_state_dict(torch.load('best_model.pth'))

# # Final evaluation on the validation set
# model.eval()
# correct_val = 0
# total_val = 0

# indices = []
# outputs_list = []
# correct_indices = []
# labels_list = []

# x = np.arange(31).reshape(-1, 1)
# lin_model = LinearRegression()

# with torch.no_grad():
#     for data, labels, inx in testing_dataloader:
#         data = data.view(data.size(0), -1).float()  # Flatten the input data
#         labels = labels.float()
#         outputs = model(data)
#         predicted = (outputs > 0.5).float()
#         total_val += labels.size(0)
#         correct_val += (predicted == labels).sum().item()
        
#         # Collect indices, outputs, labels, and correct predictions
#         indices.extend(inx.cpu().numpy())
#         outputs_list.extend(outputs.cpu().numpy())
#         labels_list.extend(labels.cpu().numpy())
#         correct_indices.extend((predicted == labels).cpu().numpy())
        

# # Convert lists to numpy arrays for sorting
# indices = np.array(indices)
# outputs_list = np.array(outputs_list).flatten()
# labels_list = np.array(labels_list).flatten()
# correct_indices = np.array(correct_indices).flatten()

# # Get sorted indices of the outputs
# sorted_indices = np.argsort(outputs_list)

# # Separate the indices of correct predictions into two categories
# correct_0 = []
# correct_1 = []

# for i in sorted_indices:
#     if correct_indices[i]:
#         if labels_list[i] == 0:
#             correct_0.append(indices[i])
#         else:
#             correct_1.append(indices[i])

# # Print indices of correct predictions
# print("Correct predictions to have no co-occurance:")
# for cnt,idx in enumerate(correct_0):
#     sim = similarity_cosine(encoding_dat[idx[0]][0],encoding_dat[idx[1]][0])
#     lin_model.fit(x, sim.reshape(-1, 1))
#     slope = lin_model.coef_[0][0]
#     print(c_inx_arr[idx[0]],c_inx_arr[idx[1]], np.round(slope,3))
#     if cnt ==5:
#         break

# print("\n Correct predictions to have co-occurance:")
# for cnt,idx in enumerate(correct_1):
#     sim = similarity_cosine(encoding_dat[idx[0]][0],encoding_dat[idx[1]][0])
#     lin_model.fit(x, sim.reshape(-1, 1))
#     slope = lin_model.coef_[0][0]
#     print(c_inx_arr[idx[0]],c_inx_arr[idx[1]], np.round(slope,3))
#     if cnt ==5:
#         break

# print(f"\nValidation Accuracy: {100 * correct_val / total_val:.2f}%")

In [37]:
# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Final evaluation on the validation set
model.eval()
correct_val = 0
total_val = 0

indices = []
outputs_list = []
predicted_list = []

with torch.no_grad():
    for data, inx in novel_dataloader:
        data = data.view(data.size(0), -1).float()  # Flatten the input data
        
        outputs = model(data)
        predicted = (outputs > 0.5).float()
        
        # Collect indices, outputs, labels, and correct predictions
        indices.extend(inx.cpu().numpy())
        outputs_list.extend(outputs.cpu().numpy())
        predicted_list.extend(predicted.cpu().numpy())
        
# Convert lists to numpy arrays for sorting
indices = np.array(indices)
outputs_list = np.array(outputs_list).flatten()
predicted_list = np.array(predicted_list).flatten()


# Get sorted indices of the outputs
sorted_indices = np.argsort(outputs_list)

# Separate the indices of correct predictions into two categories
correct_0 = []
correct_1 = []

for i in sorted_indices:
    if predicted_list[i]:
        correct_1.append(indices[i])
    else:
        correct_0.append(indices[i])

# Print indices of correct predictions
print(" Predictions to have no co-occurance:")
for cnt,idx in enumerate(correct_0):
    sim = similarity_cosine(encoding_dat[idx[0]][0],encoding_dat[idx[1]][0])
    lin_model.fit(x, sim.reshape(-1, 1))
    slope = lin_model.coef_[0][0]
    print(c_inx_arr[idx[0]],c_inx_arr[idx[1]], np.round(slope,3))
    if cnt ==5:
        break

print("\n Predictions to have co-occurance:")
for cnt,idx in enumerate(correct_1):
    sim = similarity_cosine(encoding_dat[idx[0]][0],encoding_dat[idx[1]][0])
    lin_model.fit(x, sim.reshape(-1, 1))
    slope = lin_model.coef_[0][0]
    print(c_inx_arr[idx[0]],c_inx_arr[idx[1]],np.round(slope,3))
    if cnt ==5:
        break

 Predictions to have no co-occurance:
['promise_problem'] ['photon_radiation'] -0.011
['jones_polynomial'] ['superradiant_emission'] -0.003
['quantum_adiabatic_algorithm'] ['excited_state_lifetime'] -0.002
['quantum_adiabatic_algorithm'] ['rydberg_excitons'] -0.001
['spin_orientation'] ['quantum_secret_sharing_scheme'] -0.001
['quantum_pcp_conjecture'] ['reflected_signal'] 0.004

 Predictions to have co-occurance:
['long_range_anisotropic_interaction'] ['deep_strong_coupling'] -0.0
['time_optimal_control'] ['harmonic_oscillator_mode'] 0.009
['transfer_protocol'] ['variational_quantum_state'] -0.001
['zitterbewegung_effect'] ['circular_ring'] -0.001
['electric_field_fluctuation'] ['relaxation_oscillation'] -0.007
['particle_number_fluctuation'] ['spinless_particle'] 0.008
