# Imports

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from typing import List, Dict, Tuple

# Macros

In [2]:
DEVICE = 'cuda'

PREPROCESS_DATA_BASE = 'dataset/preprocessing'

QUESTIONS_FILE_NAME = 'questions.csv'
SOLUTIONS_FILE_NAME = 'solutions.csv'

# Dataloading and Preprocessing

In [3]:
pp_qs = pd.read_csv(os.path.join(PREPROCESS_DATA_BASE, QUESTIONS_FILE_NAME))
pp_sols = pd.read_csv(os.path.join(PREPROCESS_DATA_BASE, SOLUTIONS_FILE_NAME))

In [4]:
def encode_numeric_cols(df: pd.DataFrame) -> Tuple[Dict[str, int], Dict[int, str]]:
    df_cols = df.columns
    df_to_numeric_cols = {df_cols[i]: i for i in range(len(df_cols))}
    df_to_str_cols = dict(zip(df_to_numeric_cols.values(), df_to_numeric_cols.keys()))
    return df_to_numeric_cols, df_to_str_cols

In [5]:
# convert columm names to numeric values
pp_qs_to_numeric_cols, pp_qs_to_str_cols = encode_numeric_cols(pp_qs)

pp_sols_to_numeric_cols, pp_sols_to_str_cols = encode_numeric_cols(pp_sols)

In [6]:
print(pp_qs_to_numeric_cols)
print(pp_qs_to_str_cols)
print()
print(pp_sols_to_numeric_cols)
print(pp_sols_to_str_cols)

{'Is n (number of features) >> p (number of class labels)': 0, ' Are there outliers?': 1, ' Are features on vastly different scales?': 2, ' Is the data collected from multiple sources or devices and grouped together?': 3}
{0: 'Is n (number of features) >> p (number of class labels)', 1: ' Are there outliers?', 2: ' Are features on vastly different scales?', 3: ' Is the data collected from multiple sources or devices and grouped together?'}

{'PCA': 0, 'Correlation_Based_Feature_Selection': 1, 'Standard_Scaling': 2, 'Per_Batch_Standard_Scaling': 3}
{0: 'PCA', 1: 'Correlation_Based_Feature_Selection', 2: 'Standard_Scaling', 3: 'Per_Batch_Standard_Scaling'}


In [7]:
pp_qs = pp_qs.rename(columns=pp_qs_to_numeric_cols)
pp_sols = pp_sols.rename(columns=pp_sols_to_numeric_cols)

In [8]:
def df_pretty_print(df: pd.DataFrame, df_title: str):
    print('-' * 50)
    print(df_title.upper())
    print()
    print(df)
    print('-' * 50)
    print()

In [9]:
df_pretty_print(pp_qs, "Preprocessing Questions")
df_pretty_print(pp_sols, "Preprocessing Solutions")

--------------------------------------------------
PREPROCESSING QUESTIONS

    0  1  2  3
0   1  1  1  1
1   1  1  1  0
2   1  1  0  1
3   1  0  1  1
4   0  1  1  1
5   1  1  0  0
6   1  0  1  0
7   0  1  1  0
8   1  0  0  1
9   0  1  0  1
10  0  0  1  1
11  1  0  0  0
12  0  1  0  0
13  0  0  1  0
14  0  0  0  1
--------------------------------------------------

--------------------------------------------------
PREPROCESSING SOLUTIONS

    0  1  2  3
0   0  1  1  1
1   0  1  1  0
2   0  1  0  1
3   1  0  1  1
4   0  0  1  1
5   0  1  0  0
6   1  0  1  0
7   0  0  1  0
8   1  0  0  1
9   0  0  0  1
10  0  0  1  1
11  1  1  0  0
12  0  1  0  0
13  0  0  1  0
14  0  0  0  1
--------------------------------------------------



# Define our Data Dict

In [10]:
def get_data_dict(qs: pd.DataFrame, sols: pd.DataFrame) -> Dict[Tuple, torch.Tensor]:
    qs_numpy = qs.to_numpy()
    sols_numpy = sols.to_numpy()
    assert qs_numpy.shape[0] == sols_numpy.shape[0]
    
    data_dict = {}
    
    for i, q in enumerate(qs_numpy):
        data_dict[tuple(q)] = torch.from_numpy(sols_numpy[i]).float()
        
    return data_dict

In [11]:
pp_data_dict = get_data_dict(pp_qs, pp_sols)

# Define Model Somehow??

In [12]:
def shuffle_dict(dict: Dict):
    keys = list(dict.keys())
    num_keys = len(keys)
    perm = torch.randperm(num_keys)
    return {keys[perm[i]]: dict[keys[perm[i]]] for i in range(num_keys)}

In [13]:
class ModSeek(torch.nn.Module):
    def __init__(self, num_qs: int, num_sols: int):
        super().__init__()
        
        self.num_qs = num_qs
        self.num_sols = num_sols
        self.W_yes = torch.nn.Parameter(torch.randn(num_qs, num_sols))
        self.W_no = torch.nn.Parameter(torch.randn(num_qs, num_sols))
    
    def forward(self, answered_qs: Tuple):
        init_prob = 1 / self.num_sols
        
        p = torch.full((self.num_sols,), init_prob)
        z = torch.log(p)
        for i, answered in enumerate(answered_qs):
            if answered == 1:
                z = z + self.W_yes[i]
            else:
                z = z + self.W_no[i]
        
        return z
                    
                

In [14]:
mod_seek = ModSeek(len(pp_data_dict), len(pp_data_dict[list(pp_data_dict.keys())[0]]))
optimizer = torch.optim.Adam(mod_seek.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()
num_epochs = 1000


In [15]:
mod_seek.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for q in pp_data_dict:
        optimizer.zero_grad()
        probs = torch.sigmoid(mod_seek(q))
        loss = criterion(probs, pp_data_dict[q])
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch}: loss = {total_loss / len(pp_data_dict)}")


Epoch 0: loss = 0.4022134363651276
Epoch 1: loss = 0.40052655041217805
Epoch 2: loss = 0.39893012344837187
Epoch 3: loss = 0.39732459584871926
Epoch 4: loss = 0.39570443431536356
Epoch 5: loss = 0.3940690070390701
Epoch 6: loss = 0.39241829911867776
Epoch 7: loss = 0.3907523175080617
Epoch 8: loss = 0.3890709469715754
Epoch 9: loss = 0.3873740593592326
Epoch 10: loss = 0.38566143413384757
Epoch 11: loss = 0.38393285870552063
Epoch 12: loss = 0.3821881343921026
Epoch 13: loss = 0.3804269671440125
Epoch 14: loss = 0.37864917814731597
Epoch 15: loss = 0.3768545061349869
Epoch 16: loss = 0.3750427265961965
Epoch 17: loss = 0.37321363588174183
Epoch 18: loss = 0.37136701345443723
Epoch 19: loss = 0.36950267950693766
Epoch 20: loss = 0.36762046019236244
Epoch 21: loss = 0.3657201965649923
Epoch 22: loss = 0.3638017584880193
Epoch 23: loss = 0.3618649850289027
Epoch 24: loss = 0.3599098106225332
Epoch 25: loss = 0.35793616573015846
Epoch 26: loss = 0.3559439649184545
Epoch 27: loss = 0.353933

In [17]:
first_qs_answered = list(pp_data_dict.keys())[6]
print(f"Actual solution vector: {pp_data_dict[first_qs_answered]}")
print(f"Predicted solution vector: {torch.sigmoid(mod_seek(first_qs_answered))}")

Actual solution vector: tensor([1., 0., 1., 0.])
Predicted solution vector: tensor([0.9561, 0.2252, 0.9928, 0.0124], grad_fn=<SigmoidBackward0>)
