## Importing libraries and directory settings

In [1]:
import numpy as np
from torch_geometric.data import HeteroData
import torch
import random 

data_dir = '/home/deependra/project/23-hetero-smote/HeteroG/data/imdb_han/'

# Loading the data from the files (already created in the preprocess)

In [2]:
# node numbers and file_names
 
N_m = 4666
N_d = 2272
N_a = 5850

data_dir = '/home/deependra/project/23-hetero-smote/HeteroG/data/imdb_han/'

md_file = 'movie_director.txt'
ma_file = 'movie_actors.txt'

m_fullclass_file = 'm_class_local.txt'

m_emb_file = 'movie_embeddings.txt'

## Loading full data as PyG HeteroData

In [3]:
# creating full classes

full_y = np.loadtxt(data_dir + m_fullclass_file, delimiter=',', dtype=int)

ratio_train_val_test = [0.2, 0.2, 0.6] # for test whatever rest of data is used not the given value

class0 = full_y[full_y[:,1] == 0]
class1 = full_y[full_y[:,1] == 1]
class2 = full_y[full_y[:,1] == 2]

np.random.shuffle(class0)
np.random.shuffle(class1)
np.random.shuffle(class2)

# train and val numbers needed correctly calculated rest will be assigned to test
class0_train_val_test = np.array([int(len(class0)*i) for i in ratio_train_val_test])
class1_train_val_test = np.array([int(len(class1)*i) for i in ratio_train_val_test])
class2_train_val_test = np.array([int(len(class2)*i) for i in ratio_train_val_test])

print(f"number of class 0 movies: {len(class0)}")
print(f"number of class 1 movies: {len(class1)}")
print(f"number of class 2 movies: {len(class2)}")

m_fullclass_train = []
m_fullclass_val = []
m_fullclass_test = []


m_fullclass_train.extend(class0[:class0_train_val_test[0]])
m_fullclass_train.extend(class1[:class1_train_val_test[0]])
m_fullclass_train.extend(class2[:class2_train_val_test[0]])

m_fullclass_val.extend(class0[class0_train_val_test[0]:class0_train_val_test[0]+class0_train_val_test[1]])
m_fullclass_val.extend(class1[class1_train_val_test[0]:class1_train_val_test[0]+class1_train_val_test[1]])
m_fullclass_val.extend(class2[class2_train_val_test[0]:class2_train_val_test[0]+class2_train_val_test[1]])

# test will have rest of the data
m_fullclass_test.extend(class0[class0_train_val_test[0]+class0_train_val_test[1]:])
m_fullclass_test.extend(class1[class1_train_val_test[0]+class1_train_val_test[1]:])
m_fullclass_test.extend(class2[class2_train_val_test[0]+class2_train_val_test[1]:])

number of class 0 movies: 543
number of class 1 movies: 924
number of class 2 movies: 1473


In [4]:
# loading full HeteroData

full_data = HeteroData()

# adding nodes
full_data['movie'].num_nodes = N_m
full_data['director'].num_nodes = N_d
full_data['actor'].num_nodes = N_a

# movie director edge index
full_data['movie','directed by','director'].edge_index = torch.tensor(np.loadtxt(data_dir + md_file, delimiter=',', dtype=int)).T.long()
full_data['movie','has','actor'].edge_index = torch.tensor(np.loadtxt(data_dir + ma_file, delimiter=',', dtype=int)).T.long()

# movie embeddings
full_data['movie'].x = torch.tensor(np.loadtxt(data_dir + m_emb_file, delimiter=' ', dtype=float))


# movie class
full_data['movie'].y_train = torch.tensor(m_fullclass_train).long()
full_data['movie'].y_val = torch.tensor(m_fullclass_test).long()
full_data['movie'].y_test = torch.tensor(m_fullclass_val).long()


print(full_data)

HeteroData(
  [1mmovie[0m={
    num_nodes=4666,
    x=[4666, 128],
    y_train=[586, 2],
    y_test=[586, 2],
    y_val=[1768, 2]
  },
  [1mdirector[0m={ num_nodes=2272 },
  [1mactor[0m={ num_nodes=5850 },
  [1m(movie, directed by, director)[0m={ edge_index=[2, 4666] },
  [1m(movie, has, actor)[0m={ edge_index=[2, 13990] }
)


  full_data['movie'].y_train = torch.tensor(m_fullclass_train).long()


## Loading balanced data as PyG Heterodata



In [5]:
# creating balanced train

m_all_class = np.loadtxt(data_dir + 'm_class_local.txt',delimiter=',', dtype=int)

# for each class, creating train index and class
n_train = 100

training_idx = []


for i in range(3):
    class_idx = [m_all_class[i] for i in np.where(m_all_class[:,1] == i)[0]]
    print(f"number of class {i} movies: {len(class_idx)}")
    random.shuffle(class_idx)
    class_idx = np.array(class_idx)
    training_idx.extend(class_idx[:n_train,:])

training_idx = np.array(training_idx)


print(f"\ntraining_idxs: {len(training_idx)}")
for i in range(3):
    print(f"number of class {i} movies: {len(np.where(training_idx[:,1] == i)[0])}")

number of class 0 movies: 543
number of class 1 movies: 924
number of class 2 movies: 1473

training_idxs: 300
number of class 0 movies: 100
number of class 1 movies: 100
number of class 2 movies: 100


In [6]:
# loading imbalanced HeteroData

balanced_data = HeteroData()

# adding nodes
balanced_data['movie'].num_nodes = N_m
balanced_data['director'].num_nodes = N_d
balanced_data['actor'].num_nodes = N_a

# movie director edge index
balanced_data['movie','directed by','director'].edge_index = torch.tensor(np.loadtxt(data_dir + md_file, delimiter=',', dtype=int)).T.long()
balanced_data['movie','has','actor'].edge_index = torch.tensor(np.loadtxt(data_dir + ma_file, delimiter=',', dtype=int)).T.long()

# movie embeddings
balanced_data['movie'].x = torch.tensor(np.loadtxt(data_dir + m_emb_file, delimiter=' ', dtype=float))

# movie class
balanced_data['movie'].y_train = torch.tensor(training_idx).long()
balanced_data['movie'].y_val = torch.tensor(m_fullclass_val).long()
balanced_data['movie'].y_test = torch.tensor(m_fullclass_test).long()


print(balanced_data)

HeteroData(
  [1mmovie[0m={
    num_nodes=4666,
    x=[4666, 128],
    y_train=[300, 2],
    y_val=[586, 2],
    y_test=[1768, 2]
  },
  [1mdirector[0m={ num_nodes=2272 },
  [1mactor[0m={ num_nodes=5850 },
  [1m(movie, directed by, director)[0m={ edge_index=[2, 4666] },
  [1m(movie, has, actor)[0m={ edge_index=[2, 13990] }
)


## Loading imbalanced data as PyG Heterodata

In [7]:
# DATA imbalance settings
classes = [0,1,2]
n_full = [100,100,300] # take only less than [100,100,300] for now (train, val, test)
ratio_classes = [1,1,0.5] # for classes
random_seed_value = 42 # change this to get different datasets for same imbalance settings
random.seed(random_seed_value)

n_train_classes = [int(n_full[0]*cls) for cls in ratio_classes]

#setting imbalance to classes
balanced_train_idx = np.loadtxt(data_dir + m_fullclass_file, delimiter=',', dtype=int)

imbalanced_train_idx = []
for cls in classes:
    imbalanced_train_idx.extend(random.sample(list(balanced_train_idx[balanced_train_idx[:,1] == [cls for i in range(len(balanced_train_idx))]]), n_train_classes[cls]))
    
imbalanced_train_idx = np.array(imbalanced_train_idx)


print(f"imbalanced_train_idx: {len(imbalanced_train_idx)}")
# for cls in classes:
#     print(f"number of class {cls} movies: {len(np.where(imbalanced_train_idx[:,1] == cls)[0])}")

imbalanced_train_idx: 250


In [8]:
# loading imbalanced HeteroData

imbalanced_data = HeteroData()

imbalanced_data['movie'].num_nodes = N_m
imbalanced_data['director'].num_nodes = N_d
imbalanced_data['actor'].num_nodes = N_a

# movie director edge index
imbalanced_data['movie','directed by','director'].edge_index = torch.tensor(np.loadtxt(data_dir + md_file, delimiter=',', dtype=int)).T.long()
imbalanced_data['movie','has','actor'].edge_index = torch.tensor(np.loadtxt(data_dir + ma_file, delimiter=',', dtype=int)).T.long()

# movie embeddings
imbalanced_data['movie'].x = torch.tensor(np.loadtxt(data_dir + m_emb_file, delimiter=' ', dtype=float))

# movie class
imbalanced_data['movie'].y_train = torch.tensor(imbalanced_train_idx).long()
imbalanced_data['movie'].y_test = torch.tensor(m_fullclass_test).long()
imbalanced_data['movie'].y_val = torch.tensor(m_fullclass_val).long()

print(imbalanced_data)

HeteroData(
  [1mmovie[0m={
    num_nodes=4666,
    x=[4666, 128],
    y_train=[250, 2],
    y_test=[1768, 2],
    y_val=[586, 2]
  },
  [1mdirector[0m={ num_nodes=2272 },
  [1mactor[0m={ num_nodes=5850 },
  [1m(movie, directed by, director)[0m={ edge_index=[2, 4666] },
  [1m(movie, has, actor)[0m={ edge_index=[2, 13990] }
)
