# Degeneracy Check

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import datetime
from scipy.sparse import csr_matrix, save_npz, load_npz

In [2]:
# Helpers to save a feature matrix in sparse form and to load it back
def vec2csr(vec, csr_file_name, columns_file_name):
    """Save ``vec`` to disk as a SciPy CSR matrix, optionally with its column labels.

    Parameters
    ----------
    vec : array-like or pandas.DataFrame
        Matrix to store; it is converted with ``csr_matrix(vec)``.
    csr_file_name : str
        Destination passed to ``scipy.sparse.save_npz``.
    columns_file_name : str or None
        Destination passed to ``np.save`` for ``vec.columns``. Pass ``None`` to
        skip saving column labels (required when ``vec`` has no ``columns``
        attribute).

    Returns
    -------
    None
    """
    save_npz(csr_file_name, csr_matrix(vec))
    # Identity comparison is the correct way to test for None.
    if columns_file_name is not None:
        column_labels = np.array(vec.columns)
        np.save(columns_file_name, column_labels)
def csr2vec(csr_file_name, columns_file_name):
    """Load a sparse ``.npz`` matrix from disk and return it in dense form.

    Parameters
    ----------
    csr_file_name : str
        File read with ``scipy.sparse.load_npz``.
    columns_file_name : str or None
        File read with ``np.load`` that holds the column labels. Pass ``None``
        to get a plain array without labels.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        The dense array when ``columns_file_name`` is ``None``; otherwise a
        DataFrame whose columns are the loaded labels.
    """
    dense = load_npz(csr_file_name).toarray()
    # Identity comparison is the correct way to test for None.
    if columns_file_name is None:
        return dense
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects,
    # so only point this at files you created yourself (e.g. via vec2csr).
    column_labels = np.load(columns_file_name, allow_pickle=True)
    return pd.DataFrame(dense, columns=column_labels)

In [3]:
# Read the raw dataset and keep only its JV_default_PCE column.
# NOTE(review): y is not referenced again in the visible cells of this notebook.
raw_csv_path = "data/raw/Perovskite_36937data.csv"
y = pd.read_csv(raw_csv_path)["JV_default_PCE"]

# Find the indices of degenerate (identical) vectors

In [4]:
# Restore the "all_sp0_oliynyk_zero" matrix, with its column labels, as a DataFrame.
X_0 = csr2vec(
    csr_file_name="data/csr/all_sp0_oliynyk_zero_csr.npz",
    columns_file_name="data/csr/all_sp0_oliynyk_zero_columns.npy",
)

In [5]:
# Cheap pre-filter: identical rows necessarily share the same row sum, so
# gather every set of row labels whose row sum occurs more than once.
df = X_0[0:]
grouped = df.groupby(df.sum(axis=1))
sum_dup = sorted(
    list(df.index[members])
    for members in grouped.groups.values()
    if len(members) > 1
)
# Number of candidate groups that still need an exact row comparison.
print(len(sum_dup))

5323


In [6]:
# For every candidate group (rows sharing a row sum), find the rows that are
# identical in all columns, then save the resulting index groups to disk.
print(datetime.datetime.now())
t1 = datetime.datetime.now()
# Build the dense, integer-labelled copy of X_0 once. Re-creating it inside the
# loop copied the whole matrix for every candidate group.
X_0_dense = pd.DataFrame(np.array(X_0))
all_columns = X_0_dense.columns.tolist()
genuine_dup = []
for j in range(len(sum_dup)):
    df = X_0_dense.iloc[sum_dup[j]]
    # Keep only rows whose full column tuple occurs more than once.
    grouped = df.groupby(all_columns, as_index=False)
    duplicated = grouped.filter(lambda x: len(x) > 1)
    duplicated_values = duplicated[all_columns]
    # One index list per distinct duplicated row.
    grouped_index = [
        duplicated.index[duplicated_values.eq(val).all(axis=1)].tolist()
        for val in duplicated_values.drop_duplicates().values
    ]
    genuine_dup.append(grouped_index)
    if j % 200 == 0:
        t = datetime.datetime.now()
        print(j, t-t1)
t2 = datetime.datetime.now()
print("processing time:", t2-t1)
# Flatten to one sorted list of index groups.
dup = [group for groups in genuine_dup for group in groups]
dup.sort()
# The groups have different lengths, so store them in an explicit 1-D object
# array of lists: np.array(dup) raises on ragged input in recent NumPy and
# would silently become a 2-D array if all groups had the same length.
dup_arr = np.empty(len(dup), dtype=object)
for i, group in enumerate(dup):
    dup_arr[i] = group
np.save("X0_dup_index.npy", dup_arr)
print(datetime.datetime.now())

2023-06-28 21:34:58.518142
0 0:00:04.065601
200 0:13:29.126056
400 0:26:11.677782
600 0:38:58.880621
800 0:51:14.609282
1000 1:03:51.227214
1200 1:16:49.105950
1400 1:29:20.942536
1600 1:41:53.727451
1800 1:54:01.637465
2000 2:06:32.572922
2200 2:18:52.479868
2400 2:31:30.142466
2600 2:43:32.550469
2800 2:55:26.062557
3000 3:07:33.619090
3200 3:19:38.597485
3400 3:31:50.902741
3600 3:44:13.210359
3800 3:56:51.960452
4000 4:09:13.648128
4200 4:21:30.392349
4400 4:33:56.072685
4600 4:45:55.262770
4800 4:58:00.626728
5000 5:10:12.039864
5200 5:22:25.360768
processing time: 5:29:34.233039
2023-06-29 03:04:32.760125


In [7]:
# Restore the "all_sp1_oliynyk_zero" matrix, with its column labels, as a DataFrame.
X_1 = csr2vec(
    csr_file_name="data/csr/all_sp1_oliynyk_zero_csr.npz",
    columns_file_name="data/csr/all_sp1_oliynyk_zero_columns.npy",
)

In [8]:
# Cheap pre-filter: identical rows necessarily share the same row sum, so
# gather every set of row labels whose row sum occurs more than once.
df = X_1[0:]
grouped = df.groupby(df.sum(axis=1))
sum_dup = sorted(
    list(df.index[members])
    for members in grouped.groups.values()
    if len(members) > 1
)
# Number of candidate groups that still need an exact row comparison.
print(len(sum_dup))

5592


In [9]:
# For every candidate group (rows sharing a row sum), find the rows that are
# identical in all columns, then save the resulting index groups to disk.
print(datetime.datetime.now())
t1 = datetime.datetime.now()
# Build the dense, integer-labelled copy of X_1 once. Re-creating it inside the
# loop copied the whole matrix for every candidate group.
X_1_dense = pd.DataFrame(np.array(X_1))
all_columns = X_1_dense.columns.tolist()
genuine_dup = []
for j in range(len(sum_dup)):
    df = X_1_dense.iloc[sum_dup[j]]
    # Keep only rows whose full column tuple occurs more than once.
    grouped = df.groupby(all_columns, as_index=False)
    duplicated = grouped.filter(lambda x: len(x) > 1)
    duplicated_values = duplicated[all_columns]
    # One index list per distinct duplicated row.
    grouped_index = [
        duplicated.index[duplicated_values.eq(val).all(axis=1)].tolist()
        for val in duplicated_values.drop_duplicates().values
    ]
    genuine_dup.append(grouped_index)
    if j % 200 == 0:
        t = datetime.datetime.now()
        print(j, t-t1)
t2 = datetime.datetime.now()
print("processing time:", t2-t1)
# Flatten to one sorted list of index groups.
dup = [group for groups in genuine_dup for group in groups]
dup.sort()
# The groups have different lengths, so store them in an explicit 1-D object
# array of lists: np.array(dup) raises on ragged input in recent NumPy and
# would silently become a 2-D array if all groups had the same length.
dup_arr = np.empty(len(dup), dtype=object)
for i, group in enumerate(dup):
    dup_arr[i] = group
np.save("X1_dup_index.npy", dup_arr)
print(datetime.datetime.now())

2023-06-29 03:04:35.718746
0 0:00:02.745892
200 0:12:38.710519
400 0:25:04.705582
600 0:37:28.593908
800 0:49:33.686007
1000 1:01:23.461165
1200 1:13:49.939973
1400 1:26:05.849154
1600 1:38:20.416081
1800 1:50:29.752544
2000 2:02:18.125382
2200 2:14:53.986734
2400 2:26:53.427293
2600 2:39:17.377331
2800 2:50:59.405020
3000 3:02:48.041265
3200 3:14:41.113240
3400 3:26:39.691201
3600 3:38:32.942386
3800 3:50:50.022384
4000 4:03:07.862284
4200 4:15:18.024187
4400 4:27:15.108116
4600 4:39:31.131815
4800 4:51:44.750176
5000 5:03:31.950056
5200 5:15:18.189419
5400 5:27:24.634256
processing time: 5:38:36.066556
2023-06-29 08:43:11.801077


In [10]:
# Restore the "all_sp2_oliynyk_zero" matrix, with its column labels, as a DataFrame.
X_2 = csr2vec(
    csr_file_name="data/csr/all_sp2_oliynyk_zero_csr.npz",
    columns_file_name="data/csr/all_sp2_oliynyk_zero_columns.npy",
)

In [11]:
# Cheap pre-filter: identical rows necessarily share the same row sum, so
# gather every set of row labels whose row sum occurs more than once.
df = X_2[0:]
grouped = df.groupby(df.sum(axis=1))
sum_dup = sorted(
    list(df.index[members])
    for members in grouped.groups.values()
    if len(members) > 1
)
# Number of candidate groups that still need an exact row comparison.
print(len(sum_dup))

5655


In [12]:
# For every candidate group (rows sharing a row sum), find the rows that are
# identical in all columns, then save the resulting index groups to disk.
print(datetime.datetime.now())
t1 = datetime.datetime.now()
# Build the dense, integer-labelled copy of X_2 once. Re-creating it inside the
# loop copied the whole matrix for every candidate group.
X_2_dense = pd.DataFrame(np.array(X_2))
all_columns = X_2_dense.columns.tolist()
genuine_dup = []
for j in range(len(sum_dup)):
    df = X_2_dense.iloc[sum_dup[j]]
    # Keep only rows whose full column tuple occurs more than once.
    grouped = df.groupby(all_columns, as_index=False)
    duplicated = grouped.filter(lambda x: len(x) > 1)
    duplicated_values = duplicated[all_columns]
    # One index list per distinct duplicated row.
    grouped_index = [
        duplicated.index[duplicated_values.eq(val).all(axis=1)].tolist()
        for val in duplicated_values.drop_duplicates().values
    ]
    genuine_dup.append(grouped_index)
    if j % 200 == 0:
        t = datetime.datetime.now()
        print(j, t-t1)
t2 = datetime.datetime.now()
print("processing time:", t2-t1)
# Flatten to one sorted list of index groups.
dup = [group for groups in genuine_dup for group in groups]
dup.sort()
# The groups have different lengths, so store them in an explicit 1-D object
# array of lists: np.array(dup) raises on ragged input in recent NumPy and
# would silently become a 2-D array if all groups had the same length.
dup_arr = np.empty(len(dup), dtype=object)
for i, group in enumerate(dup):
    dup_arr[i] = group
np.save("X2_dup_index.npy", dup_arr)
print(datetime.datetime.now())

2023-06-29 08:43:14.162642
0 0:00:03.092916
200 0:09:58.074834
400 0:19:33.706084
600 0:29:10.394933
800 0:38:27.336782
1000 0:47:41.327697
1200 0:57:13.277213
1400 1:06:54.754844
1600 1:16:28.699589
1800 1:25:57.800617
2000 1:35:11.891460
2200 1:44:54.581856
2400 1:54:18.725230
2600 2:03:54.406687
2800 2:13:06.930589
3000 2:22:11.170115
3200 2:31:34.327096
3400 2:40:33.160919
3600 2:50:10.035130
3800 2:59:30.779407
4000 3:09:09.716519
4200 3:18:44.130831
4400 3:27:51.677659
4600 3:37:21.770433
4800 3:46:56.055467
5000 3:56:12.365051
5200 4:05:18.498945
5400 4:14:53.055902
5600 4:24:14.282552
processing time: 4:26:33.861298
2023-06-29 13:09:48.035095


In [13]:
# Restore the "all_sp3_oliynyk_zero" matrix, with its column labels, as a DataFrame.
X_3 = csr2vec(
    csr_file_name="data/csr/all_sp3_oliynyk_zero_csr.npz",
    columns_file_name="data/csr/all_sp3_oliynyk_zero_columns.npy",
)

In [14]:
# Cheap pre-filter: identical rows necessarily share the same row sum, so
# gather every set of row labels whose row sum occurs more than once.
df = X_3[0:]
grouped = df.groupby(df.sum(axis=1))
sum_dup = sorted(
    list(df.index[members])
    for members in grouped.groups.values()
    if len(members) > 1
)
# Number of candidate groups that still need an exact row comparison.
print(len(sum_dup))

5750


In [15]:
# For every candidate group (rows sharing a row sum), find the rows that are
# identical in all columns, then save the resulting index groups to disk.
print(datetime.datetime.now())
t1 = datetime.datetime.now()
# Build the dense, integer-labelled copy of X_3 once. Re-creating it inside the
# loop copied the whole matrix for every candidate group.
X_3_dense = pd.DataFrame(np.array(X_3))
all_columns = X_3_dense.columns.tolist()
genuine_dup = []
for j in range(len(sum_dup)):
    df = X_3_dense.iloc[sum_dup[j]]
    # Keep only rows whose full column tuple occurs more than once.
    grouped = df.groupby(all_columns, as_index=False)
    duplicated = grouped.filter(lambda x: len(x) > 1)
    duplicated_values = duplicated[all_columns]
    # One index list per distinct duplicated row.
    grouped_index = [
        duplicated.index[duplicated_values.eq(val).all(axis=1)].tolist()
        for val in duplicated_values.drop_duplicates().values
    ]
    genuine_dup.append(grouped_index)
    if j % 200 == 0:
        t = datetime.datetime.now()
        print(j, t-t1)
t2 = datetime.datetime.now()
print("processing time:", t2-t1)
# Flatten to one sorted list of index groups.
dup = [group for groups in genuine_dup for group in groups]
dup.sort()
# The groups have different lengths, so store them in an explicit 1-D object
# array of lists: np.array(dup) raises on ragged input in recent NumPy and
# would silently become a 2-D array if all groups had the same length.
dup_arr = np.empty(len(dup), dtype=object)
for i, group in enumerate(dup):
    dup_arr[i] = group
np.save("X3_dup_index.npy", dup_arr)
print(datetime.datetime.now())

2023-06-29 13:09:50.288115
0 0:00:03.002436
200 0:09:26.854255
400 0:18:48.495597
600 0:28:01.611908
800 0:36:57.515058
1000 0:45:45.931410
1200 0:54:55.075416
1400 1:04:13.777653
1600 1:13:10.402462
1800 1:22:12.687321
2000 1:31:08.424845
2200 1:40:13.699729
2400 1:49:20.916050
2600 1:58:38.436197
2800 2:07:21.008064
3000 2:16:00.948116
3200 2:24:47.316810
3400 2:33:22.760940
3600 2:42:32.461109
3800 2:51:14.901365
4000 3:00:27.392037
4200 3:09:51.014219
4400 3:18:54.470424
4600 3:27:45.503863
4800 3:36:53.237578
5000 3:45:59.775527
5200 3:54:46.014403
5400 4:03:50.405571
5600 4:12:58.543737
processing time: 4:19:31.694662
2023-06-29 17:29:21.993817


# How many groups of duplicate indices exist?

In [5]:
# Reload the saved duplicate-index groups for the four feature matrices.
# NOTE(security): allow_pickle=True unpickles arbitrary objects; only load
# files that were written by this notebook.
(X0_dup_index, X1_dup_index, X2_dup_index, X3_dup_index) = [
    np.load(saved_path, allow_pickle=True)
    for saved_path in (
        "X0_dup_index.npy",
        "X1_dup_index.npy",
        "X2_dup_index.npy",
        "X3_dup_index.npy",
    )
]

In [6]:
# Number of duplicate groups stored for each feature matrix, one per line.
for dup_index in (X0_dup_index, X1_dup_index, X2_dup_index, X3_dup_index):
    print(len(dup_index))

6441
6446
6453
6472


# How many data points are degenerate?

In [7]:
def count_elements(nested_list):
    """Return the number of leaf items in an arbitrarily nested list.

    Every element that is itself a ``list`` is descended into recursively;
    anything else (numbers, tuples, arrays, ...) counts as a single item.
    The top-level argument only needs to be iterable.
    """
    return sum(
        count_elements(item) if isinstance(item, list) else 1
        for item in nested_list
    )

In [8]:
# Total number of row indices contained in the duplicate groups of each matrix.
for dup_index in (X0_dup_index, X1_dup_index, X2_dup_index, X3_dup_index):
    print(count_elements(dup_index))

20677
20695
20737
21056


In [9]:
# Number of distinct feature vectors per matrix:
#   total rows - rows that belong to a duplicate group + one per duplicate group.
# The counts are computed from the loaded index groups instead of being typed
# in by hand, so they cannot drift from the data.
n_rows = 36937  # NOTE(review): presumably the row count of Perovskite_36937data.csv; confirm
for dup_index in (X0_dup_index, X1_dup_index, X2_dup_index, X3_dup_index):
    print(n_rows - count_elements(dup_index) + len(dup_index))

22701
22688
22653
22353
