# How many cells are identified?
This notebook contains code to check the results of the identification pipeline.

In [1]:
import pickle
import pandas as pd

## Prepare data

In [2]:
# Load the material-name -> CID dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open('data_RO1/dict_all_cells_v2.pkl', 'rb') as f:
    # The with-block closes the file on exit, so no explicit f.close() is needed.
    name_to_CID_dict = pickle.load(f)

# Add a placeholder dictionary entry for cells without an etl or htl.
# Its CID is 0 (not a missing value), so pd.isna() checks treat 'none' as identified.
name_to_CID_dict['none'] = 0

# Load the dataframe with the charge transport layers (columns "etl" and "htl" are used below)
with open('data_RO1/df_all_ctls.pkl', 'rb') as f:
    df_all_ctls = pickle.load(f)
    
def _split_semicolon_entries(materials):
    """Return a new list in which every ";"-separated entry is expanded.

    Each element of ``materials`` is split on ";" and its parts take the place
    of the original element, e.g. ["A;B", "C"] -> ["A", "B", "C"]. Elements
    without a ";" are kept as they are. Whitespace around the parts is NOT
    stripped, so the names stay byte-identical to the text between separators.
    """
    return [part for entry in materials for part in entry.split(";")]


# ETLs and HTLs: transform entries where multiple materials are separated by a ";" into lists.
# One shared helper replaces the two copy-pasted row-by-row loops.
for layer in ("etl", "htl"):
    df_all_ctls[layer] = df_all_ctls[layer].apply(_split_semicolon_entries)

## Combine dictionary info with data

In [3]:
# Create a DataFrame from the dictionary: one row per material name with its CID
df_new = pd.DataFrame(list(name_to_CID_dict.items()), columns=['Material', 'CID'])


def _rows_by_material(layer_column):
    """Build an inverted index: material name -> labels of the rows that list it.

    ``layer_column`` is a Series whose values are lists of material names.
    Row labels are collected in row order, and a row is recorded at most once
    per material even if the material is repeated within that row's list.
    """
    rows = {}
    for row_label, materials in layer_column.items():
        # set(): a material repeated within one row must not add the row twice
        for material in set(materials):
            rows.setdefault(material, []).append(row_label)
    return rows


# Index each layer column once instead of scanning every row for every material
etl_rows_by_material = _rows_by_material(df_all_ctls['etl'])
htl_rows_by_material = _rows_by_material(df_all_ctls['htl'])

# Create the 'etl' and 'htl' columns: for each material, the df_all_ctls row labels using it
# (an empty list when the material is never used in that layer)
df_new['etl'] = df_new['Material'].map(lambda material: etl_rows_by_material.get(material, []))
df_new['htl'] = df_new['Material'].map(lambda material: htl_rows_by_material.get(material, []))

In [4]:
# Look up CIDs through a plain dict built once from df_new, instead of filtering
# the whole dataframe again for every material of every row.
cid_by_material = dict(zip(df_new['Material'], df_new['CID']))


def _all_identified(materials):
    """Return True when every material in ``materials`` has a non-missing CID.

    An empty list counts as identified. Evaluation stops at the first material
    whose CID is missing. A material that is absent from ``df_new`` raises
    KeyError naming that material.
    """
    return all(not pd.isna(cid_by_material[material]) for material in materials)


# Count how many ETLs are identified (one flag per row of df_all_ctls)
etl_identified = [_all_identified(materials) for materials in df_all_ctls['etl']]
etl_count = sum(etl_identified)

# Count how many HTLs are identified
htl_identified = [_all_identified(materials) for materials in df_all_ctls['htl']]
htl_count = sum(htl_identified)

# add identification to materials dataframe
df_all_ctls['etls_identified'] = etl_identified
df_all_ctls['htls_identified'] = htl_identified

# create a column stating whether both CTLs were identified
df_all_ctls['both_identified'] = df_all_ctls['etls_identified'] & df_all_ctls['htls_identified']

In [5]:
# Find the placeholder row for "no material" by name rather than by a hard-coded
# row label, so the summary stays correct if the dictionary size changes.
none_entry = df_new.loc[df_new['Material'] == 'none'].iloc[0]
cells_without_etl = none_entry['etl']  # row labels of cells whose ETL list contains 'none'
cells_without_htl = none_entry['htl']  # row labels of cells whose HTL list contains 'none'

# ETL: resulting count of identified entries
print(f"{etl_count} entries have an identified ETL.")
print(f"Of these, {len(cells_without_etl)} entries use no ETL material.")

# HTL: resulting count of identified entries
print(f"{htl_count} entries have an identified HTL.")
print(f"Of these, {len(cells_without_htl)} entries use no HTL material.")

# Both: resulting count of identified entries
print(f"Both CTLs are identified for {df_all_ctls['both_identified'].sum()} cells.")
# how many cells have no CTL at all? (no HTL, no ETL)
print(f"Of these, {len(set(cells_without_htl) & set(cells_without_etl))} cells have neither ETL nor HTL.")

40950 entries have an identified ETL.
Of these, 281 entries use no ETL material.
39643 entries have an identified HTL.
Of these, 2626 entries use no HTL material.
Both CTLs are identified for 37713 cells.
Of these, 31 cells have neither ETL nor HTL.


In [6]:
# Persist df_all_ctls under data_RO1/ as a pickle file
with open('data_RO1/df_all_ctls_identified_v2.pkl', 'wb') as pickle_file:
    pickle.dump(df_all_ctls, pickle_file)

In [7]:
# Export df_all_ctls as an Excel workbook with a single sheet named "All_cells"
df_all_ctls.to_excel(
    'excel_tables/all_cells_identification_info.xlsx',
    sheet_name='All_cells',
)

In [22]:
# Print every dictionary entry whose CID is truthy, formatted as "name:CID".
# NOTE(review): the truthiness test also skips a CID of 0 and would let a NaN
# CID through -- confirm that this is the intended filter.
for entry, cid in name_to_CID_dict.items():
    if cid:
        print(f"{entry}:{cid}")

TiO2-c:26042
CsI:70918
AAO:4369393
SM13:6217392
 PDI:151809
X36:66553046
PCBM-70:71777692
Spiro-TAD:16134428
NiO-c:14805
Theobromine:5429
Ba(OH)2:6093286
BuO-DATPA:139204204
HL-2:168296954
 F8BT:59704068
OAI:444763
CdSe:9837007
CT4:86287519
Ethylene glycol:174
 PS:6423
 Methoxypropionitrile:141829
Spiro-TTB:16161851
BPTI:16130295
TBP:31357
PTQ10:139035630
JW6:167311785
Pt-np:23939
CdI2:277692
1,2-diaminoethane:3301
Boron subphthalocyanine chloride:11826144
N719 dye:51346531
 PBDB-T-SF:169550405
4-bromobenzenediazonium tetrafluoroborate:2734810
pFN-Br:168009989
V1160:7010037
3TPYMB:53426694
V1004:144364
Al2O3-mp:9989226
CJ-01:70680263
T101:16197323
PDA:12599
P3OT:566852
KOH:14797
MCA:1674
NPD:18609
PFN-Br:168009989
LiF:224478
Z25:135566693
MEH-PPV-20:57015767
 CuSCN:11029823
M105:731767
ITO:16217324
graphite nanofibers:5462310
X59:169503067
CD:23973
BenMeIM-Cl:10560335
SnS:426379
H3PW12O4:452313
Phenethylamine:1001
P3HT:22950249
BAI:1281
PCBM-60:53384373
KCl:4873
TPPI:7540
 Propylene ca

## Extra: How often were identified materials used?
This question is secondary and included only for interested readers:
how often does each identified material appear in the data?

In [8]:
# Running totals of how often identified materials appear as ETL / HTL
sum_etl = 0
sum_htl = 0
# Derive the total for the progress line from the data instead of hard-coding it
total_materials = len(df_new)

# Material names are unique (they come from dictionary keys), so each row can be
# read directly rather than re-filtering df_new by material name three times.
rows = zip(df_new['Material'], df_new['CID'], df_new['etl'], df_new['htl'])
for index, (material, cid, etl_cells, htl_cells) in enumerate(rows):
    if pd.isna(cid):
        continue  # unidentified materials are not counted here
    current_etl_sum = len(etl_cells)
    current_htl_sum = len(htl_cells)
    print(f"Etls with {material}: {current_etl_sum}")
    print(f"Htls with {material}: {current_htl_sum}")
    print(f"{index+1}/{total_materials}")
    sum_etl = sum_etl + current_etl_sum
    sum_htl = sum_htl + current_htl_sum

Etls with TiO2-c: 22019
Htls with TiO2-c: 0
1/2534
Etls with CsI: 1
Htls with CsI: 0
5/2534
Etls with AAO: 19
Htls with AAO: 0
6/2534
Etls with SM13: 0
Htls with SM13: 2
11/2534
Etls with  PDI: 5
Htls with  PDI: 0
12/2534
Etls with X36: 0
Htls with X36: 1
15/2534
Etls with PCBM-70: 277
Htls with PCBM-70: 0
20/2534
Etls with Spiro-TAD: 0
Htls with Spiro-TAD: 3
24/2534
Etls with NiO-c: 23
Htls with NiO-c: 1966
27/2534
Etls with Theobromine: 0
Htls with Theobromine: 1
30/2534
Etls with Ba(OH)2: 6
Htls with Ba(OH)2: 0
31/2534
Etls with BuO-DATPA: 0
Htls with BuO-DATPA: 1
33/2534
Etls with HL-2: 0
Htls with HL-2: 3
42/2534
Etls with  F8BT: 2
Htls with  F8BT: 0
44/2534
Etls with OAI: 0
Htls with OAI: 3
47/2534
Etls with CdSe: 3
Htls with CdSe: 0
51/2534
Etls with CT4: 0
Htls with CT4: 1
52/2534
Etls with Ethylene glycol: 0
Htls with Ethylene glycol: 2
53/2534
Etls with  PS: 1
Htls with  PS: 0
55/2534
Etls with  Methoxypropionitrile: 0
Htls with  Methoxypropionitrile: 1
57/2534
Etls with Spir

In [9]:
# Report the accumulated appearance totals, one line per layer type
print(
    f"ETL identified materials appearances: {sum_etl}",
    f"HTL identified materials appearances: {sum_htl}",
    sep="\n",
)

ETL identified materials appearances: 70592
HTL identified materials appearances: 42207


## Unidentified materials: How many cells pertain to each?

This shows whether any unidentified materials are used in many cells. In such cases it may be worth the time to identify them manually.

In [10]:
# Build a dataframe of the unidentified materials (missing CID) and how often each is used
unidentified = df_new[df_new['CID'].isna()]

unidentified_materials = unidentified['Material'].tolist()
# Usage count = number of rows listing the material as ETL plus the number listing it as HTL
no_cells = [
    len(etl_cells) + len(htl_cells)
    for etl_cells, htl_cells in zip(unidentified['etl'], unidentified['htl'])
]

df_unid = pd.DataFrame({'Material': unidentified_materials, 'Cell_amount': no_cells})

In [13]:
# Write the unidentified materials, highest Cell_amount first, to an Excel sheet
(
    df_unid
    .sort_values(by='Cell_amount', ascending=False)
    .to_excel('excel_tables/unidentified.xlsx', sheet_name='Unidentified')
)

In [12]:
# Display the 40 unidentified materials with the highest Cell_amount
(
    df_unid
    .sort_values(by='Cell_amount', ascending=False)
    .head(40)
)

Unnamed: 0,Material,Cell_amount
1263,PEIE,173
12,NiMgLiO,161
1420,P3CT-Na,136
302,AZO-np,121
10,PEDOT,60
623,P3CT-N,57
154,PEAI,50
1561,P3CT,41
309,TIPD,36
508,NiMgLiO-c,33
