In [1]:
import os
import math
import pickle
import csv
import numpy as np
import pandas as pd

from ase import atoms
from pprint import pprint

from tqdm import notebook as tqdm
from tqdm.auto import tqdm as tqdm_pandas
tqdm_pandas.pandas()

### 2a. Load in the structures dataframe

In [26]:
# Load the pickled structures DataFrame from the supporting-files directory.
# NOTE(security): pickle.load can execute arbitrary code -- only unpickle files
# that come from a trusted source.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/structures_df_3p8_post_sanitize_w_simplifications.pkl')
# The context manager closes the file even if unpickling raises.
with open(save_path, 'rb') as open_file:
    structures_df = pickle.load(open_file)

In [5]:
# Preview the first rows of the loaded structures table (head() defaults to 5 rows).
structures_df.head()

Unnamed: 0,index,structure,ICSD_ID,MP_ID,pretty_formula,spacegroup,bandgap,e_hull,ase_structure,composition,replacement,structure_A,structure_AM,structure_CAN,structure_CAMN,structure_A40,structure_AM40,structure_CAN40,structure_CAMN40
0,0,"[[0. 0. 2.6255595] Li0+, [0. ...",180565,mp-1001790,LiO3,Imm2,0.0854,0.22542,"(Atom('Li', [0.0, 0.0, 2.6255594986], index=0)...",Li1 O3,,"[[0. 0. 5.35128645] S0+, [5.55...",[[0. 0. 2.6255595] Li0+],"[[0. 0. 5.35128645] Mg0+, [5.5...","[[0. 0. 2.6255595] Li0+, [0. ...","[[0. 0. 7.5441549] S0+, [-1.1102...",[[0. 0. 3.70147024] Li0+],"[[0. 0. 7.5441549] Mg0+, [-1.110...","[[0. 0. 3.70147024] Li0+, [0. ..."
1,1,"[[2.79072525 1.34013453 0.79373764] Li0+, [0.9...",188829,mp-1001825,LiBe,P2_1/m,0.0,0.166972,"(Atom('Li', [2.79072525, 1.34013452726, 0.7937...",Li2 Be2,,"[[2.79072525 1.17476284 3.09734748] S0+, [0.93...","[[2.79072525 1.34013453 0.79373764] Li0+, [0.9...","[[2.79072525 1.17476284 3.09734748] Mg0+, [0.9...","[[2.79072525 1.34013453 0.79373764] Li0+, [0.9...","[[3.39085782 1.42739017 3.76341775] S0+, [1.13...","[[3.39085782 1.62832427 0.96442725] Li0+, [1.1...","[[3.39085782 1.42739017 3.76341775] Mg0+, [1.1...","[[3.39085782 1.62832427 0.96442725] Li0+, [1.1..."
2,2,"[[2.412716 2.412716 2.412716] Li0+, [3.619074 ...",236959,mp-1001831,LiB,Fd-3m,1.4331,0.386054,"(Atom('Li', [2.412716, 2.412716, 2.412716], in...",Li2 B2,,"[[1.206358 1.206358 1.206358] S0+, [0. 0. 0.] ...","[[2.412716 2.412716 2.412716] Li0+, [3.619074 ...","[[1.206358 1.206358 1.206358] Mg0+, [0. 0. 0.]...","[[2.412716 2.412716 2.412716] Li0+, [3.619074 ...","[[1.70997595 1.70997595 1.70997595] S0+, [0. 0...","[[3.41995189 3.41995189 3.41995189] Li0+, [5.1...","[[1.70997595 1.70997595 1.70997595] Mg0+, [0. ...","[[3.41995189 3.41995189 3.41995189] Li0+, [5.1..."
3,3,"[[0. 0. 0.] Li+, [1.2797665 1.2797665 1.279766...",184904,mp-1009009,LiF,Pm-3m,7.5195,0.287542,"(Atom('Li', [0.0, 0.0, 0.0], index=0), Atom('F...",Li1 F1,,[[1.2797665 1.2797665 1.2797665] S-],"[[0. 0. 0.] Li+, [1.2797665 1.2797665 1.279766...",[[1.2797665 1.2797665 1.2797665] S-],"[[0. 0. 0.] Li+, [1.2797665 1.2797665 1.279766...",[[1.70997595 1.70997595 1.70997595] S-],"[[0. 0. 0.] Li+, [1.70997595 1.70997595 1.7099...",[[1.70997595 1.70997595 1.70997595] S-],"[[0. 0. 0.] Li+, [1.70997595 1.70997595 1.7099..."
4,4,"[[1.4805505 1.9928345 2.448045 ] Li0+, [0. 0. ...",180561,mp-1018789,LiO2,Pnnm,0.0,0.084218,"(Atom('Li', [1.4805505, 1.9928345, 2.448045], ...",Li2 O4,,"[[1.4805505 2.51837684 0.41921791] S0+, [1.48...","[[1.4805505 1.9928345 2.448045 ] Li0+, [0. 0. ...","[[1.4805505 2.51837684 0.41921791] Mg0+, [1.4...","[[1.4805505 1.9928345 2.448045 ] Li0+, [0. 0. ...","[[2.07903566 3.53638411 0.588679 ] S0+, [2.07...","[[2.07903566 2.798401 3.43762192] Li0+, [0. ...","[[2.07903566 3.53638411 0.588679 ] Mg0+, [2.0...","[[2.07903566 2.798401 3.43762192] Li0+, [0. ..."


### 2b. Create a pd.DataFrame for the labels (labels_df) from the structures_df

In [6]:
# Columns carried over from structures_df to form the label table
label_columns = ['composition', 'ICSD_ID', 'MP_ID', 'bandgap', 'e_hull']
labels_df = structures_df.loc[:, label_columns]

In [7]:
# Create empty (all-None) placeholder columns, in this order:
# the conductivity label and the display name of each structure.
for placeholder_column in ('conductivity', 'name'):
    labels_df[placeholder_column] = None

In [8]:
# NOTE: with a negative n, head() returns every row EXCEPT the last |n| rows,
# so this selects all but the final 50 rows (the rendered table is truncated
# by the notebook display, as the "..." row in the output shows).
labels_df.head(-50)

Unnamed: 0,composition,ICSD_ID,MP_ID,bandgap,e_hull,conductivity,name
0,Li1 O3,180565,mp-1001790,0.0854,0.225420,,
1,Li2 Be2,188829,mp-1001825,0.0000,0.166972,,
2,Li2 B2,236959,mp-1001831,1.4331,0.386054,,
3,Li1 F1,184904,mp-1009009,7.5195,0.287542,,
4,Li2 O4,180561,mp-1018789,0.0000,0.084218,,
...,...,...,...,...,...,...,...
25138,Li12 B4 N8,155126,mp-5914,3.4434,0.003077,,
25139,K6 Li3 Ta18 P9 O72,81336,,,,,
25140,Li2 V2 Cu2 O8,290741,,,,,
25141,Li1 Sm9 Si6 O26,83279,,,,,


### 2c. Check how many of the experimental labels appear in the DataFrame

Note: Some materials were removed due to errors, mostly an inability to create an ordered structure or to charge-decorate it. Other materials simply do not have a matching ICSD ID, so not every experimental label can be matched to a row of the DataFrame.

In [10]:
# Match the experimental conductivity labels in 'Semi-supervised_labels.csv'
# against labels_df by ICSD ID.
#   all_labels -- every ICSD ID read from the CSV (row[3])
#   match      -- the ICSD ID repeated once per matching labels_df row
all_labels = []
match = []

# Value stored when the CSV reports the conductivity only as an upper bound
# (the field in row[1] starts with '<').
UPPER_BOUND_CONDUCTIVITY = float(1E-10)

# Only the 'conductivity' column is written below, so the ID column and the
# target column position can be looked up once.
icsd_column = labels_df['ICSD_ID']
conductivity_col = labels_df.columns.get_loc('conductivity')

with open('Semi-supervised_labels.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        icsd_id = int(row[3])
        all_labels.append(icsd_id)

        # Vectorized comparison yields the matching row POSITIONS. Writing by
        # position (.iat) stays correct even when labels_df does not have a
        # default 0..n-1 index; feeding enumerate() positions to the
        # label-based .at is only correct for such an index.
        is_hit = (icsd_column == icsd_id).to_numpy(dtype=bool, na_value=False)
        hit_positions = np.flatnonzero(is_hit)
        if len(hit_positions) == 0:
            continue

        # One entry per matching row, so len(match) counts labeled rows.
        match.extend([icsd_id] * len(hit_positions))

        # The conductivity is parsed only for matched entries, so malformed
        # values on unmatched CSV rows are never touched.
        raw_value = row[1].strip()
        if raw_value.startswith('<'):
            conductivity = UPPER_BOUND_CONDUCTIVITY
        else:
            conductivity = float(raw_value)

        for position in hit_positions:
            labels_df.iat[position, conductivity_col] = conductivity

print('{} out of {} inputs are labeled'.format(len(match), len(all_labels)))

226 out of 344 inputs are labeled


In [11]:
# Show only the structures that were assigned a conductivity value
has_conductivity = labels_df['conductivity'].notnull()
labels_df[has_conductivity]

Unnamed: 0,composition,ICSD_ID,MP_ID,bandgap,e_hull,conductivity,name
17498,Li28 Nd10 Si22 N38 O14 F4,262923,,,,0.0,
17508,Li12 P4 S16,35018,,,,0.0,
17575,Li1 La1 Nb2 O7,72566,,,,0.0,
17658,Li2 P2 S6,253894,,,,0.0,
17688,Li3 N1,26540,,,,0.0012,
...,...,...,...,...,...,...,...
25119,Li4 La4 O8,239278,mp-1020057,3.8908,0.021902,0.0,
25132,Li3 N1,156894,mp-2251,0.9986,0.000000,0.0003,
25138,Li12 B4 N8,155126,mp-5914,3.4434,0.003077,0.0,
25141,Li1 Sm9 Si6 O26,83279,,,,0.0,


### 2d. Save labels that do not appear in the dataframe

In [12]:
# ICSD IDs from the label file that matched no row of labels_df
matched_ids = set(match)
missing_labels = [icsd_id for icsd_id in all_labels if icsd_id not in matched_ids]

### 2e. Make pretty composition outputs for naming the structures

In [13]:
def pretty_compositions(composition):
    """Return `composition` with its digits rendered as Unicode subscripts.

    Each of the characters 0-9 is replaced by the corresponding subscript
    digit, and '.' is replaced by ','. All other characters are unchanged.
    """
    plain_chars = "0123456789."
    subscript_chars = "₀₁₂₃₄₅₆₇₈₉,"
    # Pair the characters up one-to-one to build the translation table
    translation = str.maketrans(dict(zip(plain_chars, subscript_chars)))
    return composition.translate(translation)


# Fill 'name' with the subscripted form of every composition string
# (progress_apply behaves like apply but shows a tqdm progress bar).
composition_column = labels_df['composition']
labels_df['name'] = composition_column.progress_apply(pretty_compositions)

  0%|          | 0/25193 [00:00<?, ?it/s]

In [14]:
# Show every labeled row without permanently altering the pandas display
# configuration: option_context restores 'display.max_rows' on exit, so later
# cells are not affected.
# `display` is the rich-output function the IPython/Jupyter kernel provides;
# it is needed because a `with` block has no implicit cell output.
with pd.option_context('display.max_rows', None):
    display(labels_df[labels_df['conductivity'].notnull()])

Unnamed: 0,composition,ICSD_ID,MP_ID,bandgap,e_hull,conductivity,name
17498,Li28 Nd10 Si22 N38 O14 F4,262923,,,,0.0,Li₂₈ Nd₁₀ Si₂₂ N₃₈ O₁₄ F₄
17508,Li12 P4 S16,35018,,,,0.0,Li₁₂ P₄ S₁₆
17575,Li1 La1 Nb2 O7,72566,,,,0.0,Li₁ La₁ Nb₂ O₇
17658,Li2 P2 S6,253894,,,,0.0,Li₂ P₂ S₆
17688,Li3 N1,26540,,,,0.0012,Li₃ N₁
17719,Li12 Zr4 O14,73835,mp-5418,3.9436,0.0,0.0,Li₁₂ Zr₄ O₁₄
17725,La3 Mg2 W2 O12,151901,,,,7e-06,La₃ Mg₂ W₂ O₁₂
17730,Li20 Sn2 P4 S24,255750,,,,0.00398,Li₂₀ Sn₂ P₄ S₂₄
17737,Li14 V2 Ge2 O16,66576,,,,1.8e-05,Li₁₄ V₂ Ge₂ O₁₆
17765,Li2 Mg2 S2 O8 F2,281119,,,,0.0,Li₂ Mg₂ S₂ O₈ F₂


### 2f. Save and load the labeled data

In [27]:
# Persist labels_df to disk as a pickle.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/labels_df.pkl')
# The context manager flushes and closes the file even if pickling raises.
with open(save_path, 'wb') as save_file:
    pickle.dump(labels_df, save_file)

In [28]:
# Reload labels_df from the pickle written by the save cell.
# NOTE(security): pickle.load can execute arbitrary code -- only unpickle files
# that come from a trusted source.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/labels_df.pkl')
# The context manager closes the file even if unpickling raises.
with open(save_path, 'rb') as open_file:
    labels_df = pickle.load(open_file)

### 2g. Load the BVSE labels and add them into labels_df

In [17]:
# Add an empty (all-None) 'BVSE' column; the loop over BVSE_results further
# down fills it in for the rows that have a matching ICSD_ID or MP_ID.
labels_df["BVSE"] = None

In [18]:
# Preview the first rows to confirm the 'BVSE' column exists (head() defaults to 5 rows).
labels_df.head()

Unnamed: 0,composition,ICSD_ID,MP_ID,bandgap,e_hull,conductivity,name,BVSE
0,Li1 O3,180565,mp-1001790,0.0854,0.22542,,Li₁ O₃,
1,Li2 Be2,188829,mp-1001825,0.0,0.166972,,Li₂ Be₂,
2,Li2 B2,236959,mp-1001831,1.4331,0.386054,,Li₂ B₂,
3,Li1 F1,184904,mp-1009009,7.5195,0.287542,,Li₁ F₁,
4,Li2 O4,180561,mp-1018789,0.0,0.084218,,Li₂ O₄,


In [20]:
# Load the pickled BVSE results.
# NOTE(security): pickle.load can execute arbitrary code -- only unpickle files
# that come from a trusted source.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/Li_BVSE_results.pkl')
# The context manager closes the file even if unpickling raises.
with open(save_path, 'rb') as open_file:
    BVSE_results = pickle.load(open_file)

In [21]:
# Copy the BVSE value of every result row into labels_df['BVSE'] for all rows
# whose ICSD_ID or MP_ID matches that result.
#
# Only the 'BVSE' column is written in this loop, so the identifier columns and
# the target column position can be looked up once up front.
id_columns = {'ICSD_ID': labels_df['ICSD_ID'], 'MP_ID': labels_df['MP_ID']}
bvse_col = labels_df.columns.get_loc('BVSE')

for row in tqdm.tqdm(BVSE_results, desc='Add BVSE labels to dataframe'):
    # row[1] is compared with ICSD_ID, row[2] with MP_ID, and row[3] is the
    # value stored. ICSD_ID is handled before MP_ID, and later result rows
    # overwrite earlier ones.
    for column_name, identifier in (('ICSD_ID', row[1]), ('MP_ID', row[2])):
        # Skip results that carry no (falsy) identifier of this kind.
        if not identifier:
            continue

        # Vectorized comparison yields the matching row POSITIONS. Writing by
        # position (.iat) stays correct even when labels_df does not have a
        # default 0..n-1 index; feeding enumerate() positions to the
        # label-based .at is only correct for such an index.
        is_hit = (id_columns[column_name] == identifier).to_numpy(dtype=bool, na_value=False)
        for position in np.flatnonzero(is_hit):
            labels_df.iat[position, bvse_col] = row[3]

Add BVSE labels to dataframe:   0%|          | 0/6836 [00:00<?, ?it/s]

In [22]:
# Count the rows whose 'BVSE' entry has been filled in
bvse_labeled_rows = labels_df[labels_df['BVSE'].notnull()]
print("{} rows have been labeled with BVSE data.".format(len(bvse_labeled_rows)))

6845 rows have been labeled with BVSE data.


### 2h. Save labels_df with BVSE results

In [23]:
# Persist labels_df (now including the 'BVSE' column) to disk as a pickle.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/labeled_data_BVSE.pkl')
# The context manager flushes and closes the file even if pickling raises.
with open(save_path, 'wb') as save_file:
    pickle.dump(labels_df, save_file)

In [24]:
# (rows, columns) of the source structures table, for comparison with labels_df.
structures_df.shape

(25193, 19)

In [25]:
# (rows, columns) of the label table; labels_df was built by selecting columns
# from structures_df, so the row count is expected to equal structures_df's.
labels_df.shape

(25193, 8)