In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import sys
sys.path.insert(1, "../src/")
from material import Material
from materialsdataset import MaterialsDataset
from periodictable import PeriodicTable
from materialspredictiondata import MaterialsPredictionData
from bandgapdataset import BandGapDataset
from bandgapdataframe import BandGapDataFrame

In [2]:
# Locations of the Materials Project training data, relative to this notebook.
training_root = "../data/training/"
csv_path = training_root + "materialsproject_output/"
json_path = training_root + "materialsproject_json/"

### MaterialsPredictionData class
- the user creates an object of this type to use the package
- takes in a MaterialsDataset type object which contains the materials and their params to use for prediction
- houses array of data to run through the model to predict the bandgap
- also houses information needed to create the correct training data based on the user input

In [3]:
# Two example materials: the first supplies density and volume, the second
# supplies density only.
Si3Ge4 = Material(formula="Si3Ge4", density=1.23, volume=57)
Cd22S45 = Material(formula="Cd22S45", density=2.34)

# Print each material's `training_params` attribute, one line per material.
for example_material in (Si3Ge4, Cd22S45):
    print(example_material.training_params)

['density__gm_per_cc', 'volume']
['density__gm_per_cc']


In [4]:
# Bundle the example materials into a single dataset object.
example_materials = [Si3Ge4, Cd22S45]
material_dataset = MaterialsDataset(example_materials)

In [5]:
# Inspect the entry stored for one material; the bare last expression is
# rendered as the cell output.
si3ge4_entry = material_dataset.materials_dict["Si3Ge4"]
si3ge4_entry

{'params': {'density__gm_per_cc': 1.23,
  'volume': 57,
  'formation_energy_eV': None},
 'stoichiometry': {'Si': 0.42857142857142855, 'Ge': 0.5714285714285714}}

In [6]:
# Build the periodic table helper and pull out the two lookups used later:
# the symbol -> element map and the list of element symbols.
periodic_table = PeriodicTable()
symbol_to_element_map, symbols = (
    periodic_table.symbol_to_element_map,
    periodic_table.symbols,
)

In [7]:
# Build the per-material prediction inputs from the dataset and the element
# symbols; the result exposes a `prediction_data` mapping keyed by formula
# (used in the cells below).
materials_prediction_data = MaterialsPredictionData(material_dataset,symbols)

In [17]:
# Show the flat feature vector that will be fed to the model for Si3Ge4.
si3ge4_features = materials_prediction_data.prediction_data["Si3Ge4"]
print(si3ge4_features)

[1.23, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.42857142857142855, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5714285714285714, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [9]:
# Formulas of every material that has prediction data, in stored order.
materials_to_predict = [
    formula for formula in materials_prediction_data.prediction_data.keys()
]
materials_to_predict

['Si3Ge4', 'Cd22S45']

### BandGapDataset class
- a unique one must be created for each material, as each material can have unique input parameters

In [40]:
# Train one Ridge regression per material (each material may supply a
# different set of input parameters, so the feature columns differ) and use it
# to predict that material's band gap.

# Fixed seed so the shuffled train/test split -- and therefore the reported
# score and the predicted band gap -- is the same on every Restart & Run All.
RANDOM_SEED = 42

for material in materials_to_predict:
    # A fresh training dataset is built for every material.
    dataset = BandGapDataset(csv_path,json_path)
    data_dict = dataset.data_dict
    data_IDs = dataset.data_IDs  # not used below; kept available in the notebook namespace

    # Only the parameters the user actually supplied (non-None) for this
    # material are used as training features.
    material_training_params = [
        param
        for (param, value) in material_dataset.materials_dict[material]["params"].items()
        if value is not None
    ]

    bandgap_dataframe = BandGapDataFrame(data_dict, symbols, material_training_params)
    dataframe = bandgap_dataframe.dataframe
    non_element_keys = bandgap_dataframe.non_element_keys

    print(f"{material} params: {non_element_keys[1:]}")

    # Feature columns are everything after the first two columns.
    # NOTE(review): presumably the first two columns are an identifier and the
    # 'band_gap__eV' target, and the remaining column order matches the layout
    # of `prediction_data[material]` -- confirm against BandGapDataFrame.
    X_keys = list(dataframe.keys())[2:]
    X = np.asarray(dataframe[X_keys])
    y = np.asarray(dataframe['band_gap__eV'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, shuffle=True, random_state=RANDOM_SEED
    )

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f" X_test shape: {np.shape(X_test)}")
    print(f" y_test shape: {np.shape(y_test)}")

    model = linear_model.Ridge(alpha=.5)
    model.fit(X_train, y_train)
    # For scikit-learn regressors, `score` is the R^2 on the held-out split.
    print(f"Model Score: {model.score(X_test, y_test)}")

    # The model expects a 2-D (n_samples, n_features) input, so the flat
    # feature vector is turned into a single-row matrix.
    this_prediction_data = np.asarray(
        materials_prediction_data.prediction_data[material]
    ).reshape(1, -1)

    band_gap_prediction = model.predict(this_prediction_data)
    print(f"{material}'s predicted bandgap = {band_gap_prediction} eV\n")

Si3Ge4 params: ['density__gm_per_cc', 'volume']
X_train shape: (1059, 120)
y_train shape: (1059,)
 X_test shape: (522, 120)
 y_test shape: (522,)
Model Score: 0.3267281511811151
Si3Ge4's predicted bandgap = [0.8039398] eV

Cd22S45 params: ['density__gm_per_cc']
X_train shape: (1059, 119)
y_train shape: (1059,)
 X_test shape: (522, 119)
 y_test shape: (522,)
Model Score: 0.2716404534025594
Cd22S45's predicted bandgap = [1.35876311] eV

