In [None]:
import csv
import pandas as pd

numerical_df = pd.read_csv("../dataset/cleaned_numerical_data.csv")

numerical_df.info()

feature_names = ["nsites", "nelements", "is_gap_direct", "cbm",
                 "energy_per_atom", "is_metal","efermi" ,"vbm" ,"formation_energy_per_atom","density"  ]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155361 entries, 0 to 155360
Data columns (total 45 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   nsites                                        155361 non-null  int64  
 1   nelements                                     155361 non-null  int64  
 2   volume                                        155361 non-null  float64
 3   density                                       155361 non-null  float64
 4   density_atomic                                155361 non-null  float64
 5   deprecated                                    155361 non-null  int64  
 6   energy_per_atom                               152149 non-null  float64
 7   formation_energy_per_atom                     152149 non-null  float64
 8   energy_above_hull                             152149 non-null  float64
 9   is_stable                                     15

In [65]:
#split data into real and theoretical subsets model on real values first

numerical_df.drop(['deprecated'], axis=1, inplace=True)

real_df = numerical_df[numerical_df['theoretical'] == 0].copy()
theoretical_df = numerical_df[numerical_df['theoretical'] == 1].copy()

real_df.drop(['theoretical'], axis=1, inplace=True)
theoretical_df.drop(['theoretical'], axis=1, inplace=True)

print("Real df shape:", real_df.shape)
print("Theoretical df shape:", theoretical_df.shape)

Real df shape: (49772, 43)
Theoretical df shape: (105589, 43)


In [66]:
real_df['band_gap'].describe()

#Note, band_gap is not normally distributed and is skewed right

count    49772.000000
mean         1.341899
std          1.739477
min          0.000000
25%          0.000000
50%          0.265150
75%          2.443400
max         17.891400
Name: band_gap, dtype: float64

In [67]:
real_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49772 entries, 36 to 155336
Data columns (total 43 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   nsites                                        49772 non-null  int64  
 1   nelements                                     49772 non-null  int64  
 2   volume                                        49772 non-null  float64
 3   density                                       49772 non-null  float64
 4   density_atomic                                49772 non-null  float64
 5   energy_per_atom                               48674 non-null  float64
 6   formation_energy_per_atom                     48674 non-null  float64
 7   energy_above_hull                             48674 non-null  float64
 8   is_stable                                     49772 non-null  int64  
 9   equilibrium_reaction_energy_per_atom          23056 non-null  fl

In [68]:
#Train a model for band_gap on real molecules

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

# Create the list of features to train with below
feature_names = ["nsites", "nelements", "is_gap_direct", "cbm",
                 "energy_per_atom", "is_metal","efermi" ,"vbm" ,"formation_energy_per_atom","density"  ]

# Select data corresponding to features in feature_names
X = real_df[feature_names]

y = real_df["band_gap"]

train_X, test_x, train_y, test_y = train_test_split(X, y, train_size = 0.2, random_state=1)

dtr_model = DecisionTreeRegressor(random_state=1)

dtr_model.fit(train_X, train_y)

In [69]:
predictions = dtr_model.predict(test_x)

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(test_y, predictions)

# mean squared error (MSE) and R-squared (R2)
mse = mean_squared_error(test_y, predictions)
r2 = r2_score(test_y, predictions)

print("Mean Squared Error: ", mse)
print("R-squared: ", r2)
print("Mean Absolute Error: ", mae)

Mean Squared Error:  0.03571236804937466
R-squared:  0.9881754323521292
Mean Absolute Error:  0.0716124139836255


In [70]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

#Check for and handle categorical variables
label_encoder = LabelEncoder()

# Fitting Random Forest Regression to the dataset
regressor = RandomForestRegressor(n_estimators=20, random_state=42, oob_score=True)
# train_X, val_X, train_y, val_y
regressor.fit(train_X, train_y)
# Fit the regressor with x and y data

  warn(


In [71]:
# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

# Access the OOB Score
# OOB Score is the number of correctly predicted data on OOB samples taken for validation, you want less than 1%
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')
# train_X, test_x, train_y, test_y
# Making predictions on the same data or new data
predictions = regressor.predict(test_x)

# Evaluating the model
mse = mean_squared_error(test_y, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(test_y, predictions)
print(f'R-squared: {r2}')

# using mae, mean absolute error, or the average error, we can see on average how far off our # model is

mae = mean_absolute_error(test_y, predictions)
print("Mean Absolute Error: ", mae)

Out-of-Bag Score: 0.9962463268603695
Mean Squared Error: 0.019596501850428202
R-squared: 0.9935114870716037
Mean Absolute Error:  0.035769111457129944


In [72]:
theoretical_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105589 entries, 0 to 155360
Data columns (total 43 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   nsites                                        105589 non-null  int64  
 1   nelements                                     105589 non-null  int64  
 2   volume                                        105589 non-null  float64
 3   density                                       105589 non-null  float64
 4   density_atomic                                105589 non-null  float64
 5   energy_per_atom                               103475 non-null  float64
 6   formation_energy_per_atom                     103475 non-null  float64
 7   energy_above_hull                             103475 non-null  float64
 8   is_stable                                     105589 non-null  int64  
 9   equilibrium_reaction_energy_per_atom          10675 n

In [73]:
# Replace NaN values in the entire DataFrame with the mean of each column
theoretical_df[feature_names] = theoretical_df[feature_names].fillna(theoretical_df[feature_names].mean())
theoretical_df = theoretical_df.dropna(subset=['band_gap'])

In [74]:
#Now predict band-gap with the theoretical elements with Random Forrest Regressor

# theoretical_df.info()

theoretical_x = theoretical_df[feature_names]
theoretical_y = theoretical_df['band_gap']

theoretical_predictions = regressor.predict(theoretical_x)

# Evaluating the model
mse = mean_squared_error(theoretical_y, theoretical_predictions)
mae = mean_absolute_error(theoretical_y, theoretical_predictions)
r2 = r2_score(theoretical_y, theoretical_predictions)

print(f'Mean Squared Error: {mse}')
print("Mean Absolute Error: ", mae)
print(f'R-squared: {r2}')

Mean Squared Error: 0.015797519889250905
Mean Absolute Error:  0.0394581569600591
R-squared: 0.9911788506131677
