In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [2]:
# The data folder sits next to the directory two levels above the current
# working directory.
_grandparent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
DATA_DIR = os.path.join(_grandparent_dir, "data/")

# Loading Data

In [3]:
# Base names (without the ".csv" extension) of the per-structure data files.
structure_names = [
    "Ge-1_Se-1",
    "Ge-1_Te-1",
]

In [4]:
# Read one CSV per structure (first CSV column becomes the index) and stack
# the frames row-wise into a single DataFrame.
# NOTE(review): each file keeps its own index, so index labels may repeat
# across structures after the concat - confirm this is intended.
df = pd.concat(
    [
        pd.read_csv(os.path.join(DATA_DIR, name + ".csv"), index_col=0)
        for name in structure_names
    ]
)

In [5]:
# Partition the columns of `df` by role.
cols_raw = list(df.columns)
cols_trash = ['converged', 'accuracy']                  # discarded from the column list below
cols_independent = ['ecutrho', 'k_density', 'ecutwfc']
cols_drop = cols_trash + cols_independent

# Fail early and name the offending columns, instead of the opaque
# "list.remove(x): x not in list" error raised by a manual remove loop.
cols_missing = [col for col in cols_drop if col not in cols_raw]
if cols_missing:
    raise ValueError(f"Expected columns are missing from the data: {cols_missing}")

# Every remaining column, kept in its original order.
cols_dependent = [col for col in cols_raw if col not in cols_drop]

In [6]:
# Model inputs are the "dependent" columns; the targets are the "independent"
# columns.
# NOTE(review): presumably a deliberate inverse model (predicting the
# independent settings from the dependent quantities) - confirm.
X_raw, y_raw = df[cols_dependent], df[cols_independent]

In [7]:
# Display summary statistics for every input column.
X_raw.describe()

Unnamed: 0,delta_E,Ag,Al,Ar,As,Au,B,Ba,Be,Bi,...,Ti,Tl,Tm,V,W,Xe,Y,Yb,Zn,Zr
count,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,...,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0
mean,0.087288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.193152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,-0.015443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.001996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.132741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.886049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train-Test-Split

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)

# Model definitions

### Linear Model with feature augmentation

In [9]:
def _exp_decay(x):
    """Return ``exp(-x)`` computed element-wise.

    Written as a named module-level function rather than a lambda because a
    FunctionTransformer wrapping a lambda cannot be pickled, which would
    prevent the fitted pipeline from being saved.
    """
    return np.exp(-x)


# Steps, in order: standardise the inputs -> expand to degree-2 polynomial
# features -> apply exp(-x) -> standardise again -> ordinary least squares.
linear_augmented_model = Pipeline([
    ('scaler_init', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('exp_decay', FunctionTransformer(_exp_decay)),
    ('scaler_final', StandardScaler()),
    ('regressor', LinearRegression())
])

### Packing them together to reduce code size

In [10]:
# Display name -> estimator, so the loops below handle every model the same way.
models = {
    "Augmented Linear Regression": linear_augmented_model,
}

# Model training

In [11]:
# Fit every registered model on the training split, logging progress.
for model_name in models:
    model = models[model_name]
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    print("Done!\n")

Training Augmented Linear Regression...
Done!



# Model evaluation

In [17]:
# Report train/test error for every registered model.
# NOTE: with multi-column targets, both metrics use scikit-learn's default
# multioutput='uniform_average', i.e. one number averaged over all targets.
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    # Predict once per split and reuse the predictions for both metrics.
    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

    # Mean squared error.
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    print(f"MSE:\ttrain:{mse_train:.4E}\ttest:{mse_test:.4E}")

    # Mean absolute percentage error.
    mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
    mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
    print(f"MAPE:\ttrain:{mape_train:.4E}\ttest:{mape_test:.4E}")

    print("Done!\n")

Evaluating Augmented Linear Regression...
MSE:	train:2.8144E+03	test:2.8991E+03
MAPE:	train:3.9545E-01	test:3.9737E-01
Done!



# Let's see how bad the predictions are...

In [None]:
# Predict the test split with the augmented linear model so individual
# predictions can be inspected.
y_pred_test = linear_augmented_model.predict(X_test)

In [15]:
# Show the first ten predictions.
y_pred_test[:10]

array([[2.38625000e+02, 2.19863892e-01, 5.77187500e+01],
       [2.45250000e+02, 1.62963867e-01, 6.47031250e+01],
       [2.45375000e+02, 1.63726807e-01, 6.46875000e+01],
       [2.49875000e+02, 2.25067139e-01, 6.37500000e+01],
       [2.45750000e+02, 1.68273926e-01, 6.46406250e+01],
       [2.54875000e+02, 4.25262451e-01, 5.41250000e+01],
       [2.45625000e+02, 1.66290283e-01, 6.46718750e+01],
       [2.43875000e+02, 4.14916992e-01, 4.88125000e+01],
       [2.45625000e+02, 1.65588379e-01, 6.46718750e+01],
       [2.33875000e+02, 1.55654907e-01, 5.86406250e+01]])

In [16]:
# Show the first ten true target rows.
y_test.head(10)

Unnamed: 0,ecutrho,k_density,ecutwfc
92,180,0.166667,30
582,340,0.25,90
548,340,0.166667,85
113,340,0.125,30
174,180,0.1,40
420,260,0.5,70
309,300,0.1,55
337,260,0.5,60
413,180,0.125,70
393,380,0.25,65
