In [1]:
import os
import numpy as np
import pandas as pd
import sys

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

sys.path.append(os.path.dirname(os.getcwd()))
from tools.utils import encode_all_structures, Encoding

In [2]:
# Data directory: "data/" under the directory two levels above the current
# working directory.
_two_levels_up = os.path.dirname(os.path.dirname(os.getcwd()))
DATA_DIR = os.path.join(_two_levels_up, "data/")

# Loading Data

In [3]:
# Encoding scheme handed to encode_all_structures below.
encoding = Encoding.COLUMN_MASS

# Read data.csv (first column becomes the index, NA detection disabled) and
# pass the resulting frame straight through the structure encoder.
df = encode_all_structures(
    pd.read_csv(
        os.path.join(DATA_DIR, "data.csv"),
        index_col=0,
        na_filter=False,
    ),
    encoding,
)

In [4]:
# Column bookkeeping: split the frame's columns into columns that are
# discarded, the three parameter columns, and everything that remains.
cols_raw = list(df.columns)
cols_trash = ["structure", "converged", "accuracy", "n_iterations", "time", "fermi", "total_energy"]
# NOTE(review): presumably the input parameters of the underlying calculation
# (hence "independent"); the next cell uses them as the regression targets.
cols_independent = ["ecutrho", "k_density", "ecutwfc"]
cols_drop = cols_trash + cols_independent

# Fail early and name the offending columns, instead of the opaque
# "list.remove(x): x not in list" raised by removing them one at a time.
cols_missing = [col for col in cols_drop if col not in cols_raw]
if cols_missing:
    raise ValueError(f"Expected columns missing from the data: {cols_missing}")

# Everything that is neither discarded nor a parameter, in original column order.
cols_dependent = [col for col in cols_raw if col not in cols_drop]

In [5]:
# Model inputs are the remaining ("dependent") columns; the targets are the
# three parameter columns listed in cols_independent.
X_raw, y_raw = df[cols_dependent], df[cols_independent]

In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) of the input columns.
X_raw.describe()

Unnamed: 0,delta_E,Lant,PTC1,PTC2,PTC3,PTC4,PTC5,PTC6,PTC7,PTC8,PTC9,PTC10,PTC11,PTC12,PTC13,PTC14,PTC15,PTC16,PTC17,PTC18
count,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0,56309.0
mean,0.047412,0.149177,0.031173,0.04324,0.022388,0.023752,0.031421,0.015835,0.026956,0.097564,0.05477,0.056758,0.06261,0.037938,0.046023,0.049814,0.091317,0.060502,0.024574,0.0
std,0.213111,0.287443,0.133047,0.133163,0.10728,0.107936,0.123172,0.094961,0.132098,0.252972,0.183287,0.190763,0.188731,0.146247,0.161086,0.138497,0.206135,0.166459,0.127998,0.0
min,-0.447845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.000707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.009537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.6,0.924714,0.733997,0.732088,0.741626,0.841555,0.885522,0.783026,0.890822,0.982272,0.876908,0.983118,0.89981,0.803422,0.967155,0.808676,0.918451,0.839999,0.833172,0.0


# Train-Test Split

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)

# Model definitions

### Linear Model with feature augmentation

In [8]:
def _exp_decay(x):
    """Return the element-wise exponential decay ``exp(-x)``.

    Defined as a named module-level function rather than a lambda: a lambda
    cannot be serialized by the standard ``pickle`` module, so a pipeline
    containing one could not be saved with pickle/joblib.

    Parameters
    ----------
    x : array-like
        Values to transform (inside the pipeline this is the output of the
        polynomial feature expansion).

    Returns
    -------
    Same shape as ``x``, with ``np.exp(-x)`` applied element-wise.
    """
    return np.exp(-x)


# Standardize -> degree-2 polynomial features -> exp(-x) -> standardize again
# -> ordinary least squares.
linear_augmented_model = Pipeline([
    ('scaler_init', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('exp_decay', FunctionTransformer(_exp_decay)),
    ('scaler_final', StandardScaler()),
    ('regressor', LinearRegression())
])

### Collecting the models in a dict so training and evaluation can loop over them

In [9]:
# Models keyed by display name; the training and evaluation cells iterate
# over this mapping.
models = {
    "Augmented Linear Regression": linear_augmented_model,
}

# Model training

In [10]:
# Fit each model in `models` on the training split, printing progress.
for model_name in models:
    model = models[model_name]
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    print("Done!\n")

Training Augmented Linear Regression...
Done!



# Model evaluation

In [11]:
# Report train/test error for each model in `models`.
# NOTE(review): the targets have three columns on very different scales, and
# both metrics use scikit-learn's default uniform average across outputs, so
# the MSE is dominated by the largest-valued target. Confirm that this
# aggregate is what should be reported.
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    # Predict once per split; both metrics reuse these predictions.
    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

    # Mean squared error on each split.
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    print(f"MSE:\ttrain:{mse_train:.4E}\ttest:{mse_test:.4E}")

    # Mean absolute percentage error on each split.
    mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
    mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
    print(f"MAPE:\ttrain:{mape_train:.4E}\ttest:{mape_test:.4E}")

    print("Done!\n")

Evaluating Augmented Linear Regression...
MSE:	train:2.7734E+03	test:2.8012E+03
MAPE:	train:4.6358E-01	test:4.6674E-01
Done!



# Let's see how bad the predictions are...

In [12]:
# Recompute the test-split predictions of the augmented linear model so they
# can be inspected directly.
y_pred_test = linear_augmented_model.predict(X_test)

In [13]:
# First ten predicted rows; columns follow the target column order
# (ecutrho, k_density, ecutwfc).
y_pred_test[:10]

array([[242.63217076,   5.12250605,  29.71237338],
       [242.78451451,   6.11567012,  59.40524447],
       [245.94969213,   6.01568236,  59.3234917 ],
       [243.20443639,   6.16541377,  62.04635776],
       [247.61849889,   5.92298213,  59.07370151],
       [245.08822545,   5.85334101,  65.82296908],
       [249.3785177 ,   5.57034329,  66.05525393],
       [259.04818639,   5.42932978,  17.34909213],
       [244.99740514,   5.84430781,  59.90182651],
       [246.41928014,   5.97684565,  69.80905307]])

In [14]:
# True target values for the first ten test rows, for comparison with the
# predictions shown in the previous cell.
y_test.head(10)

Unnamed: 0,ecutrho,k_density,ecutwfc
24997,300,8,20
11625,380,2,60
50381,140,2,100
15230,100,8,35
53052,140,6,30
40434,340,4,40
19633,180,6,80
47385,180,2,20
4524,380,4,45
18846,260,6,55
