In [None]:
#ライブラリ読み込み
#データ読み込み
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sys import argv
import joblib
from datetime import datetime, time

# Let pandas show up to 200 columns and 200 rows before truncating a displayed DataFrame.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

import matplotlib.pyplot as plt
import seaborn as sns


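# Data-cleaning notes for the source dataset:
# - Element symbols that were written with a stray space: Y and S.
# - Element symbol with a capitalisation clash: "LI".
# - Several entries had an atom count of 0; they were removed.
# - An unknown element "Yo" appeared; it was removed.
# - An anomalous value O6050 exists, but it was judged not to affect data generation.
# - Nd185 was corrected to Nd1.85 (neighbouring records had Nd1.85).
# - O1013 was corrected to O10.13 (neighbouring records had O10.XX).
# - Cu285 was corrected to Cu2.85 (neighbouring records had Cu2.85).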

In [None]:
# Load the pre-built dataset from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load "alldataCu.pkl" from a trusted source.
alldataCu = pd.read_pickle("alldataCu.pkl")

In [None]:
# Total number of columns in the loaded table.
lenofcolumns = alldataCu.shape[1]

In [None]:
# Split the table into explanatory variables (X) and the target "tc" (y).
# Both sides are selected by column name: slicing X by position ("all but the last
# column") would leak the target into X whenever "tc" is not the last column, and
# indexing with [["tc"]] raises KeyError instead of silently yielding an empty target.
alldataCu_X = alldataCu.drop(columns=["tc"])
alldataCu_y = alldataCu[["tc"]].values  # 2-D array with a single "tc" column
display(alldataCu_X)
print(alldataCu_y)

In [None]:
# Normalise the explanatory variables.
# Every copper-oxide compound contains oxygen, so the "Oarinashi" column is dropped.
# errors="ignore" keeps this cell re-runnable: on a second run the column is already
# gone and a plain drop would raise KeyError.
alldataCu_X = alldataCu_X.drop(columns=["Oarinashi"], errors="ignore")

# Min-max scale each column independently (axis="index");
# switching to axis="columns" would scale each row instead.
# NOTE(review): min/max are taken over all rows, including the rows later used as
# test data — consider computing them on the training rows only.
scaled_alldataCu_X = alldataCu_X.apply(lambda x: (x-x.min())/(x.max() - x.min()), axis="index")

# Replace every missing value with 0 (this also covers the NaN produced by 0/0
# when a column's max equals its min).
scaled_alldataCu_X = scaled_alldataCu_X.fillna(0)
display(scaled_alldataCu_X)

In [None]:
# Scale the target into the [0, 1] range. The fitted scaler is kept in `yscaler`
# so that values can be mapped back to the original scale with inverse_transform.
from sklearn.preprocessing import MinMaxScaler
yscaler = MinMaxScaler(feature_range=(0, 1))
yscaler.fit(alldataCu_y)
scaled_alldataCu_y = yscaler.transform(alldataCu_y)
print(scaled_alldataCu_y)

In [None]:
# Take the total row count and reserve the first 80% of the rows for training.
training_size = int(len(scaled_alldataCu_X) * 0.80)
print("training size:", training_size)

In [None]:
# Training split: the first `training_size` rows of the features and of the target,
# followed by a check that both have the same length.
# NOTE(review): the split is positional (no shuffling) — confirm the row order is not meaningful.
trainx_df = scaled_alldataCu_X.iloc[:training_size]
y_train = scaled_alldataCu_y[:training_size]
print(trainx_df.shape[0])
print(y_train.shape[0])

In [None]:
# Test split: every row after the first `training_size` rows,
# followed by a check that features and target have the same length.
testx_df = scaled_alldataCu_X.iloc[training_size:]
y_test = scaled_alldataCu_y[training_size:]
print(testx_df.shape[0])
print(y_test.shape[0])

In [None]:
# Visual check of the scaled training features.
display(trainx_df)

In [None]:
# Convert the feature DataFrames (training and test) into TensorFlow tensors.
x_train, x_test = (tf.convert_to_tensor(frame) for frame in (trainx_df, testx_df))

In [None]:
# Print the training tensor, then the test tensor, one after the other.
print(x_train, x_test, sep="\n")

In [None]:
# Show the shape of the training feature tensor.
x_train.shape

In [None]:
# Show the shape of the training target array.
y_train.shape

In [None]:
# Show the shape of the test feature tensor.
x_test.shape

In [None]:
#import keras libraries for the model
import math
from keras.models import Sequential
from keras.layers import Dense,Activation,Input
from keras.utils.vis_utils import plot_model
from keras.metrics import RootMeanSquaredError as rmse
from keras import optimizers
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Global training hyperparameters used by the fit / evaluate cells.
epochs, batch_size = 100, 8

In [None]:
# Define and compile the regression network.
# Seed NumPy and TensorFlow first so that weight initialisation and the rest of
# the stochastic training are repeatable between runs of the notebook.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fully connected network: one input per feature column, three hidden layers of
# 128 ReLU units, and a single linear output unit for the (scaled) target.
model = Sequential()
model.add(Input(shape=(len(trainx_df.columns),)))
model.add(Dense(units=128, activation="relu"))
model.add(Dense(units=128, activation="relu"))
model.add(Dense(units=128, activation="relu"))
model.add(Dense(units=1, activation="linear"))

# Adam optimiser, mean-squared-error loss, RMSE reported as an extra metric.
model.compile(optimizer='adam', loss= 'mse' , metrics = [rmse()])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train the model and measure how long fitting takes.
# `time` is imported here because the first cell binds the name `time` to
# `datetime.time` (`from datetime import datetime, time`); this import rebinds it
# to the standard `time` module.
import time
time_calc_start = time.time()

# Fit with the hyperparameters from the global-variables cell. `epochs` is NOT
# re-assigned here, so changing it in that cell actually takes effect.
# The test split is passed as validation data, so per-epoch validation metrics
# are computed on it.
history=model.fit(x_train,y_train,epochs=epochs,validation_data=(x_test,y_test),batch_size=batch_size,verbose=1)

# Elapsed wall-clock time of the fit, in seconds.
fit_time_seconds = time.time() - time_calc_start
print("fit time =",fit_time_seconds," seconds.")

In [None]:
# Print the model summary again after training.
model.summary()

In [None]:
# Evaluate the trained model on the training split.
model.evaluate(x=x_train, y=y_train, batch_size=batch_size)

In [None]:
# Evaluate the trained model on the test split.
model.evaluate(x=x_test, y=y_test, batch_size=batch_size)

In [None]:
# Predict on the training inputs and reshape the true targets into a single column.
train_predict = model.predict(x=x_train)
plot_y_train = y_train.reshape(-1, 1)

# R2 score between true and predicted values, both mapped back to the
# original (unscaled) target range.
r2train = r2_score(
    y_true=yscaler.inverse_transform(plot_y_train),
    y_pred=yscaler.inverse_transform(train_predict),
)

# Show the R2 score.
print("R2スコア:", r2train)

In [None]:
# Default figure size for the plots below: 8 x 8 inches.
plt.rc("figure", figsize=(8.0, 8.0))

In [None]:
# Actual vs. predicted tc for the training split, on the original scale.
plt.scatter(yscaler.inverse_transform(plot_y_train), yscaler.inverse_transform(train_predict))
plt.gca().set(
    title="Cu train",
    xlabel="tc",
    ylabel="tc_pred",
    xlim=(-10, 300),  # x-axis range
    ylim=(-10, 200),  # y-axis range
)
plt.show()

In [None]:
# Predict on the test inputs and reshape the true targets into a single column.
test_predict = model.predict(x=x_test)
plot_y_test = y_test.reshape(-1, 1)

# R2 score between true and predicted values, both mapped back to the
# original (unscaled) target range.
r2test = r2_score(
    y_true=yscaler.inverse_transform(plot_y_test),
    y_pred=yscaler.inverse_transform(test_predict),
)

# Show the R2 score.
print("R2スコア:", r2test)

In [None]:
# Actual vs. predicted tc for the test split, on the original scale.
plt.scatter(yscaler.inverse_transform(plot_y_test), yscaler.inverse_transform(test_predict))
plt.gca().set(
    title="Cu test",
    xlabel="tc",
    ylabel="tc_pred",
    xlim=(-10, 200),  # x-axis range
    ylim=(-10, 200),  # y-axis range
)
plt.show()

In [None]:
# Collect the test-set ground truth and predictions (original scale) side by side
# in one DataFrame with the columns "tc" and "tc_pred".
ytestactual = pd.DataFrame(data=yscaler.inverse_transform(plot_y_test), columns=["tc"])
ypredicted = pd.DataFrame(data=yscaler.inverse_transform(test_predict), columns=["tc_pred"])

resultdf = pd.concat((ytestactual, ypredicted), axis=1)

In [None]:
# Summary figure for the test set: actual vs. predicted tc with a fitted
# regression line, annotated with slope, R2 score and mean absolute error.
mae = mean_absolute_error(resultdf["tc"], resultdf["tc_pred"])

# Slope of the least-squares line tc_pred = slope * tc + intercept.
# The annotation labelled "傾き" (slope) previously showed the Pearson correlation
# coefficient, which is a different quantity.
slope = float(np.polyfit(resultdf["tc"], resultdf["tc_pred"], 1)[0])

# Scatter plot with a regression line and no confidence band.
sns.lmplot(x = "tc", y = "tc_pred", data=resultdf, markers = ".", height=6, aspect=1.5, ci=None)

# Annotations: slope, R2 score, mean absolute error.
# NOTE(review): the Japanese labels presumably need a matplotlib font with
# Japanese glyphs to render correctly — confirm in the target environment.
plt.annotate("傾き          = {}".format(round(slope, 3)), xy=(0, 0), xytext=(-5, 100), fontsize=24)
plt.annotate("R2スコア　    = {}".format(round(r2test, 3)), xy=(0,0), xytext=(-5, 110), fontsize=24)
plt.annotate("平均絶対誤差　= {}".format(round(mae, 3)), xy=(0,0), xytext=(-5, 120), fontsize=24)

plt.title("NN　銅酸化物系超伝導体", fontsize=24)

# Show the figure.
plt.show()