# Создание моделей - дефекты стали

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the dataset from a path relative to the notebook's working directory.
num_data = pd.read_csv("data/data.csv")

In [4]:
# Name of the label column (the column name itself contains quote characters).
target = "Class_b\'2\'"
# Mean of the label column; this equals the positive-class share only if the
# labels are encoded as 0/1 — TODO confirm the encoding.
pos = num_data[target].mean()
# Complement of `pos`.
neg = 1 - pos

In [5]:
# Half of the total cell count of the frame; not referenced by any other visible cell.
m = num_data.size / 2

# Inverse-share weights: the smaller a class's share, the larger its weight.
weight_for_0, weight_for_1 = 1 / neg, 1 / pos

# Label -> weight mapping, read as a module-level global by the first
# `model_train` definition in this notebook.
class_weight = dict(zip((0, 1), (weight_for_0, weight_for_1)))

print(weight_for_0, weight_for_1)

1.5307570977917981 2.884101040118871


## Поделить на train, test

In [6]:
!pip install keras_preprocessing 




[notice] A new release of pip is available: 23.1.2 -> 24.1.2
[notice] To update, run: C:\Users\kanad\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [7]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

from keras import models
from keras import layers
from keras import optimizers 
from keras import metrics

In [8]:
def stratify_sample(data, cut, target):
    """Return a random subsample holding ``1/cut`` of every class in ``data``.

    Rows are grouped by the ``target`` column and the same fraction is drawn
    from each group, so class proportions stay approximately the same.

    Args:
        data: DataFrame that contains the ``target`` column.
        cut: Strictly positive divisor; each group contributes ``1/cut`` of
            its rows.
        target: Name of the column to group by.

    Returns:
        DataFrame with the sampled rows. All columns (including ``target``)
        and the original index labels are kept.

    Raises:
        AssertionError: If ``cut`` is not strictly positive.
    """
    # Must be strictly positive: cut == 0 would otherwise reach 1 / cut and
    # fail with an unrelated ZeroDivisionError.
    assert cut > 0, "cut must be strictly positive"
    # GroupBy.sample draws within each group directly and always keeps the
    # grouping column, unlike groupby.apply, whose handling of grouping
    # columns is deprecated in recent pandas releases.
    return data.groupby(target).sample(frac=1 / cut)

In [9]:
def model_train(model, split, epochs, random_target=False):
    """Compile ``model`` and fit it on a prepared train/test split.

    Note: a later cell of this notebook defines another ``model_train`` with a
    different signature; once that cell has run, this version is shadowed.

    Args:
        model: Keras model to compile and train.
        split: ``(x_train, x_test, y_train, y_test)`` tuple.
        epochs: Number of epochs passed to ``fit``.
        random_target: When true, both label arrays are replaced by random
            0/1 integers from ``np.random.randint`` before fitting.

    Returns:
        ``(history, model)``: the object returned by ``fit`` and the model.
    """
    # Only optimizer and loss are passed to compile; no metrics are requested here.
    model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam())

    train_x, test_x, train_y, test_y = split

    if random_target:
        train_y = np.random.randint(2, size=len(train_y))
        test_y = np.random.randint(2, size=len(test_y))

    fit_options = dict(
        epochs=epochs,
        # Never request a batch larger than the training set.
        batch_size=min(32, len(train_x)),
        # The test part of the split doubles as validation data.
        validation_data=(test_x, test_y),
        # `class_weight` is the module-level dict defined earlier in the notebook.
        class_weight=class_weight,
    )
    history = model.fit(train_x, train_y, **fit_options)
    return history, model

In [10]:
def sample(data, cut, target, random_state=None):
    """Return a random subsample holding ``1/cut`` of every class in ``data``.

    Rows are grouped by the ``target`` column and the same fraction is drawn
    from each group, so class proportions stay approximately the same.

    Args:
        data: DataFrame that contains the ``target`` column.
        cut: Strictly positive divisor; each group contributes ``1/cut`` of
            its rows.
        target: Name of the column to group by.
        random_state: Optional seed (or generator) forwarded to pandas so the
            draw can be reproduced. Defaults to ``None`` (unseeded).

    Returns:
        DataFrame with the sampled rows. All columns (including ``target``)
        and the original index labels are kept.

    Raises:
        AssertionError: If ``cut`` is not strictly positive.
    """
    assert cut > 0, "cut must be strictly positive"
    # GroupBy.sample draws within each group directly and always keeps the
    # grouping column, unlike groupby.apply, whose handling of grouping
    # columns is deprecated in recent pandas releases.
    return data.groupby(target).sample(frac=1 / cut, random_state=random_state)

In [11]:
def get_train_test_split(data, target, cut, random_state = None):
    """Subsample ``data`` by ``cut`` and split the result 80/20 into arrays.

    Args:
        data: DataFrame holding the features and the ``target`` column.
        target: Name of the label column.
        cut: Divisor handed to ``sample``; ``1/cut`` of each class is kept.
        random_state: Forwarded to ``train_test_split`` only; the subsampling
            step performed by ``sample`` is not seeded by it.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as NumPy arrays, where the
        feature arrays contain every column except ``target``.
    """
    reduced = sample(data, cut, target)

    train_part, test_part = train_test_split(
        reduced, test_size=0.2, random_state=random_state
    )

    def _labels(frame):
        # Label column as a NumPy array.
        return np.array(frame[target])

    def _features(frame):
        # Every column except the label, as a NumPy array.
        return np.array(frame.drop(columns=[target]))

    return (
        _features(train_part),
        _features(test_part),
        _labels(train_part),
        _labels(test_part),
    )

## Модель сети

In [12]:
# Units per Dense layer, in order; make_model uses the last entry for the
# sigmoid output layer and all earlier entries for ReLU layers.
params = [16, 32, 32, 1]

In [13]:
# Number of columns minus one; presumably the one excluded column is the
# target, leaving the feature count — confirm no other non-feature columns exist.
input_size = len(num_data.columns) - 1

In [14]:
input_size

33

In [15]:
def make_model(input_size, parameters):
    """Build and compile a fully connected binary classifier.

    Args:
        input_size: Length of one input feature vector.
        parameters: Sequence of layer widths. The first entry is the input
            Dense layer, the entries strictly between first and last are
            further ReLU Dense layers, and the last entry is the width of the
            sigmoid output layer.

    Returns:
        A compiled ``Sequential`` model using Adam, binary cross-entropy and
        the accuracy / precision / recall metrics.
    """
    net = models.Sequential()

    # First layer also declares the input shape.
    net.add(layers.Dense(parameters[0], activation='relu', input_shape=(input_size,)))
    # Everything strictly between the first and the last width.
    for units in parameters[1:-1]:
        net.add(layers.Dense(units, activation="relu"))
    # Output layer.
    net.add(layers.Dense(parameters[-1], activation='sigmoid'))

    adam = optimizers.Adam()
    tracked = ["accuracy", metrics.Precision(), metrics.Recall()]
    net.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked)

    return net

In [16]:
# Total number of rows in the dataset; used below to report the sample size per cut.
volume = len(num_data)

In [17]:
# Divisors applied to the dataset: a cut c keeps 1/c of the rows of each class.
# NOTE(review): 248 breaks the otherwise power-of-two progression — possibly
# meant 256; confirm before changing, since saved model paths embed these values.
cuts = [1, 2, 4, 8, 16, 32, 64, 128, 248, 512]

In [18]:
# data_vol = [volume, 970, 647, 485, 388, 323, 277, ]
# cuts = list(map(lambda x: int(volume / x), data_vol))

In [19]:
# Show the divisors and the (truncated) number of rows each one leaves.
print("Cuts are {}".format(cuts))
print("Volume of data is {}".format([int(volume / divisor) for divisor in cuts]))

Cuts are [1, 2, 4, 8, 16, 32, 64, 128, 248, 512]
Volume of data is [1941, 970, 485, 242, 121, 60, 30, 15, 7, 3]


In [20]:
def get_model_directory(series, cut):
    """Return the directory (with trailing slash) for models of one series/cut pair."""
    return "models/{}_series/{}_cut/".format(series, cut)


def get_model_name(number):
    """Return the ``.h5`` file name of the model with the given number."""
    return "m_{}.h5".format(number)

## Логи

In [21]:
import datetime 
def log_preamble(log_file_name):
    """Append a run header to the log file and return the run's start time.

    The header records the start timestamp together with the module-level
    settings ``tries``, ``epochs``, ``series``, ``params`` and ``cuts``, all of
    which must be defined before this function is called.

    Args:
        log_file_name: Path of the text log; it is opened in append mode.

    Returns:
        datetime.datetime: The start timestamp, identical to the one written
        to the log, intended to be passed to ``log_final``.
    """
    # One timestamp serves both the log line and the return value, so the
    # logged start time matches the value later used to compute the run time.
    time_stamp = datetime.datetime.now()

    # The settings below are only read, so no `global` declaration is needed.
    with open(log_file_name, "a") as log_file:
        log_file.write(f"\n\nExecuted on time is {time_stamp}\n")
        log_file.write(f"Tries: {tries}, epochs: {epochs}, series = {series}\n")
        log_file.write(f"Network configuration is {params}\n")
        log_file.write(f"Cuts are {cuts}\n")
    return time_stamp

def log_final(log_file_name, time_stamp):
    """Append the completion footer for a run to the log file.

    Args:
        log_file_name: Path of the text log; it is opened in append mode.
        time_stamp: ``datetime`` marking the start of the run, as returned by
            ``log_preamble``.

    Returns:
        None. Two lines are appended: the finish time and the elapsed time in
        seconds since ``time_stamp``.
    """
    finished_at = datetime.datetime.now()
    elapsed_seconds = (finished_at - time_stamp).total_seconds()
    footer = (
        f"Finished successfully at {finished_at}\n"
        f"Total time = {elapsed_seconds}\n"
    )
    with open(log_file_name, "a") as log_file:
        log_file.write(footer)

## Обучение

In [22]:
# path to the model file
def model_name(series_num, cut_val, model_num):
    """Return the full path of a saved model: its directory followed by its file name."""
    directory = get_model_directory(series_num, cut_val)
    return directory + get_model_name(model_num)

In [23]:
def model_train(data_split, batch_size, epochs, 
                random_target=False):
    """Build a fresh model and train it with early stopping.

    A new network is created on every call through
    ``make_model(input_size, params)``, both of which are module-level values.
    No class weights are passed to ``fit`` in this version.

    Args:
        data_split: ``(x_train, x_test, y_train, y_test)`` tuple.
        batch_size: Requested batch size; capped at the training-set length.
        epochs: Maximum number of epochs.
        random_target: When true, both label arrays are replaced by random
            0/1 integers from ``np.random.randint`` before fitting.

    Returns:
        ``(history, model)``: the object returned by ``fit`` and the trained model.
    """
    network = make_model(input_size, params)

    # Stop once val_loss has not improved for 15 epochs and roll back to the
    # best weights seen.
    stopper = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=15,
        mode='auto',
        restore_best_weights=True,
        verbose=1,
    )

    train_x, test_x, train_y, test_y = data_split

    if random_target:
        train_y = np.random.randint(2, size=len(train_y))
        test_y = np.random.randint(2, size=len(test_y))

    # Never request a batch larger than the training set.
    effective_batch = min(batch_size, len(train_x))

    history = network.fit(
        train_x,
        train_y,
        epochs=epochs,
        batch_size=effective_batch,
        # The test part of the split doubles as validation data.
        validation_data=(test_x, test_y),
        callbacks=[stopper],
    )
    return history, network

In [24]:
!pip install keras==2.9.0




[notice] A new release of pip is available: 23.1.2 -> 24.1.2
[notice] To update, run: C:\Users\kanad\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [25]:
import os 
import pickle

# Experiment configuration. `series` values are identifiers that appear in the
# output paths of saved models and histories.
series = [2607] 
tries = 30 
epochs = 200
batch_size = 128 
log_file = "model_log_new.txt"
# Write the run header; `t` is the start timestamp handed to log_final below.
t = log_preamble(log_file)

for i, s in enumerate(series):
    for cut_number, cut in enumerate(cuts):
        # One split per (series, cut): all `tries` models below are trained
        # on this same split.
        split = get_train_test_split(num_data, target, cut)
        for number in range(tries):
            model_name_current = model_name(s, cut, number)
            history, model_to_save = model_train(split, batch_size, epochs) 
            
            model_to_save.save(model_name_current)
            # NOTE(review): history paths use the cut's index (cut_number)
            # while model paths use the cut value itself — confirm this
            # asymmetry is intended.
            filename = f"history/hdir_{s}/h_cut{cut_number}/num{number}"
            os.makedirs(os.path.dirname(filename), exist_ok=True) 
            with open(filename, "wb") as save_file: 
                # Only the `history.history` attribute is pickled, not the
                # object returned by fit itself.
                pickle.dump(history.history, save_file)

# Reached only if every training run above completed without raising.
log_final(log_file, t)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

## Графики

In [None]:
# def display_loss_graph(history):
#     history_dict = history.history
#     loss_values = history_dict['loss'] 
#     val_loss_values = history_dict['val_loss']
#     epochs = range(1, len(loss_values)+1)
#     plt.plot(epochs, loss_values, 'bo', label='Training loss')
#     plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
#     plt.title('Training and validation loss')
#     plt.xlabel('Epochs')
#     plt.ylabel('Loss')
#     plt.legend()
#     plt.show()

# def display_accuracy_graph(history):
#     history_dict = history.history
#     acc = history_dict["accuracy"]
#     val_acc = history_dict['val_accuracy']
#     epochs = range(1, len(acc)+1)
#     plt.plot(epochs, acc, 'bo', label='Training accuracy')
#     plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
#     plt.title('Training and validation accuracy')
#     plt.xlabel('Epochs')
#     plt.ylabel('Loss')
#     plt.legend()
#     plt.show()

In [None]:
# import pickle
# import os

# # сохраняет все данные о серии
# for s in range(len(series)):
#     for i, cut in enumerate(log_cuts):
#         for x in range(tries):
#             filename = './histories/{}_series/{}_cut/m_{}'.format(s, i, x)

#             os.makedirs(os.path.dirname(filename), exist_ok=True)
#             with open(filename, "wb") as file:
#                 pickle.dump(histories[s][i][x], file)

IndexError: list index out of range

In [None]:
# accuracy_score = [[[-1 for _ in range(tries)] for _ in log_cuts] for _ in series]

In [None]:
# for s in range(len(series)):
#     for i, cut in enumerate(log_cuts):
#         for x in range(tries):
#             history_dict = histories[s][i][x].history["val_accuracy"]
#             accuracy_score[s][i][x] = np.max(history_dict)

In [None]:
# plt.boxplot(accuracy_score[0])

In [None]:
# tries = 10
# epochs = 30
# series = [0, 1]
# # histories = [[[] for cut in log_cuts] for i in series] 
# # before compiling
# initial_model = model
# log_file = open("model_log.txt", "w")
# log_file.write("Tries: {}, epochs: {}, series = {}".format(tries, epochs, series))
# for i in series:
#     for cut in log_cuts:
#         model_dir = get_model_directory(i, cut)
#         split = get_train_test_split(num_data, target, cut)
#         for number in range(tries):
#             # wipe the model every time
#             new_model = models.clone_model(initial_model)   
#             # recompile, train and save
#             history, model = model_train(new_model, split, epochs) 
            
#             model_name = model_dir + get_model_name(number)
#             models.save_model(model, filepath = model_name, include_optimizer = False)
#             # histories[i][cut].append(history)
#             # display_history(history)   

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

In [None]:
# import pickle
# import os

# for i, h in enumerate(histories):
#     filename = './histories/model_{}'.format(i)

#     os.makedirs(os.path.dirname(filename), exist_ok=True)
#     with open(filename, "wb") as file:
#         pickle.dump(h, file)

In [None]:
# models.save_model(model, filepath ="./models/RANDOM.h5", include_optimizer = False)



In [None]:
# _random_histories = []
# before compiling
# initial_model = model

# for i in range(5):
    # wipe the model every time
    # new_model = models.clone_model(initial_model)   
    # recompile, train and save
    # history = model_train_save(new_model, cuts[j], i, j, random_target=True) 
    # _random_histories.append(history)