Main notebook

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from random import seed
from random import random
from Utils.preprocessing import *
from sklearn.ensemble import GradientBoostingRegressor

# Global matplotlib styling for every figure in this notebook,
# grouped by rc category (font, axes, lines, figure, ticks, legend).
plt.rc('font', size=16, family='Arial')
plt.rc('axes', linewidth=2, titlesize=20, labelsize=20, edgecolor='black', grid=True)
plt.rc('lines', linewidth=1.5, markersize=3)
plt.rc('figure', figsize=(15, 6))
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc(
    'legend',
    fontsize=13,
    framealpha=1,
    edgecolor='black',
    shadow=False,
    fancybox=True,
    frameon=True,
)

Importing datasets and extra information

In [10]:
# Folders holding the per-sample CSV recordings of the two training groups.
large_training_path = './Datasets/Training/Large'
small_training_path = './Datasets/Training/Small'
# Excel workbooks with the per-sample profile information
# (read later with pd.read_excel).
extra_training_path = './Datasets/Training/TrainingProfiles.xlsx'
extra_validation_path = './Datasets/Validation/ValidationProfiles.xlsx'

# get_file_names is not defined in this notebook -- presumably it comes from the
# star import of Utils.preprocessing (TODO confirm). The recorded cell output
# shows it printing the file names and their count for each folder.
# NOTE(review): the recorded listing is not in sample order (Sample35, Sample44,
# Sample39, ...); anything that pairs these files positionally with rows of the
# profile workbooks should confirm the ordering first.
large_file_names = get_file_names(folder_path=large_training_path)
small_file_names = get_file_names(folder_path=small_training_path)

File names:  ['Sample35.csv', 'Sample44.csv', 'Sample39.csv', 'Sample43.csv', 'Sample36.csv', 'Sample38.csv', 'Sample33.csv', 'Sample42.csv', 'Sample40.csv', 'Sample37.csv', 'Sample34.csv', 'Sample41.csv']
Number of files:  12
File names:  ['Sample02.csv', 'Sample01.csv', 'Sample06.csv', 'Sample05.csv', 'Sample09.csv', 'Sample11.csv', 'Sample07.csv', 'Sample10.csv', 'Sample03.csv', 'Sample12.csv', 'Sample04.csv', 'Sample08.csv']
Number of files:  12


Opening the datasets

In [36]:
# One DataFrame per CSV file, kept in the same order as the file-name lists.
large_datasets = [
    pd.read_csv(f'{large_training_path}/{file_name}')
    for file_name in large_file_names
]
small_datasets = [
    pd.read_csv(f'{small_training_path}/{file_name}')
    for file_name in small_file_names
]

# Profile workbooks for the training and validation samples.
extra_training_info = pd.read_excel(extra_training_path)
extra_validation_info = pd.read_excel(extra_validation_path)

Preparing the datasets

In [33]:
def get_pressure_drop(dataFrame):
    """Return the element-wise upstream-minus-downstream pressure difference.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain the columns 'Upstream_Pressure(psi)' and
        'Downstream_Pressure(psi)'. The frame is only read, never modified.

    Returns
    -------
    numpy.ndarray
        One value per row: upstream pressure minus downstream pressure.

    Raises
    ------
    KeyError
        If either pressure column is missing.
    """
    # The frame is only read, so there is no need to deep-copy it first;
    # the subtraction already produces a fresh array.
    upstream_pressure = np.asarray(dataFrame['Upstream_Pressure(psi)'])
    downstream_pressure = np.asarray(dataFrame['Downstream_Pressure(psi)'])
    return upstream_pressure - downstream_pressure

In [38]:
def _parse_size_range(size_range):
    """Turn a 'low-high' particle-size label into a pair of integer bounds."""
    bounds = size_range.split('-')
    return int(bounds[0]), int(bounds[1])


def _add_profile_features(frame, size_range, solid_ratio):
    """Return a copy of ``frame`` with the profile features added.

    Adds 'Particle_Size' (one random draw per row inside ``size_range``),
    'Solid Ratio' (constant) and 'Pressure_Drop', then removes the two raw
    pressure columns.
    """
    low, high = _parse_size_range(size_range)
    enriched = frame.copy()
    # One independent draw per row, taken from the module-level `random()` generator.
    enriched['Particle_Size'] = [low + random() * (high - low) for _ in range(len(enriched))]
    enriched['Solid Ratio'] = solid_ratio
    enriched['Pressure_Drop'] = get_pressure_drop(enriched)
    return enriched.drop(columns=['Upstream_Pressure(psi)', 'Downstream_Pressure(psi)'])


def prepare_datasets(small_data, large_data, extra):
    """Attach profile information to every sample and derive the pressure drop.

    The rows of ``extra`` are matched to the samples by position: the first
    ``len(small_data)`` rows describe the small samples, and the rows after
    them describe the large samples, in order.
    NOTE(review): this relies on the frames being listed in the same order as
    the profile rows -- confirm against how the file lists are built.

    Parameters
    ----------
    small_data, large_data : list of pandas.DataFrame
        Raw recordings containing 'Upstream_Pressure(psi)' and
        'Downstream_Pressure(psi)'. The inputs are not modified.
    extra : pandas.DataFrame
        Profile table with the columns 'Particle Size (micron)' (strings of
        the form 'low-high') and 'Solid Ratio(%)'.

    Returns
    -------
    (list of pandas.DataFrame, list of pandas.DataFrame)
        Prepared copies of the small and large samples. Each gains
        'Particle_Size', 'Solid Ratio' and 'Pressure_Drop' and loses the two
        raw pressure columns.

    Raises
    ------
    ValueError
        If ``extra`` has fewer rows than there are samples.

    Notes
    -----
    Particle sizes are drawn with ``random()``; seed the ``random`` module
    before calling if reproducible values are needed.
    """
    # Sizes come from the arguments, not from notebook globals, so the function
    # also works for other inputs (for example the validation profiles).
    small_count = len(small_data)
    large_count = len(large_data)

    particle_sizes = list(extra['Particle Size (micron)'])
    solid_ratios = list(extra['Solid Ratio(%)'])

    needed = small_count + large_count
    if len(particle_sizes) < needed:
        raise ValueError(
            f'extra has {len(particle_sizes)} rows but {needed} are needed '
            f'({small_count} small + {large_count} large samples)'
        )

    # Both profile columns are split at the same point (the number of small
    # samples) so particle size and solid ratio stay aligned per sample.
    small_prepared = [
        _add_profile_features(frame, size_range, ratio)
        for frame, size_range, ratio in zip(
            small_data, particle_sizes[:small_count], solid_ratios[:small_count]
        )
    ]
    large_prepared = [
        _add_profile_features(frame, size_range, ratio)
        for frame, size_range, ratio in zip(
            large_data, particle_sizes[small_count:], solid_ratios[small_count:]
        )
    ]

    return small_prepared, large_prepared


In [40]:
# Build the model-ready copies of both training groups from the training profiles.
small_dfs, large_dfs = prepare_datasets(
    small_data=small_datasets,
    large_data=large_datasets,
    extra=extra_training_info,
)
# Display the prepared large-sample frames as the cell output.
large_dfs

[      Time(s)  Flow_Rate(ml/m)  Particle_Size  Solid Ratio  Pressure_Drop
 0         0.0        -0.627256      65.003025          0.4        0.95625
 1         0.1         5.492969      68.721918          0.4        0.13750
 2         0.2         4.999402      63.775202          0.4        0.63750
 3         0.3         5.196829      74.242267          0.4        0.18125
 4         0.4         2.136717      64.353995          0.4        0.08125
 ...       ...              ...            ...          ...            ...
 2395    239.5        94.236225      66.672384          0.4        7.36875
 2396    239.6        88.116000      74.426572          0.4        7.77500
 2397    239.7        92.952952      63.573276          0.4        7.54375
 2398    239.8        98.480897      67.060448          0.4        6.80000
 2399    239.9        97.296337      71.094930          0.4        6.53750
 
 [2400 rows x 5 columns],
       Time(s)  Flow_Rate(ml/m)  Particle_Size  Solid Ratio  Pressure_Dr