In [1]:
import os
import copy

import pandas as pd
import numpy as np
from sklearn import linear_model
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib qt
pd.set_option('mode.chained_assignment', None)

In [2]:
class Container:
    """Measurement table of one experiment plus a log of the processing applied.

    Instance attributes (all set in ``__init__``):
        data: working ``pd.DataFrame``. The filtering methods read the columns
            'Temperature', 'Viscosity' and 'time'.
        folder: directory of the last loaded file (``None`` until a load).
        experiment_name: file name of the last loaded file up to its first dot.
        info: metadata restored by ``load_hdf5`` (everything except the log).
        _log: ordered list of processing steps; use ``log()`` to read/append.
    """

    # Class-level default so the attribute exists before anything is loaded.
    data: pd.DataFrame = None

    def __init__(self, data=None, name=''):
        self.data = data
        self.folder = None
        self.experiment_name = name
        self._log = []
        self.info = {}

    def load_csv(self, path=''):
        """Read a CSV file into ``self.data`` and return it.

        If ``path`` is empty or not an existing file the user is prompted
        until a valid path is entered.
        """
        path, self.folder, self.experiment_name = self._input_path(path)

        self.data = pd.read_csv(path)
        print(f'Loaded: {self.experiment_name}')
        self.log(f'csv loaded from: {path}')
        return self.data

    def load_hdf5(self, path=''):
        """Read a file written by ``dump`` into ``self.data`` and return it.

        The stored log is appended to the current log; all other stored
        metadata is merged into ``self.info``.
        """
        path, self.folder, self.experiment_name = self._input_path(path)

        # Read-only: loading must not modify (or create) the file.
        with pd.HDFStore(path, mode='r') as file:
            data = file['data']
            # Work on a copy and tolerate files saved without metadata.
            meta = dict(getattr(file.get_storer('data').attrs, 'meta', {}))

        # The table used to be returned without being stored on the
        # container, unlike load_csv; keep both loaders consistent.
        self.data = data
        self._log.extend(meta.pop('log', []))
        self.log(f'hdf5 loaded from: {path}')
        self.info.update(meta)
        return data

    def dump(self, **kwargs):
        """Write ``self.data`` to ``<folder>/<experiment_name>.hdf5``.

        Keyword arguments are stored next to the table as metadata, together
        with the processing log under the key 'log'.
        """
        # os.path.join avoids the invalid '\{' escape, works on every OS and
        # does not produce '\name' or 'None\name' when no folder is known.
        path = os.path.join(self.folder or '', self.experiment_name)
        kwargs.update({'log': self.log()})
        with pd.HDFStore(f'{path}.hdf5') as file:
            file.put('data', self.data)
            file.get_storer('data').attrs.meta = kwargs

    @staticmethod
    def _strictly_between(series, bounds):
        """Mask of the values lying strictly inside ``(bounds[0], bounds[1])``."""
        return (bounds[0] < series) & (series < bounds[1])

    def initial_filter(
            self,
            time_in_minutes=False,
            temperature=(-np.inf, np.inf),
            viscosity=(0, np.inf),
            time=(0, np.inf),
    ):
        """Keep only rows whose values lie strictly inside the given ranges.

        Args:
            time_in_minutes: if true, divide the 'time' column by 60 after
                filtering (assumes the column is stored in seconds - confirm).
            temperature, viscosity, time: ``(low, high)`` exclusive bounds for
                the 'Temperature', 'Viscosity' and 'time' columns. The ``time``
                bounds are compared against the column before any conversion.

        Returns:
            The filtered DataFrame, which also replaces ``self.data``.
        """
        conditions = dict(
            temperature=temperature,
            viscosity=viscosity,
            time=time,
        )

        keep = (
            self._strictly_between(self.data['Temperature'], temperature)
            & self._strictly_between(self.data['Viscosity'], viscosity)
            & self._strictly_between(self.data['time'], time)
        )

        # Copy so later column assignments never write into a view of the
        # unfiltered frame.
        self.data = self.data[keep].copy()
        if time_in_minutes:
            # The quotient used to be computed and thrown away.
            self.data['time'] = self.data['time'] / 60
        self.log(('initial_filter', conditions))
        return self.data

    def mask_filter(
        self,
        filter_func,
        by='Temperature',
        column='Viscosity',
    ):
        """Drop rows rejected by ``filter_func`` evaluated per group.

        Args:
            filter_func: callable receiving the ``column`` values of one group
                as a ``pd.Series`` and returning a boolean mask of equal length.
            by: column whose distinct values define the groups.
            column: column handed to ``filter_func``.

        Returns:
            The filtered DataFrame, which also replaces ``self.data``.
        """
        values = self.data[column]
        keep = np.zeros(len(self.data), dtype=bool)
        # Fill the mask by row position, so the result does not depend on the
        # index being sorted or unique, nor on how groupby.apply labels its
        # output. Rows whose key is missing belong to no group and are dropped.
        for positions in self.data.groupby(by=by).indices.values():
            keep[positions] = np.asarray(
                filter_func(values.iloc[positions]), dtype=bool
            )
        self.data = self.data[keep]
        # functools.partial objects and other callables may lack __name__.
        self.log(('mask_filter', getattr(filter_func, '__name__', repr(filter_func))))
        return self.data

    def linearize(self):
        """Replace Temperature by 1/(T + 273.15) and Viscosity by ln(Viscosity).

        The columns are overwritten in place, so calling this twice applies
        the transformation twice.
        """
        self.data['Temperature'] = 1 / (self.data['Temperature'] + 273.15)
        self.data['Viscosity'] = np.log(self.data['Viscosity'])
        self.log('linearize')
        return self.data

    def copy(self):
        """Return an independent deep copy (data, log and info included)."""
        return copy.deepcopy(self)

    def log(self, msg=None):
        """Append ``msg`` to the processing log (if given) and return the log."""
        if msg is not None:
            self._log.append(msg)
        return self._log

    @staticmethod
    def _input_path(path):
        """Return ``(path, folder, experiment_name)`` for an existing file.

        Prompts on stdin until ``path`` names an existing file. The experiment
        name is the file name up to its first dot.
        """
        while (path == '') or (not os.path.isfile(path)):
            path = input("Input data path: ")
        # os.path.split understands the separators of the running platform
        # (both '\\' and '/' on Windows) instead of backslashes only.
        folder, file_name = os.path.split(path)
        experiment_name = file_name.split('.')[0]
        return path, folder, experiment_name


In [3]:
## Filtration
def z_filter(data: pd.Series):
    """Flag values lying within one population standard deviation of the mean.

    Returns a boolean Series aligned with ``data``: True where the absolute
    z-score is below 1. The tiny constant added to the standard deviation
    keeps the division defined when all values are equal.
    """
    spread = data.std(ddof=0) + 1e-50
    standardized = (data - data.mean()) / spread
    return standardized.abs() < 1

# def z_filter(data: pd.Series):
#     return stats.zscore(data)<1

def whisker_iqr_filter(data: pd.Series, whisker_width=0.5):
    """Flag values inside the box-plot style fence around the quartiles.

    Args:
        data: values to test.
        whisker_width: how many interquartile ranges the fence extends below
            the first and above the third quartile. Defaults to 0.5.

    Returns:
        Boolean Series aligned with ``data``: True where
        ``q1 - whisker_width*iqr <= value <= q3 + whisker_width*iqr``.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    margin = whisker_width * (q3 - q1)
    return (data >= q1 - margin) & (data <= q3 + margin)

def iqr_filter(data: pd.Series):
    """Flag values closer to the median than one interquartile range.

    Returns a boolean Series aligned with ``data``: True where
    ``|value - median| / iqr < 1``.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    # Same guard as the z-score filters: with a zero IQR (e.g. a group of
    # identical readings) 0/0 would be NaN and every row would be rejected.
    iqr = (q3 - q1) + 1e-50

    return np.abs((data - data.median()) / iqr) < 1

def my_z_filter(data: pd.Series):
    """Flag values within a tenth of a standard deviation of the median.

    The deviation is measured from the median and scaled by the sample
    standard deviation (pandas default ``ddof``). The tiny constant keeps the
    division defined when all values are equal.
    """
    centre = data.median()
    scale = data.std() + 1e-50
    scaled_distance = ((data - centre) / scale).abs()
    return scaled_distance < 0.1

In [4]:
# Load the raw experiment once; load_csv() prompts for the file path on stdin.
a = Container()
a.load_csv()
# Untouched deep copy, so the pipeline below can be restarted without reloading.
base = a.copy()

Loaded: Exp0_up


In [5]:
# Start the processing pipeline from a fresh copy of the loaded data.
a = base.copy()

In [6]:
# Keep rows with Viscosity in (40, 360), Temperature in (13, 40) and time > 0; all bounds are exclusive.
a.initial_filter(time=(0, np.inf), viscosity=(40, 360), temperature=(13, 40))

Unnamed: 0,time,Viscosity,Viscosity_verbose,Temperature,Temperature_verbose
0,1.0,344.9,OK,13.8,OK
1,2.0,344.9,OK,13.8,OK
2,3.0,344.9,OK,13.8,OK
3,4.0,345.4,image_sweep_check,13.8,OK
4,5.0,345.4,image_sweep_check,13.8,OK
...,...,...,...,...,...
18282,19111.0,48.9,OK,39.9,OK
18283,19112.0,49.3,OK,39.9,OK
18284,19113.0,49.0,OK,39.9,OK
18285,19114.0,48.7,OK,39.9,OK


In [7]:
# Overwrite Temperature with 1/(T + 273.15) and Viscosity with ln(Viscosity).
# Not idempotent: re-running this cell transforms the columns a second time.
a.linearize()

Unnamed: 0,time,Viscosity,Viscosity_verbose,Temperature,Temperature_verbose
0,1.0,5.843255,OK,0.003485,OK
1,2.0,5.843255,OK,0.003485,OK
2,3.0,5.843255,OK,0.003485,OK
3,4.0,5.844703,image_sweep_check,0.003485,OK
4,5.0,5.844703,image_sweep_check,0.003485,OK
...,...,...,...,...,...
18282,19111.0,3.889777,OK,0.003194,OK
18283,19112.0,3.897924,OK,0.003194,OK
18284,19113.0,3.891820,OK,0.003194,OK
18285,19114.0,3.885679,OK,0.003194,OK


In [8]:
# Within each Temperature group keep only Viscosity values closer to the group median than one IQR.
a.mask_filter(iqr_filter)

Unnamed: 0,time,Viscosity,Viscosity_verbose,Temperature,Temperature_verbose
0,1.0,5.843255,OK,0.003485,OK
1,2.0,5.843255,OK,0.003485,OK
2,3.0,5.843255,OK,0.003485,OK
3,4.0,5.844703,image_sweep_check,0.003485,OK
4,5.0,5.844703,image_sweep_check,0.003485,OK
...,...,...,...,...,...
18277,19106.0,3.901973,OK,0.003195,OK
18278,19107.0,3.891820,OK,0.003194,OK
18281,19110.0,3.891820,OK,0.003194,OK
18282,19111.0,3.889777,OK,0.003194,OK


In [16]:
## Temporal plots
# Viscosity (left axis, red) and Temperature (right axis, blue) against time.
df = a.data
# df = test_data

fig, ax_v = plt.subplots()
ax_T = ax_v.twinx()  # second y-axis sharing the time axis
elapsed = df['time']

ax_v.scatter(elapsed, df['Viscosity'], marker='.', color='red')
ax_v.set_xlabel('time')
ax_v.set_ylabel('Viscosity', color='red')

ax_T.scatter(elapsed, df['Temperature'], marker='.', color='blue')
ax_T.set_ylabel('Temperature', color='blue')


Text(0, 0.5, 'Temperature')

In [9]:
## Temperature plots
# Viscosity against Temperature: raw points coloured by their check status,
# overlaid with the per-temperature mean (+/- 1 sd band) and median (50% band).
df = a.data

status_colors = {
    'OK': 'g',
    'OK_inner': 'b',
    'image_sweep_check': 'r',
    'combine_check': 'w',
}
colors = df['Viscosity_verbose'].replace(status_colors)

fig, ax = plt.subplots()
ax.scatter(x=df['Temperature'], y=df['Viscosity'], c=colors, s=5)

# Arguments common to both summary lines.
shared = dict(ax=ax, data=df, x="Temperature", y="Viscosity")
sns.lineplot(estimator='mean', errorbar=("sd", 1), label='mean', **shared)
sns.lineplot(estimator="median", errorbar=('pi', 50), label='median', **shared)

# df = b.data
# sns.lineplot(
#     ax=ax,
#     data=df,
#     x="Temperature",
#     y="Viscosity",
#     estimator='mean',
#     errorbar=("sd", 1),
#     # err_style="bars", 
#     label ='mean2'  
# )
# sns.lineplot(
#     ax=ax,
#     data=df,
#     x="Temperature",
#     y="Viscosity",
#     errorbar=('pi',68),
#     estimator="median",
#     label ='median2'  
# )

<Axes: xlabel='Temperature', ylabel='Viscosity'>

In [44]:
## fast OLS plot
def regress(data):
    """Least-squares fit of Viscosity = w0 + w_T * Temperature.

    ``data`` must provide 'Temperature' and 'Viscosity' entries holding the
    linearized values produced by ``Container.linearize`` (1/(T + 273.15) and
    ln(viscosity)), so the fit corresponds to nu = nu0 * exp(E / (R * T)).

    Returns:
        nu0: pre-exponential factor, ``exp(intercept)``.
        E: ``R * slope / 1000`` with R = 8.314, i.e. kJ when R is in J.
        TC_func: callable ``t -> nu`` evaluating the fitted curve at a
            non-linearized temperature ``t`` (273.15 is added internally);
            its optional ``E`` argument uses the same unit as the returned E.
    """
    R = 8.314

    reg = linear_model.LinearRegression(fit_intercept=True)
    X = np.asarray(data['Temperature']).reshape(-1, 1)
    Y = np.asarray(data['Viscosity'])
    reg.fit(X, Y)

    nu0 = np.exp(reg.intercept_)
    E = R * reg.coef_[0] / 1000

    def TC_func(t, E=E, nu0=nu0):
        Tt = 1 / (t + 273.15)
        # E was scaled down by 1000 above, so scale it back before dividing
        # by R; otherwise the exponent is 1000 times too small.
        return nu0 * np.exp(E * 1000 / R * Tt)

    return nu0, E, TC_func

# Fit and draw the regression line for three views of the same data: every
# point, the per-temperature medians and the per-temperature means. Only the
# first view also draws its scatter points.
per_temperature = a.data.groupby(by='Temperature')['Viscosity']
fit_cases = (
    ('All', lambda: a.data, True),
    ('Median', lambda: per_temperature.median().reset_index(), False),
    ('Mean', lambda: per_temperature.mean().reset_index(), False),
)

for case_title, make_frame, show_points in fit_cases:
    reg_data = make_frame()
    nu0, EkJ, func = regress(reg_data)
    sns.regplot(
        data=reg_data,
        x="Temperature",
        y="Viscosity",
        scatter=show_points,
        truncate=False,
        order=1,
        label=f'{case_title}:\nE= {EkJ:5.3f} kJ\nNu0= {nu0:5.3e} cP',
    )
# reg_data = b.data
# nu0, EkJ, func = regress(reg_data)
# sns.regplot(
#     data=reg_data,
#     x="Temperature",
#     y="Viscosity",
#     scatter=True,
#     truncate=False,
#     order=1,
#     label=f'All2:\nE kJ= {EkJ:5.3f}\nNu0= {nu0:5.3e}',
# )

# reg_data = b.data.groupby(by='Temperature')['Viscosity'].median().reset_index()
# nu0, EkJ, func = regress(reg_data)
# sns.regplot(
#     data=reg_data,
#     x="Temperature",
#     y="Viscosity",
#     scatter=False,
#     truncate=False,
#     order=1,
#     label=f'Median2:\nE kJ= { EkJ:5.3f}\nNu0= {nu0:5.3e}',
# )

# reg_data = pd.concat([a.data, b.data])
# nu0, EkJ, func = regress(reg_data)
# sns.regplot(
#     data=reg_data,
#     x="Temperature",
#     y="Viscosity",
#     scatter=False,
#     truncate=False,
#     order=1,
#     label=f'All:\nE kJ= {EkJ:5.3f}\nNu0= {nu0:5.3e}',
# )

# Draw the legend from the label= strings given to the regplot calls.
plt.legend()

<matplotlib.legend.Legend at 0x2a32be43340>

In [45]:
# Empty table collecting one row per saved fit (filled by the "Save result" cell).
# Re-running this cell discards every row collected so far.
results = pd.DataFrame(columns=['E_kJ','dE_kJ','nu0_cP','dnu0_cP','name','w','desc'])

In [64]:
## Regression
# OLS fit of nu = E * (column 'E') + nu0 * 1 on a private copy of `a`.
# Assumes a.linearize() has already been run, so that 'Temperature' holds
# 1/(T + 273.15) and 'Viscosity' holds ln(viscosity).
reg_a = a.copy()  # deep copy: the renames below do not touch `a`

df = reg_a.data
# Name the regressors after the constants they determine.
df.rename(inplace=True, columns={'Temperature': 'E', 'Viscosity': 'nu'})
df['nu0'] = 1  # constant column -> its coefficient is the intercept
result_OLS = sm.OLS(df['nu'], df[['E', 'nu0']]).fit()
means = result_OLS.params
# conf_int(0.005) is a 99.5% interval (alpha = 0.005).
# NOTE(review): summary2() below prints the default 95% bounds - confirm 0.005 is intended.
# Keeping the `.loc` indexer makes conf_int['nu0'] / conf_int['E'] address the
# ROW of that parameter (its lower and upper bound), for reading and writing.
conf_int = result_OLS.conf_int(0.005).loc

# Intercept -> pre-exponential factor: exponentiate the estimate and both bounds.
conf_int['nu0'] = np.exp(conf_int['nu0'])
nu0 = np.exp(means['nu0'])
# Half-width of the transformed interval (the interval itself is not
# symmetric around nu0 after exp).
nu0_diap = (conf_int['nu0'].max() - conf_int['nu0'].min()) / 2
# Decimal exponent used to scale the printed numbers.
nu0_power = round(np.log10(nu0_diap))

# Slope -> E: multiply by 8.314 and divide by 1000.
conf_int['E'] = 8.314 * conf_int['E'] / 1000
EkJ = 8.314 * means['E'] / 1000
EkJ_diap = (conf_int['E'].max() - conf_int['E'].min()) / 2
# NOTE(review): unlike nu0_power this exponent carries a +1 offset - confirm it is intended.
EkJ_power = round(np.log10(EkJ_diap))+1

exp_name = reg_a.experiment_name
# Text between the first '(' of the folder path and its last character.
# Assumes the folder name ends like '...(<mass %>)' - raises IndexError otherwise.
w = reg_a.folder.split('(')[1][:-1]
print(
    f'Constants {exp_name} ({w}% mass):',
    f'E   = {EkJ/10**EkJ_power: <10.3f} ± {EkJ_diap/10**EkJ_power: <3.2f} kJ *1e{EkJ_power}',
    f'Nu0 = {nu0/10**nu0_power: <10.3f} ± {nu0_diap/10**nu0_power: <3.2f} cP *1e{nu0_power}',
    sep='\n',
)
print(result_OLS.summary2())

Constants Exp0_up (0% mass):
E   = 57.309     ± 0.10 kJ *1e0
Nu0 = 11.781     ± 0.46 cP *1e-9
                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.995      
Dependent Variable: nu               AIC:                -56071.1591
Date:               2023-11-12 20:31 BIC:                -56055.9950
No. Observations:   14502            Log-Likelihood:     28038.     
Df Model:           1                F-statistic:        2.846e+06  
Df Residuals:       14500            Prob (F-statistic): 0.00       
R-squared:          0.995            Scale:              0.0012254  
---------------------------------------------------------------------
          Coef.    Std.Err.      t       P>|t|     [0.025     0.975] 
---------------------------------------------------------------------
E       6893.0712    4.0859   1687.0545  0.0000  6885.0624  6901.0800
nu0      -18.2568    0.0138  -1323.2372  0.0000   -18.2838   -18.2297
-----------------------

In [None]:
## Save result
# Requires the variables of the regression cell (EkJ, EkJ_diap, nu0, nu0_diap,
# reg_a, w) and the `results` table to exist in the kernel.
desc= input('Description')  # free-text note typed by the user
# Append one row; the value order must match the column order of `results`.
results.loc[len(results)] = [
    EkJ,
    EkJ_diap,
    nu0,
    nu0_diap,
    reg_a.experiment_name,
    w,
    desc
]
# Write the regression copy of the data to HDF5 with the row just added
# (as a dict) stored as metadata next to it.
reg_a.dump(**results.loc[len(results)-1].to_dict())
results