# Import libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.signal import savgol_filter

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Notebook-wide settings.
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Show up to 50 columns when a DataFrame is rendered.
pd.options.display.max_columns = 50

In [3]:
# Directory that receives every CSV written by this notebook
# (uv_vis.csv, uv_vis_smoothed.csv, uv_vis_scaled.csv).
path_to_save_data = './../data/preprocessed_data'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target exists before any of the save cells run on a fresh checkout.
os.makedirs(path_to_save_data, exist_ok=True)

# Loading data

In [4]:
# Relative path to the input CSV; it is read with sep=';' in the next cell.
path_to_uv_vis = './../data/initial_data/uv_vis_all.csv'

In [5]:
# Read the semicolon-delimited input file exactly as stored on disk.
initial_data = pd.read_csv(path_to_uv_vis, delimiter=';')

# Report the frame dimensions, then preview the first three rows.
print(initial_data.shape)
initial_data.head(n=3)

(1422, 50)


Unnamed: 0,1,Unnamed: 1,2,Unnamed: 3,3,Unnamed: 5,6,Unnamed: 7,8,Unnamed: 9,9,Unnamed: 11,10,Unnamed: 13,11,Unnamed: 15,12,Unnamed: 17,13,Unnamed: 19,17,Unnamed: 21,18,Unnamed: 23,19,Unnamed: 25,20,Unnamed: 27,21,Unnamed: 29,22,Unnamed: 31,23,Unnamed: 33,24,Unnamed: 35,25,Unnamed: 37,26,Unnamed: 39,27,Unnamed: 41,28,Unnamed: 43,29,Unnamed: 45,30,Unnamed: 47,31,Unnamed: 49
0,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs,Wavelength (nm),Abs
1,900,0022645092,900,0006305033,900,0102608979,900,0118629985,900,0065354921,900,0001846925,900,0137649536,900,0161295101,900,0002024334,900,0067173615,900,000220296,900,-0008435629,900,0036338687,900,0027574776,900,010399662,900,0029689232,900,0012554167,900,0016058626,900,0008771697,900,0022143204,900,0031004837,900,0018750776,900,0017436929,900,002081964,900,0013441184
2,8995,0020818962,8995,0004935609,8995,0101624325,8995,0118008055,8995,0066636138,8995,0002807588,8995,0137287468,8995,015835461,8995,0003689259,8995,0066256501,8995,0001883529,8995,-0007014269,8995,0033993617,8995,0027441682,8995,0101231448,8995,0027061818,8995,0012490356,8995,0015350689,8995,000940393,8995,0021141952,8995,003066529,8995,001767078,8995,0017800104,8995,002118264,8995,0011845477


# Data preprocessing

## Data wrangling

In [6]:
# Reshape the raw table into one row per series: a 'wavelength' row followed
# by one absorbance row per sample id.

# drop the row with index 0 (it only holds the 'Wavelength (nm)' / 'Abs' sub-header)
measurements = initial_data.loc[1:, :]

# sample ids are the column labels that are not auto-named 'Unnamed: N'
index = [int(col) for col in measurements.columns if 'Unnamed' not in col]

# the 'Unnamed: N' columns hold the absorbance values
absorbance_columns = [col for col in measurements.columns if 'Unnamed' in col]

# ids and absorbance columns are paired by position below, so a count mismatch
# would mislabel or drop spectra; fail loudly instead
if len(index) != len(absorbance_columns):
    raise ValueError(
        f'Expected one absorbance column per sample id, got {len(index)} ids '
        f'and {len(absorbance_columns)} absorbance columns'
    )

# wavelength axis: taken from the first column, whatever its label is
# NOTE(review): only the first sample's wavelength column is kept, which assumes
# every sample shares the same wavelength grid; confirm when new data is added
first_column = measurements.columns[0]
wavelength = measurements.iloc[:, [0]].rename(columns={first_column: 'wavelength'})

# relabel each absorbance column with the id of the sample it belongs to
columns = dict(zip(absorbance_columns, index))
absorbances = measurements[absorbance_columns].rename(columns=columns)

# transpose so that every series is a row (wavelength first), then convert the
# ',' decimal separator to '.' and parse everything as float
data = (
    pd.concat([wavelength.T, absorbances.T], axis=0)
    .replace(',', '.', regex=True)
    .astype(float)
)

print(data.shape)
data.head(3)

(26, 1421)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,...,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420,1421
wavelength,900.0,899.5,899.0,898.5,898.0,897.5,897.0,896.5,896.0,895.5,895.0,894.5,894.0,893.5,893.0,892.5,892.0,891.5,891.0,890.5,890.0,889.5,889.0,888.5,888.0,...,202.0,201.5,201.0,200.5,200.0,199.5,199.0,198.5,198.0,197.5,197.0,196.5,196.0,195.5,195.0,194.5,194.0,193.5,193.0,192.5,192.0,191.5,191.0,190.5,190.0
1,0.022645,0.020819,0.021998,0.021703,0.022185,0.022751,0.02032,0.020227,0.021882,0.02147,0.020591,0.021204,0.020071,0.024153,0.021843,0.021908,0.020585,0.021721,0.022171,0.023517,0.022707,0.021063,0.020994,0.023019,0.022388,...,2.064677,2.060585,2.056036,2.055328,2.052195,2.049726,2.049473,2.046473,2.045135,2.045913,2.046444,2.045989,2.048325,2.050449,2.050894,2.054563,2.053803,2.057813,2.062204,2.067339,2.070449,2.074058,2.078953,2.08227,2.093128
2,0.006305,0.004936,0.004873,0.008104,0.008004,0.009556,0.006757,0.008274,0.008355,0.007562,0.007027,0.00615,0.00746,0.008957,0.00792,0.009472,0.00612,0.007262,0.008183,0.008034,0.008619,0.006561,0.007031,0.007658,0.007848,...,0.304009,0.30409,0.304512,0.304685,0.30477,0.304946,0.304966,0.30507,0.305535,0.305806,0.305882,0.305918,0.306099,0.306347,0.306832,0.306809,0.307365,0.30786,0.308164,0.309233,0.30879,0.309695,0.310111,0.310718,0.311803


In [7]:
# Persist the reshaped (unsmoothed, unscaled) spectra.
data.to_csv(path_or_buf=f'{path_to_save_data}/uv_vis.csv')

## Data smoothing

In [8]:
# Savitzky-Golay settings applied to every spectrum.
savgol_params = {'window_length': 63, 'polyorder': 3}

data_smoothed = data.copy()

# The first row is the wavelength axis and is skipped; every other row is
# replaced by its filtered version.
for spectrum_label in data_smoothed.index[1:]:
    raw_spectrum = data_smoothed.loc[spectrum_label]
    data_smoothed.loc[spectrum_label] = savgol_filter(raw_spectrum, **savgol_params)

print(data_smoothed.shape)
data_smoothed.head(3)

(26, 1421)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,...,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420,1421
wavelength,900.0,899.5,899.0,898.5,898.0,897.5,897.0,896.5,896.0,895.5,895.0,894.5,894.0,893.5,893.0,892.5,892.0,891.5,891.0,890.5,890.0,889.5,889.0,888.5,888.0,...,202.0,201.5,201.0,200.5,200.0,199.5,199.0,198.5,198.0,197.5,197.0,196.5,196.0,195.5,195.0,194.5,194.0,193.5,193.0,192.5,192.0,191.5,191.0,190.5,190.0
1,0.02163,0.021605,0.021587,0.021575,0.021568,0.021567,0.021571,0.021581,0.021595,0.021614,0.021638,0.021666,0.021698,0.021733,0.021773,0.021816,0.021862,0.021911,0.021962,0.022017,0.022073,0.022132,0.022193,0.022255,0.022319,...,2.065248,2.06149,2.05805,2.054945,2.052192,2.049808,2.04781,2.046214,2.045037,2.044296,2.044008,2.044189,2.044856,2.046026,2.047716,2.049942,2.052722,2.056072,2.060008,2.064548,2.069708,2.075505,2.081956,2.089078,2.096887
2,0.006645,0.006748,0.006845,0.006938,0.007026,0.00711,0.007189,0.007264,0.007334,0.007401,0.007464,0.007524,0.00758,0.007633,0.007682,0.007729,0.007773,0.007814,0.007852,0.007889,0.007923,0.007955,0.007985,0.008013,0.00804,...,0.303936,0.304129,0.304325,0.304525,0.30473,0.304939,0.305155,0.305376,0.305605,0.305842,0.306087,0.306341,0.306605,0.30688,0.307165,0.307463,0.307772,0.308095,0.308432,0.308783,0.309149,0.309531,0.30993,0.310345,0.310778


In [9]:
# Persist the smoothed spectra.
data_smoothed.to_csv(path_or_buf=f'{path_to_save_data}/uv_vis_smoothed.csv')

## Data scaling

In [10]:
scaler = MinMaxScaler()

# MinMaxScaler normalises column by column, so the spectra (all rows after the
# wavelength row) are transposed before fitting; each spectrum is then scaled
# by its own minimum and maximum.
spectra = data_smoothed.iloc[1:]
scaled_by_column = scaler.fit_transform(spectra.T)

# Start from a copy so the wavelength row is carried over unchanged, then write
# the scaled values back in the original row orientation.
data_scaled = data_smoothed.copy()
data_scaled.iloc[1:] = scaled_by_column.T

print(data_scaled.shape)
data_scaled.head(3)

(26, 1421)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,...,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420,1421
wavelength,900.0,899.5,899.0,898.5,898.0,897.5,897.0,896.5,896.0,895.5,895.0,894.5,894.0,893.5,893.0,892.5,892.0,891.5,891.0,890.5,890.0,889.5,889.0,888.5,888.0,...,202.0,201.5,201.0,200.5,200.0,199.5,199.0,198.5,198.0,197.5,197.0,196.5,196.0,195.5,195.0,194.5,194.0,193.5,193.0,192.5,192.0,191.5,191.0,190.5,190.0
1,2.8e-05,1.7e-05,9e-06,3e-06,4.766199e-07,0.0,2e-06,6e-06,1.3e-05,2.1e-05,3.2e-05,4.4e-05,5.8e-05,7.4e-05,9.2e-05,0.000111,0.000131,0.000153,0.000176,0.0002,0.000225,0.000251,0.000278,0.000306,0.000334,...,0.908611,0.90694,0.905411,0.90403,0.902806,0.901747,0.900858,0.900149,0.899625,0.899296,0.899168,0.899248,0.899545,0.900065,0.900816,0.901806,0.903042,0.904531,0.906281,0.9083,0.910594,0.913171,0.916039,0.919206,0.922678
2,0.0,0.000337,0.000658,0.000963,0.001252841,0.001527,0.001787,0.002034,0.002266,0.002486,0.002694,0.002889,0.003073,0.003247,0.00341,0.003563,0.003707,0.003842,0.003969,0.004088,0.0042,0.004306,0.004405,0.004498,0.004586,...,0.977502,0.978137,0.978782,0.97944,0.980112,0.980801,0.981509,0.982239,0.982991,0.983769,0.984575,0.985411,0.98628,0.987182,0.988121,0.989099,0.990117,0.991179,0.992286,0.99344,0.994644,0.9959,0.99721,0.998576,1.0


In [11]:
# Persist the smoothed and min-max scaled spectra.
data_scaled.to_csv(path_or_buf=f'{path_to_save_data}/uv_vis_scaled.csv')