Dr Oliviero Andreussi, olivieroandreuss@boisestate.edu

Boise State University, Department of Chemistry and Biochemistry

# Fitting and Data Analysis for the Differential Scanning Calorimetry Experiment {-}

## Preliminary Setup {-}

Before we start, let us import the main modules that we will need for this worksheet. You may see some new modules in the list below; we will describe them in more detail in the sections where they are used.

In [None]:
# @title Notebook Setup { display-mode: "form" }
# Import the main modules used in this worksheet
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.integrate import cumulative_trapezoid, simpson
from scipy.signal import medfilt
from sklearn.linear_model import LinearRegression

# `simps` was removed in SciPy 1.14 in favour of `simpson`; keep the old name usable
try:
    from scipy.integrate import simps
except ImportError:
    def simps(y, x=None, **kwargs):
        """Legacy alias of `simpson` for code that still calls `simps(y, x)`."""
        return simpson(y, x=x, **kwargs)
# Mount Google Drive to reach your files (uncomment the next two lines when running on Colab)
#from google.colab import drive
#drive.mount('/content/drive')
# Root folder of the mounted Drive; the folder with the DSC data files is appended to it further down
base_path = '/content/drive/MyDrive/'

In [None]:
# @title Functions to load the data { display-mode: "form" }
def load_data_to_file_dict(file_dict):
    """
    Read one DSC text file and attach its content to the dictionary describing it.

    Expected file layout: two header rows (skipped), one trailing line of text (skipped),
    and five whitespace-separated columns: index, time, heat flow, temperature of sample,
    temperature of reference. The first column becomes the index of the table.

    Input variables:
        file_dict : a dictionary with 'path' and 'name' keys; the two strings are concatenated
                    as they are, so 'path' must already end with a path separator

    Action:
        Store under file_dict['data'] a Pandas DataFrame with four columns: 'Time' (in seconds),
        'Heat-Flow' (in W/g), 'Ts' (temperature of sample, in C), 'Tr' (temperature of reference, in C)
    """
    full_name = file_dict['path'] + file_dict['name']
    column_names = ['Time', 'Heat-Flow', 'Ts', 'Tr']
    # The regular-expression separator (one or more blanks) and skipfooter both need the python engine
    file_dict['data'] = pd.read_csv(
        full_name,
        sep=' +',
        names=column_names,
        index_col=0,
        skiprows=2,
        skipfooter=1,
        engine='python',
        encoding='unicode_escape',
    )

def plot_peak(peak_dict,xaxis='Time'):
    """
    Plot the heat flow of a single peak inside its time window.

    Input variables:
        peak_dict : a dictionary with 'time_start' and 'time_end' keys (and 'path'/'name' if the
                    data still has to be loaded); 'time_end' equal to 0 stands for the last
                    recorded time
        xaxis : name of the data column used as abscissa

    Action:
        Load the data if needed and show heat flow vs. xaxis for Time strictly between
        'time_start' and 'time_end'. If xaxis is not a column of the data, print an error
        and return without plotting.
    """
    if 'data' not in peak_dict:
        load_data_to_file_dict(peak_dict)
    lower = peak_dict['time_start']
    upper = peak_dict['time_end']
    frame = peak_dict['data']
    if upper == 0:
        upper = frame.Time.iat[-1]
    # Both bounds are exclusive
    inside = (frame['Time'] > lower) & (frame['Time'] < upper)
    window = frame.loc[inside]
    if xaxis not in frame:
        print("ERROR: Invalid value for xaxis argument")
        return
    # Only these three columns get an axis label; other valid columns are plotted unlabelled
    axis_labels = {'Time': 'Time [s]', 'Ts': 'Ts [C]', 'Tr': 'Tr [C]'}
    if xaxis in axis_labels:
        plt.xlabel(axis_labels[xaxis])
    plt.plot(window[xaxis], window['Heat-Flow'])
    plt.ylabel('Heat Flow [W/g]')
    plt.show()

def plot_peak_list(peak_list,xaxis='Time',key='',value=('',)):
    """
    Given a list of peak dictionaries, plot heat flow vs. the chosen x axis for each of them
    in the same figure. If key/value are specified, only plot the peaks for which the key has
    one of the specified values.

    Input variables:
        peak_list : a list of dictionaries, each with 'label', 'time_start' and 'time_end' keys
                    (and 'path'/'name' if the data still has to be loaded)
        xaxis : 'Time', 'Ts' or 'Tr', the data column used as abscissa
        key : a string with the name of the key to shortlist the peaks ('' disables the shortlist)
        value : the accepted value(s) of the key, either a single string or a collection of values

    Action:
        Plot heat flow vs. xaxis for the selected peaks, each restricted to its own time window.
        As in plot_peak and filter_data, 'time_end' equal to 0 stands for the last recorded time.
        An unsupported xaxis prints an error and returns before any figure is created.
    """
    axis_labels = {'Time': 'Time [s]', 'Ts': 'Ts [C]', 'Tr': 'Tr [C]'}
    # Validate first, so that a wrong argument does not leave an empty figure behind
    if not isinstance(xaxis, str) or xaxis not in axis_labels:
        print("ERROR: unexpected xaxis label")
        return
    #
    if key == '' or value == '':
        peak_shortlist = peak_list
    else :
        # `in` on a bare string would match substrings, so wrap a single string into a list
        accepted = [value] if isinstance(value, str) else value
        peak_shortlist = [f for f in peak_list if f[key] in accepted]
    #
    fig, ax = plt.subplots()
    ax.set_xlabel(axis_labels[xaxis])
    for peak in peak_shortlist :
        if 'data' not in peak:
            load_data_to_file_dict(peak)
        data = peak['data']
        time_start = peak['time_start']
        time_end = peak['time_end']
        # Same convention as plot_peak/filter_data: 0 means "up to the last recorded time"
        if time_end == 0 : time_end = data['Time'].iat[-1]
        window = data[(data['Time'] > time_start) & (data['Time'] < time_end)]
        ax.plot(window[xaxis],window['Heat-Flow'],label=peak['label'])
    ax.set_ylabel('Heat-Flow [W/g]')
    ax.legend()
    plt.show()

In [None]:
def filter_data(peak_dict,verbose=False):
    """
    Cut the time window of a peak out of its full dataset.

    Input variables:
        peak_dict : a dictionary with 'time_start' and 'time_end' keys (and 'path'/'name' if the
                    data still has to be loaded); 'time_end' equal to 0 stands for the last
                    recorded time
        verbose : not used by this function

    Action:
        Store under peak_dict['filtered_data'] an independent copy of the rows of
        peak_dict['data'] whose Time lies strictly between 'time_start' and 'time_end'
    """
    if 'data' not in peak_dict:
        load_data_to_file_dict(peak_dict)
    lower = peak_dict['time_start']
    upper = peak_dict['time_end']
    frame = peak_dict['data']
    if upper == 0:
        upper = frame.Time.iat[-1]
    # Both bounds are exclusive; the copy lets later steps add columns without touching the raw data
    inside = (frame['Time'] > lower) & (frame['Time'] < upper)
    peak_dict['filtered_data'] = frame.loc[inside].copy()

def line_of_tuple(x,params):
    """
    Evaluate at x the straight line described by params = (slope, x0, y0),
    i.e. the line with the given slope that passes through the point (x0, y0).
    """
    slope, x_ref, y_ref = params[0], params[1], params[2]
    offset = x - x_ref
    return slope*offset + y_ref

def intersection_of_lines(params1,params2):
    """
    Find the point where two straight lines cross.

    Each line is given as params = (slope, x0, y0), meaning y(x) = y0 + slope*(x - x0).
    Equating the two lines:
        y01 + slope1*(x - x01) = y02 + slope2*(x - x02)
        (slope1 - slope2) * x = (y02 - y01) - slope2*x02 + slope1*x01
        x = ((y02 - y01) - slope2*x02 + slope1*x01) / (slope1 - slope2)

    Returns the tuple (x, y) of the intersection.
    Raises ValueError if the two slopes are equal.
    """
    slope_a = params1[0]
    slope_b = params2[0]
    if slope_a == slope_b :
        raise ValueError("ERROR: the two lines are parallel, no (unique) intersection")
    numerator = params2[2] - params1[2] - slope_b*params2[1] + slope_a*params1[1]
    x_cross = numerator/(slope_a - slope_b)
    # Either line gives the same ordinate at the crossing; the first one is used
    return x_cross, line_of_tuple(x_cross,params1)

def calc_baseline(peak_dict,xaxis='Time',verbose=False,n_iterations=10):
    """
    Build the baseline under a peak and subtract it from the heat flow.

    Input variables:
        peak_dict : a dictionary with 'time_start', 'time_end' and 'baseline_type' keys.
                    'baseline_type' is searched for the words:
                      'line'       : straight baseline through the first and last point of the window
                      'integral'   : baseline that moves from the tangent at the first point to the
                                     tangent at the last point, weighted by the fraction of the
                                     peak area accumulated so far (computed iteratively)
                      'horizontal' : force the slope of the line(s) above to zero
                      'right'      : ('line' only) anchor the line at the last point instead of the first
        xaxis : name of the data column used as abscissa
        verbose : not used by this function
        n_iterations : number of refinement steps of the 'integral' baseline

    Action:
        Recompute peak_dict['filtered_data'] and add to it the columns 'dHF_dx' (derivative of the
        heat flow with respect to xaxis), 'baseline' and 'Heat-Flow-Clean' (heat flow minus
        baseline); the 'integral' type also adds 'baseline-left' and 'baseline-right'.
        Store in peak_dict['baseline-left'] and peak_dict['baseline-right'] the (slope, x0, y0)
        tuples of the lines at the two ends of the window.

    Raises:
        ValueError : window with fewer than two points, unknown baseline type, first and last
                     point with the same abscissa, or peak with zero area over the trial baseline
    """
    filter_data(peak_dict)
    filtered_data = peak_dict['filtered_data']
    if len(filtered_data) < 2 :
        raise ValueError("ERROR: the selected time window contains fewer than two data points")
    baseline_type = peak_dict['baseline_type']
    if 'line' not in baseline_type and 'integral' not in baseline_type :
        raise ValueError(f"ERROR: unknown baseline_type '{baseline_type}', it must contain 'line' or 'integral'")
    horizontal = 'horizontal' in baseline_type
    # Plain arrays keep the arithmetic below independent of the DataFrame index
    x = filtered_data[xaxis].to_numpy(dtype=float)
    heat_flow = filtered_data['Heat-Flow'].to_numpy(dtype=float)
    slopes = np.gradient(heat_flow, x)
    filtered_data['dHF_dx'] = slopes
    xi, yi = x[0], heat_flow[0]
    xf, yf = x[-1], heat_flow[-1]
    #
    if 'line' in baseline_type :
        if horizontal :
            slope = 0.
        elif xf == xi :
            raise ValueError("ERROR: first and last point of the window have the same abscissa")
        else :
            slope = (yf-yi)/(xf-xi)
        # A single line serves as both left and right baseline; only the anchor point changes
        anchor = (slope,xf,yf) if 'right' in baseline_type else (slope,xi,yi)
        peak_dict['baseline-left'] = anchor
        peak_dict['baseline-right'] = anchor
        filtered_data['baseline'] = line_of_tuple(x,anchor)
    else :
        dydxi = 0 if horizontal else slopes[0]
        dydxf = 0 if horizontal else slopes[-1]
        peak_dict['baseline-left'] = (dydxi,xi,yi)
        peak_dict['baseline-right'] = (dydxf,xf,yf)
        right_line = line_of_tuple(x,peak_dict['baseline-right'])
        left_line = line_of_tuple(x,peak_dict['baseline-left'])
        filtered_data['baseline-right'] = right_line
        filtered_data['baseline-left'] = left_line
        # Start from a flat baseline halfway between the two end points
        baseline = np.full(heat_flow.shape,(yi+yf)*0.5)
        for _ in range(n_iterations):
            partial_area = cumulative_trapezoid(heat_flow-baseline,x=x,initial=0)
            # Normalise with the same quadrature, so that gamma runs exactly from 0 to 1 and
            # the baseline meets the left and right lines at the two ends of the window
            total_area = partial_area[-1]
            if total_area == 0 :
                raise ValueError("ERROR: the peak has zero area over the trial baseline")
            gamma = partial_area/total_area
            baseline = (1-gamma)*left_line + gamma*right_line
        filtered_data['baseline'] = baseline
    filtered_data['Heat-Flow-Clean'] = filtered_data['Heat-Flow'] - filtered_data['baseline']
    

In [None]:
def _steepest_index(slopes,sign):
    """Return the position of the largest finite value of sign*slopes."""
    scored = sign*np.asarray(slopes,dtype=float)
    # Repeated abscissa values make np.gradient return inf/nan: never pick those as tangents
    scored = np.where(np.isfinite(scored),scored,np.nan)
    if np.isnan(scored).all():
        raise ValueError("ERROR: no finite slope available to locate an inflection point")
    return int(np.nanargmax(scored))

def analyze_peak(peak_dict,verbose=False,xaxis='Time'):
    """
    Characterize a peak: area, position, inflection points, onset, endset and extrapolated peak.

    Input variables:
        peak_dict : a dictionary with 'time_start', 'time_end' and 'baseline_type' keys
        verbose : if True, print the peak integral and plot the result of the analysis
        xaxis : name of the data column used as abscissa for positions and tangents

    Action:
        Recompute the baseline with calc_baseline and store in peak_dict:
            'integral'          : integral over Time of the baseline-corrected heat flow
            'peak'              : (x, heat flow) of the minimum (negative integral) or maximum
                                  (otherwise) of the heat flow
            'inflection1/2'     : (slope, x, heat flow) at the steepest point before/after the peak
            'onset'             : intersection of the first inflection tangent with the left baseline
            'endset'            : intersection of the second inflection tangent with the right baseline
            'extrapolated_peak' : intersection of the two inflection tangents

    Raises:
        ValueError : the extremum lies on the edge of the window, no finite slope is available,
                     or two of the lines to be intersected are parallel
    """
    calc_baseline(peak_dict,xaxis)
    #
    filtered_data = peak_dict['filtered_data']
    x = filtered_data[xaxis].to_numpy(dtype=float)
    heat_flow = filtered_data['Heat-Flow'].to_numpy(dtype=float)
    slopes = filtered_data['dHF_dx'].to_numpy(dtype=float)
    # Compute the integral (always over Time, whatever xaxis is used for the positions)
    peak_integral = simpson(filtered_data['Heat-Flow-Clean'].to_numpy(dtype=float),x=filtered_data['Time'].to_numpy(dtype=float))
    peak_dict['integral'] = peak_integral
    if verbose : print(f'The peak integral is {peak_integral:8.6f}')
    # Peak position
    negative_peak = peak_integral < 0
    index_peak = int(np.argmin(heat_flow) if negative_peak else np.argmax(heat_flow))
    if index_peak == 0 or index_peak == len(heat_flow)-1 :
        raise ValueError("ERROR: the peak extremum lies on the edge of the selected window, widen time_start/time_end")
    peak_dict['peak'] = (x[index_peak],heat_flow[index_peak])
    # Inflection points (max and min of first derivative are zeros of second derivative).
    # Rows are in acquisition order: when xaxis decreases along the rows (e.g. a cooling scan
    # plotted against temperature) the sign of dHF/dx on each side of the peak is reversed.
    # NOTE(review): the direction is taken from the two end points, a non-monotonic xaxis is not handled
    x_direction = 1.0 if x[-1] >= x[0] else -1.0
    leading_sign = -x_direction if negative_peak else x_direction
    index_left = _steepest_index(slopes[:index_peak],leading_sign)
    peak_dict['inflection1'] = (slopes[index_left],x[index_left],heat_flow[index_left])
    index_right = index_peak + 1 + _steepest_index(slopes[index_peak+1:],-leading_sign)
    peak_dict['inflection2'] = (slopes[index_right],x[index_right],heat_flow[index_right])
    # Onset
    peak_dict['onset'] = intersection_of_lines(peak_dict['inflection1'],peak_dict['baseline-left'])
    # Endset
    peak_dict['endset'] = intersection_of_lines(peak_dict['inflection2'],peak_dict['baseline-right'])
    # Extrapolated Peak
    peak_dict['extrapolated_peak'] = intersection_of_lines(peak_dict['inflection1'],peak_dict['inflection2'])
    if verbose :
        plot_analysis(peak_dict,xaxis)

def plot_analysis(peak_dict,xaxis='Time'):
    """
    Plot the outcome of analyze_peak for one peak.

    Input variables:
        peak_dict : a dictionary already processed by analyze_peak, i.e. containing
                    'filtered_data' (with a 'baseline' column), 'baseline-left', 'baseline-right',
                    'inflection1', 'inflection2', 'onset', 'endset' and 'extrapolated_peak'
        xaxis : name of the data column used as abscissa; it should be the one used in analyze_peak

    Action:
        Show the heat flow, the left/right baseline lines, the final baseline, the onset, endset,
        extrapolated peak and inflection points, and the two inflection tangents (dotted grey)
    """
    # Unit quoted in the legend and on the axis: seconds for time, C for the two temperatures
    axis_units = {'Time': 's', 'Ts': 'C', 'Tr': 'C'}
    unit = axis_units.get(xaxis,'')
    filtered_data = peak_dict['filtered_data']
    x = filtered_data[xaxis]
    plt.plot(x,filtered_data['Heat-Flow'])
    plt.plot(x,line_of_tuple(x,peak_dict['baseline-left']))
    plt.plot(x,line_of_tuple(x,peak_dict['baseline-right']))
    plt.plot(x,filtered_data['baseline'])
    plt.scatter(peak_dict['extrapolated_peak'][0],peak_dict['extrapolated_peak'][1],label=f"Extrapolated Peak = {peak_dict['extrapolated_peak'][0]:6.2f} {unit}")
    plt.scatter(peak_dict['onset'][0],peak_dict['onset'][1],label=f"Onset = {peak_dict['onset'][0]:6.2f} {unit}")
    plt.scatter(peak_dict['endset'][0],peak_dict['endset'][1],label=f"Endset = {peak_dict['endset'][0]:6.2f} {unit}")
    inflection_points = [[peak_dict['inflection1'][1],peak_dict['inflection2'][1]],[peak_dict['inflection1'][2],peak_dict['inflection2'][2]]]
    plt.scatter(inflection_points[0],inflection_points[1],marker='x',label='Inflections')
    # Tangents are drawn only between the points where they cross the baseline and each other
    x_tmp = np.linspace(peak_dict['onset'][0],peak_dict['extrapolated_peak'][0],10)
    plt.plot(x_tmp,line_of_tuple(x_tmp,peak_dict['inflection1']),':',color='grey')
    x_tmp = np.linspace(peak_dict['extrapolated_peak'][0],peak_dict['endset'][0],10)
    plt.plot(x_tmp,line_of_tuple(x_tmp,peak_dict['inflection2']),':',color='grey')
    plt.xlabel(f'{xaxis} [{unit}]' if unit else xaxis)
    plt.ylabel('Heat Flow [W/g]')
    plt.legend()
    # Close the figure here, like the other plot functions, so consecutive analyses do not overlap
    plt.show()
    

## Visualize the Systems {-}

The following modules need to be installed on Colab. We won't need them much for this analysis, but they offer a lot of nice features for chemistry programming.

In [None]:
# @title Install and load RDKit { display-mode: "form" }
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import Draw
!pip install cirpy
import cirpy

In particular we can use them to draw the molecules in our experiments. While for some molecules you can just write their names and RDKit will plot them, for most molecules you will need to provide their SMILES or their CAS numbers. Luckily, CIRpy can usually find SMILES for you, if you type the common name correctly or if you know the CAS number. 

These are the SMILES for the molecules in your DSC experiments:
* paracetamol: 'CC(=O)Nc1ccc(O)cc1'

In [None]:
# @title Choose the molecule to draw { display-mode: "form" }
# The form value is kept in `molecule_input`, so that the Python builtin `input` is not overwritten
molecule_input = 'paracetamol' # @param {type:"string"}
input_type = 'name' # @param ["smiles", "name", "cas"] {allow-input: true}
if input_type != 'smiles' :
    # Names and CAS numbers are converted to SMILES by CIRpy, which returns None when nothing is found
    smiles = cirpy.resolve(molecule_input, 'smiles')
    if smiles is None :
        raise ValueError(f"ERROR: CIRpy could not find a SMILES string for '{molecule_input}'")
else:
    smiles = molecule_input
# RDKit returns None, instead of raising, when it cannot parse the SMILES string
molecule = Chem.MolFromSmiles(smiles)
if molecule is None :
    raise ValueError(f"ERROR: RDKit could not parse the SMILES string '{smiles}'")
img = Draw.MolToImage(molecule, size=(300, 300))
display(img)

## Loading the Data {-}

We need to load the Google Drive and access an example of a dataset from a DSC experiment. You can use the same set that I am using by downloading it from Canvas, [here](). Or you can use your own files. I am assuming the files in question will be located in a `DSC_Data/` subfolder of your `Colab Notebooks/` folder.

In [None]:
# @title Set Local Path { display-mode: "form" }
# Folder, relative to base_path, that holds your DSC data files (whitespace-separated text files such as 'indium.txt')
local_path="Colab Notebooks/DSC_Data/" # @param {type:"string"}
path = base_path+local_path

In [None]:
# NOTE(review): this unconditionally replaces the Drive-based `path` built in the previous cell with a
# folder relative to the current working directory; skip this cell if your data is on Google Drive.
path = './DSC_Data/'

In order to streamline the fitting of the data, we will be storing the file that corresponds to each experiment into a Python dictionary (`dict`), together with all the relevant information of that experiment and the parameters that we need for the fit. You can use the same statement in the following, but make sure to change the file name from 'indium.txt' (the one that I am using) to the one you want to use.

In [None]:
# One dictionary per peak: where the file is, the label used in the plots and the time window
# to analyze. filter_data and plot_peak read 'time_end': 0 as "up to the last recorded time".
# 'baseline_width' is not read by any of the functions visible in this notebook.
indium_melting = {
    'path': path,
    'name': 'indium.txt',
    'label': 'In melting',
    'time_start': 300,
    'time_end': 350,
    'baseline_width': 61,
}
indium_freezing = {
    'path': path,
    'name': 'indium.txt',
    'label': 'In freezing',
    'time_start': 0,
    'time_end': 0,
    'baseline_width': 61,
}

indium_peaks = [indium_melting, indium_freezing]

In [None]:
# Plot the heat flow of the two indium entries against the sample temperature (Ts) in one figure
plot_peak_list(indium_peaks,'Ts')

In [None]:
# (Re)read the file and store the table under indium_melting['data']
load_data_to_file_dict(indium_melting)

We should always check that the loaded data looks OK, for example by checking the number of columns and rows and, possibly, by plotting the data.

In [None]:
# Print a summary of the loaded table: number of rows, column names, non-null counts and types
indium_melting['data'].info()

## Adjust the Analysis Parameters {-}

In [None]:
# @title Find a reasonable time interval around the melting peak { display-mode: "form" }
time_start = 300 # @param {type:"number"}
time_end = 350 # @param {type:"number"}
baseline_type = 'line' # @param ['line','horizontal-line','integral','horizontal-integral'] {allow-input: true}
# Copy the form values into the peak dictionary, then build the baseline on the Ts axis
indium_melting.update(time_start=time_start, time_end=time_end, baseline_type=baseline_type)
calc_baseline(indium_melting, xaxis='Ts', verbose=True)

In [None]:
# analyze_peak stores its results in the dictionary and returns nothing, so there is nothing to assign
analyze_peak(indium_melting,xaxis='Ts',verbose=True)


In [None]:
# @title Find a reasonable time interval around the freezing peak { display-mode: "form" }
time_start = 580 # @param {type:"number"}
time_end = 620 # @param {type:"number"}
baseline_type = 'integral' # @param ['line','horizontal-line','integral','horizontal-integral'] {allow-input: true}
# Copy the form values into the peak dictionary, then build the baseline on the Ts axis
indium_freezing.update(time_start=time_start, time_end=time_end, baseline_type=baseline_type)
calc_baseline(indium_freezing, xaxis='Ts', verbose=True)

In [None]:
# Full analysis of the indium freezing peak on the Ts axis; verbose=True also prints the integral and plots the result
analyze_peak(indium_freezing,xaxis='Ts',verbose=True)

In [None]:
# The labels now name the sample of each file instead of the 'In melting' text copied from the indium cell
# NOTE(review): 'Paracetamol' is assumed from the file names and the molecule listed above; adjust if needed
para1 = {'path':path, 'name':'para1.txt', 'label':'Paracetamol 1', 'time_start': 0, 'time_end': 0}
para2 = {'path':path, 'name':'para2.txt', 'label':'Paracetamol 2', 'time_start': 0, 'time_end': 0}


In [None]:
# @title Find a reasonable time interval around the melting peak { display-mode: "form" }
time_start = 320 # @param {type:"number"}
time_end = 400 # @param {type:"number"}
baseline_type = 'integral' # @param ['line','horizontal-line','integral','horizontal-integral'] {allow-input: true}
# Copy the form values into the peak dictionary, then build the baseline on the Ts axis
para1.update(time_start=time_start, time_end=time_end, baseline_type=baseline_type)
calc_baseline(para1, xaxis='Ts', verbose=True)

In [None]:
# Full analysis of the para1 peak on the Ts axis; verbose=True also prints the integral and plots the result
analyze_peak(para1,xaxis='Ts',verbose=True)

In [None]:
# Two separate time windows of the same file, each analyzed as its own peak.
# The labels now name the sample and the window instead of the 'In melting' text copied from the indium cell
# NOTE(review): 'Paracetamol' is assumed from the file name and the molecule listed above; adjust if needed
para2_1 = {'path':path, 'name':'para2.txt', 'label':'Paracetamol 2, peak 1', 'time_start': 60, 'time_end': 200, 'baseline_type' : 'integral'}
para2_2 = {'path':path, 'name':'para2.txt', 'label':'Paracetamol 2, peak 2', 'time_start': 300, 'time_end': 380, 'baseline_type' : 'integral'}

In [None]:
# Build the baseline of the first para2 window on the Ts axis (analyze_peak recomputes it anyway)
calc_baseline(para2_1,xaxis='Ts',verbose=True)

In [None]:
# Full analysis of the first para2 window on the Ts axis; verbose=True also prints the integral and plots the result
analyze_peak(para2_1,xaxis='Ts',verbose=True)

In [None]:
# Full analysis of the second para2 window on the Ts axis; verbose=True also prints the integral and plots the result
analyze_peak(para2_2,verbose=True,xaxis='Ts')

In [None]:
# The label now names the sample instead of the 'In melting' text copied from the indium cell
cotton = {'path':path, 'name':'cotton.txt', 'label':'Cotton', 'time_start':600, 'time_end': 850, 'baseline_type': 'integral'}


In [None]:
# Full analysis of the cotton window on the Ts axis; verbose=True also prints the integral and plots the result
analyze_peak(cotton,verbose=True,xaxis='Ts')