In [None]:
import os
import warnings
import csv
from collections import defaultdict
import matplotlib.pyplot as plt
from pathlib import Path
import scipy.stats as stats


# Everything below is located relative to the current working directory,
# which is expected to be the "scripts" folder.
script_path = Path.cwd()
if script_path.name != "scripts":
    # Emit the hint as a warning first, then stop execution.
    warnings.warn(
        "Please run this script from the 'scripts' directory to ensure that the data is loaded correctly."
    )
    raise SystemExit

# "data" (input) and "results" (output) are siblings of the working directory.
data_dir = script_path.parent / "data"
results_dir = script_path.parent / "results"

# The input folder is mandatory; the output folder is created when absent.
if not data_dir.exists():
    raise FileNotFoundError(
        f"Data directory {data_dir} not found. Please make sure to run this script from the 'scripts' directory."
    )
if not results_dir.exists():
    results_dir.mkdir(exist_ok=True)

# Report the resolved locations, one per line.
print(
    f"Current directory is {script_path}.",
    f"Data is loaded from {data_dir}.",
    f"Results will be saved in {results_dir}.",
    sep="\n",
)

# From here on, suppress every warning.
warnings.filterwarnings('ignore')

%config InlineBackend.figure_formats = ['svg']


In [None]:

# The three reaction temperatures (in °C) that are compared pairwise.
T1, T2, T3 = 90, 105, 120

def _load_yields_by_config(file_path):
    """Read the reaction CSV and group yields by configuration and temperature.

    The file must provide the columns ``Base``, ``Ligand``, ``Solvent``,
    ``Concentration``, ``Temp_C`` and ``yield``.

    Returns:
        A mapping ``{(base, ligand, solvent, concentration): {temp: [yields]}}``
        where ``concentration`` and the yields are floats and ``temp`` is an int.
    """
    data = defaultdict(lambda: defaultdict(list))

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader rather than by the file object.
    with open(file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            concentration = float(row['Concentration'])
            temp = int(row['Temp_C'])
            yield_value = float(row['yield'])

            config_key = (row['Base'], row['Ligand'], row['Solvent'], concentration)
            data[config_key][temp].append(yield_value)

    return data


def _plot_slopes(slopes, t1, t2):
    """Scatter the per-reaction slopes for one temperature pair; save and show it."""
    fig = plt.figure(figsize=(8, 6))
    plt.scatter(range(len(slopes)), slopes, color='#0e69af', edgecolor='black', s=15, alpha=0.8, linewidth=0.5)
    plt.xlabel('Reaction')
    plt.ylabel('Slope (Yield Change per °C)')
    plt.title(f"Temperature Combination: {t1}°C to {t2} °C")
    plt.grid(True)
    plt.savefig(results_dir / f"temp_comb_{t1}_{t2}_slope.svg")
    plt.show()
    # Release the figure explicitly; backends that do not close it on show()
    # would otherwise keep every figure of the loop alive.
    plt.close(fig)


def _plot_yield_correlation(yields_t1, yields_t2, t1, t2):
    """Print and plot how the yields at ``t2`` relate to the yields at ``t1``.

    Prints the Pearson correlation coefficient and the R² of a linear
    regression of ``yields_t2`` on ``yields_t1``, then draws a scatter plot
    with the regression line and a 1:1 reference line, saves it and shows it.
    """
    corr, _ = stats.pearsonr(yields_t1, yields_t2)
    print(f"Correlation between yields at {t1} °C and {t2} °C: {corr:.2f}")

    slope, intercept, r_value, _p_value, _std_err = stats.linregress(yields_t1, yields_t2)
    print(f"R2 value: {r_value**2:.2f}")

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(yields_t1, yields_t2, color='#0e69af', edgecolor='black', s=15, alpha=0.8, linewidth=0.5)

    # Both lines are drawn between the smallest and largest yield at t1.
    x = [min(yields_t1), max(yields_t1)]
    y = [slope * x_i + intercept for x_i in x]
    ax.plot(x, y, color='grey', label=f"R$^2$ = {r_value**2:.2f}, PCC = {corr:.2f}")
    ax.set_xlabel(f'Yield at {t1} °C')
    ax.set_ylabel(f'Yield at {t2} °C')
    ax.set_title(f"Temperature Combination: {t1} °C to {t2} °C")
    ax.plot(x, x, color='black', linestyle='--', label='1:1 line')
    ax.grid(True)
    ax.legend()
    plt.savefig(results_dir / f"temp_comb_{t1}_{t2}_correlation.svg")
    plt.show()
    plt.close(fig)


def analyze_data(file_path):
    """Compare reaction yields between each pair of the temperatures T1, T2, T3.

    For every pair ``(t1, t2)`` out of ``(T1, T2)``, ``(T1, T3)``, ``(T2, T3)``
    the function collects all configurations (base, ligand, solvent,
    concentration) that have a yield at both temperatures and then

    * plots the slope ``(yield_t2 - yield_t1) / (t2 - t1)`` per configuration
      and saves it as ``temp_comb_{t1}_{t2}_slope.svg``;
    * prints the Pearson correlation and R² between the two yield series and
      saves a scatter plot as ``temp_comb_{t1}_{t2}_correlation.svg``.

    Figures are written to the module-level ``results_dir``. If a
    configuration has several yields at one temperature, only the first one
    read from the file is used.

    Args:
        file_path: Path of the CSV file to analyse.

    Returns:
        None. The results are printed, shown and saved as SVG files.
    """
    data = _load_yields_by_config(file_path)

    for t1, t2 in [(T1, T2), (T1, T3), (T2, T3)]:
        yields_t1: list[float] = []
        yields_t2: list[float] = []

        for temp_data in data.values():
            # Membership tests do not create entries in the inner defaultdict.
            if t1 in temp_data and t2 in temp_data:
                yields_t1.append(temp_data[t1][0])
                yields_t2.append(temp_data[t2][0])

        slopes = [(y2 - y1) / (t2 - t1) for y1, y2 in zip(yields_t1, yields_t2)]
        _plot_slopes(slopes, t1, t2)

        # Correlation and regression are undefined for fewer than two points
        # (scipy raises); report the situation instead of aborting the run.
        if len(yields_t1) < 2:
            print(
                f"Skipping correlation for {t1} °C and {t2} °C: "
                f"need at least 2 paired reactions, found {len(yields_t1)}."
            )
            continue

        _plot_yield_correlation(yields_t1, yields_t2, t1, t2)


# Input file: the "DirectArylation.csv" table inside the data directory.
csv_file_path = data_dir.joinpath("DirectArylation.csv")

# Run the analysis; this prints the statistics and produces the plots.
analyze_data(csv_file_path)