<a href="https://colab.research.google.com/github/AkiraNom/data-analysis-notebook/blob/main/Analysis_automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [189]:
import pandas as pd
import scipy.stats as stats
import plotly.express as px
import glob

import chardet
# pip3 install chardet

from tqdm.notebook import tqdm

In [11]:
file_paths = glob.glob('./drive/MyDrive/set2/*.csv')
file_paths

['./drive/MyDrive/set2/Week18_Gr4_11606.csv',
 './drive/MyDrive/set2/Week18_Gr4_11609.csv',
 './drive/MyDrive/set2/Week18_Gr4_11610.csv',
 './drive/MyDrive/set2/Week18_Gr5_11612.csv',
 './drive/MyDrive/set2/Week18_Gr5_11613.csv',
 './drive/MyDrive/set2/Week18_Gr5_11618.csv',
 './drive/MyDrive/set2/Week18_Gr5_11619.csv',
 './drive/MyDrive/set2/Week18_Gr5_11620.csv',
 './drive/MyDrive/set2/Week18_Gr5_11621.csv',
 './drive/MyDrive/set2/Week18_Gr5_11622.csv',
 './drive/MyDrive/set2/Week18_Gr5_11623.csv',
 './drive/MyDrive/set2/Week18_Gr5_11625.csv',
 './drive/MyDrive/set2/Week18_Gr5_11626.csv',
 './drive/MyDrive/set2/Week18_Gr5_11627.csv',
 './drive/MyDrive/set2/Week18_Gr4_11608.csv']

In [98]:
read_csv_files(file_paths)

reading csv files...


  0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,Exp_Id,TimeDate,RR Interval (ms),Heart Rate (BPM),PR Interval (ms),P Duration (ms),QRS Interval (ms),QT Interval (ms),QTc (ms),JT Interval (ms),Tpeak Tend Interval (ms)
0,1606,4/28/2022 0.28175,111.5,538.2,45.52,24.16,9.25,52.56,49.78,43.31,41.31
1,1609,4/29/2022 26.32075,119.3,503.1,42.51,23.14,10.5,48.57,44.48,38.07,
2,1610,4/30/2022 6.923,145.3,413.0,47.14,19.73,10.98,43.19,35.84,32.22,30.45
3,1612,4/26/2022 3.12825,112.4,534.0,44.42,21.55,10.42,46.43,43.8,36.02,34.17
4,1613,4/26/2022 1.52775,113.8,527.2,41.86,24.08,11.75,48.59,45.55,36.84,34.09
5,1618,4/30/2022 57.46375,109.2,549.6,46.69,24.91,10.49,52.21,49.97,41.72,40.22
6,1619,4/30/2022 5.4755,110.3,544.0,44.41,27.97,10.96,53.31,50.76,42.35,40.35
7,1620,4/30/2022 28.29975,138.2,434.1,43.25,27.5,10.75,57.07,48.54,46.32,45.07
8,1621,4/30/2022 3.3255,134.5,446.2,44.24,26.1,13.25,58.52,50.46,45.27,43.52
9,1622,5/2/2022 7.752,111.0,540.4,39.78,22.87,10.66,42.14,39.99,31.48,29.48


In [244]:
# define a folder path
def get_file_paths(folder_path):
  return glob.glob(f'{folder_path}*.csv')

# Step : check encoding type in csv
'''
CSV files exported by LabChart, ADInstrument are 'ISO-8859-1'.
In case you do not change an encoding option, 'UnicodeDecodeError' will be raised.
To avoid a error, check an encoding option before reading a file with pandas
'''
def get_csv_encoding(file_path):
  with open(file_path, "rb") as file:
    # Read the csv file in binary and check the encoding
    result = chardet.detect(file.read())
    encoding = result["encoding"]
  return encoding

# Step 1: Read multiple CSV files and create a DataFrame
def read_csv_files(file_paths):
    dfs = []

    print('reading csv files...')
    for file_path in tqdm(file_paths):
      try:
        # specify rows and columns to import
        import_rows = [0,1]
        import_cols = list(range(9))

        # check csv encoding type
        encoding = get_csv_encoding(file_path)

        df = pd.read_csv(file_path,
                         index_col = 0,
                         usecols=import_cols,
                         skiprows = lambda x: x not in import_rows,
                         encoding=encoding
                        )
        # add experimental ID
        myid = int(file_path[-8:-4])
        idx = 0
        df.insert(loc=idx,column = 'Exp_Id', value = myid)
        dfs.append(df)

      except UnicodeDecodeError:
        myid = int(file_path[-8:-4])
        print (f'Encording type error for id {myid}')

    return pd.concat(dfs, axis=0, ignore_index=True)

# Step load meta data
def read_meta_file(folder_path):
  file_path = glob.glob(f'{folder_path}meta/*.csv')[0]
  try:
    return pd.read_csv(file_path)

  except FileNotFoundError:
    print('Meta info is not present')

# Step 2: Run normality test on each column
def run_normality_tests(data):
    normality_results = {}
    for column in data.columns:
        _, p_value = stats.normaltest(data[column])
        if p_value < 0.05:
            normality_results[column] = False  # Not normally distributed
        else:
            normality_results[column] = True  # Normally distributed
    return normality_results

# Step 3: Run appropriate statistical tests based on normality
def run_statistical_tests(data, group_column):
    groups = data[group_column].unique()
    p_values = {}

    if len(groups) ==2:
      group1 = data[data[group_column] == groups[0]]
      group2 = data[data[group_column] == groups[1]]

      results = {}
      for column in data.columns:
        if column == group_column:
          continue  # Skip the grouping column itself

        if normality_results[column]:
            _, p_value = stats.ttest_ind(group1[column], group2[column])
        else:
            _, p_value = stats.mannwhitneyu(group1[column], group2[column])

        p_values[column] = p_value

    else:
        return None # for multi-grouop comparison as needed

    return p_values


# Step 4: Create interactive plots using Plotly
def create_interactive_plots(data, x_column, y_column):
    fig = px.scatter(data, x=x_column, y=y_column, color=x_column,
                     title=f'Interactive Plot: {x_column} vs. {y_column}')
    fig.show()

# if __name__ == "__main__":
#     # Replace 'file1.csv', 'file2.csv', etc. with your CSV file paths
#     file_paths = ['file1.csv', 'file2.csv']

#     # Step 1: Read CSV files and create a DataFrame
#     df = read_csv_files(file_paths)

#     # Step 2: Run normality tests on each column
#     normality_results = run_normality_tests(df)

#     # Step 3: Perform statistical tests based on normality
#     group_column = 'Group'  # Replace with the actual column name
#     target_column = 'Data'  # Replace with the actual column name
#     p_value = run_statistical_tests(df, group_column, target_column)

#     if p_value is not None:
#         if p_value < 0.05:
#             print(f"The groups are statistically different (p-value = {p_value})")
#         else:
#             print(f"No significant difference between groups (p-value = {p_value})")

#     # Step 4: Create interactive plots
#     create_interactive_plot(df, x_column=group_column, y_column=target_column)


In [245]:
# Replace 'file1.csv', 'file2.csv', etc. with your CSV file paths
folder_path = './drive/MyDrive/set2/'
file_paths = get_file_paths(folder_path)

# Step 1: Read CSV files and create a DataFrame
df = read_csv_files(file_paths)

meta = read_meta_file(folder_path)
df = meta.merge(df, how='left')

# Step 2: Run normality tests on each column
normality_results = run_normality_tests(df.iloc[:,3:])

 # Step 3: Perform statistical tests based on normality
group_column = 'Condition'

# trim two unnecessary cols
cols_drop = ['Exp_Id','TimeDate']
df_analysis = df.drop(cols_drop,axis=1)
p_values = run_statistical_tests(df_analysis, group_column)

for column, results in p_values.items():

  print(f"Column: {column}")
  if p_value < 0.05:
      print(f"Statistical difference between conditions (p-value = {p_value})")
  else:
      print(f"No significant difference between conditions (p-value = {p_value})")

# Step 4: Create interactive plots
create_interactive_plots(df, x_column=group_column, y_column=target_column)


reading csv files...


  0%|          | 0/15 [00:00<?, ?it/s]

Column: RR Interval (ms)
No significant difference between conditions (p-value = 0.6943278943278944)
Column: Heart Rate (BPM)
No significant difference between conditions (p-value = 0.6943278943278944)
Column: PR Interval (ms)
No significant difference between conditions (p-value = 0.6943278943278944)
Column: P Duration (ms)
No significant difference between conditions (p-value = 0.6943278943278944)
Column: QRS Interval (ms)
No significant difference between conditions (p-value = 0.6943278943278944)
Column: QT Interval (ms)
No significant difference between conditions (p-value = 0.6943278943278944)
Column: QTc (ms)
No significant difference between conditions (p-value = 0.6943278943278944)



kurtosistest only valid for n>=20 ... continuing anyway, n=15



In [243]:
df

Unnamed: 0,Exp_Id,Condition,TimeDate,RR Interval (ms),Heart Rate (BPM),PR Interval (ms),P Duration (ms),QRS Interval (ms),QT Interval (ms),QTc (ms),JT Interval (ms)
0,1606,treated,4/28/2022 0.28175,111.5,538.2,45.52,24.16,9.25,52.56,49.78,43.31
1,1609,control,4/29/2022 26.32075,119.3,503.1,42.51,23.14,10.5,48.57,44.48,38.07
2,1610,treated,4/30/2022 6.923,145.3,413.0,47.14,19.73,10.98,43.19,35.84,32.22
3,1612,control,4/26/2022 3.12825,112.4,534.0,44.42,21.55,10.42,46.43,43.8,36.02
4,1613,treated,4/26/2022 1.52775,113.8,527.2,41.86,24.08,11.75,48.59,45.55,36.84
5,1618,control,4/30/2022 57.46375,109.2,549.6,46.69,24.91,10.49,52.21,49.97,41.72
6,1619,treated,4/30/2022 5.4755,110.3,544.0,44.41,27.97,10.96,53.31,50.76,42.35
7,1620,control,4/30/2022 28.29975,138.2,434.1,43.25,27.5,10.75,57.07,48.54,46.32
8,1621,treated,4/30/2022 3.3255,134.5,446.2,44.24,26.1,13.25,58.52,50.46,45.27
9,1622,control,5/2/2022 7.752,111.0,540.4,39.78,22.87,10.66,42.14,39.99,31.48
