In [31]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Tuple
from pathlib import Path
import re

# Load Cleaned Data

In [32]:
# Relative location of combined_cleaned.csv; it resolves against the kernel's working directory.
cleaned_data_path = r'../data/processed/combined_cleaned.csv'
# Read the whole file into a DataFrame using pandas' default CSV parsing options.
df_cleaned = pd.read_csv(filepath_or_buffer=cleaned_data_path)

In [33]:
# 0) Canonicalize the channel header: any column whose lower-cased, stripped
#    name is "channel" or "chanel" (and is not already exactly "Channel")
#    is renamed to "Channel".
rename_map = {
    col: "Channel"
    for col in df_cleaned.columns
    if col.lower().strip() in ("channel", "chanel") and col != "Channel"
}
if len(rename_map) > 0:
    df_cleaned.rename(columns=rename_map, inplace=True)

# 1) Find the source-file column regardless of header case or surrounding whitespace.
sf_candidates = [col for col in df_cleaned.columns if col.lower().strip() == "source_file"]
if sf_candidates:
    # When several headers match, the first one in column order wins.
    sf_col = sf_candidates[0]
else:
    raise KeyError("SOURCE_FILE column not found.")
# Cast to str so every entry (including missing ones) can be parsed as a filename.
sf = df_cleaned[sf_col].astype(str)

# 2) (re)build Channel from filename for missing values (do not overwrite existing non-null)
# Single-channel token: cfp / rfp / yfp preceded by start-of-string, '_' or '-',
# and followed by '_', '-', a digit, or end-of-string.
_pat_single = re.compile(r'(?:^|[_-])(cfp|rfp|yfp)(?=[_\-\d]|$)', re.IGNORECASE)
# Composite token: 'r' and 'y' with an optional '-' or '_' between them,
# using the same leading/trailing delimiter rules as above.
_pat_compo  = re.compile(r'(?:^|[_-])r[-_]?y(?=[_\-\d]|$)', re.IGNORECASE)


def channel_from_name(name: str):
    """Infer a channel label from a file name.

    The name is converted to ``str``, reduced to its stem (final path
    component without the last suffix) and lower-cased before matching.

    Parameters
    ----------
    name : str
        File name or path; any object is accepted because it is passed
        through ``str()`` first.

    Returns
    -------
    str or float
        ``"CFP"``, ``"RFP"`` or ``"YFP"`` when a single-channel token is
        found; otherwise ``"R-Y"`` when the composite token is found;
        otherwise ``np.nan``. The single-channel pattern is tried first, so
        it wins when both kinds of token occur in the same name.
    """
    stem = Path(str(name)).stem.lower()

    single_hit = _pat_single.search(stem)
    if single_hit is not None:
        return single_hit.group(1).upper()

    return "R-Y" if _pat_compo.search(stem) is not None else np.nan

# Channel label parsed from each row's source filename (NaN where nothing matches).
new_channel = sf.map(channel_from_name)

if "Channel" not in df_cleaned.columns:
    # No channel column yet: the filename-derived labels become the column.
    df_cleaned["Channel"] = new_channel
else:
    # Column already exists: fill only the missing entries, leave non-null values untouched.
    miss = df_cleaned["Channel"].isna()
    df_cleaned.loc[miss, "Channel"] = new_channel.loc[miss]

# 3) ChannelSource is no longer used; remove it when present (no-op otherwise).
df_cleaned.drop(columns=["ChannelSource"], errors="ignore", inplace=True)

# 4) Sanity output: column presence, the most frequent labels (NaN included),
#    and a side-by-side look at filename vs. derived channel.
print(f"Channel column exists: {'Channel' in df_cleaned.columns}")
print("Channel value counts (top):")
print(df_cleaned["Channel"].value_counts(dropna=False).iloc[:10])
print(df_cleaned.loc[:, [sf_col, "Channel"]].head())

Channel column exists: True
Channel value counts (top):
Channel
RFP    27019
CFP    15246
YFP    13784
NaN     4593
R-Y     2089
Name: count, dtype: int64
                     SOURCE_FILE Channel
0  160727_k5_CFP_16-32_spots.csv     CFP
1  160727_k5_CFP_16-32_spots.csv     CFP
2  160727_k5_CFP_16-32_spots.csv     CFP
3  160727_k5_CFP_16-32_spots.csv     CFP
4  160727_k5_CFP_16-32_spots.csv     CFP


In [34]:
# Preview the first five rows of the frame after the Channel column was (re)built.
df_cleaned.head(n=5)

Unnamed: 0,LABEL,ID,TRACK_ID,QUALITY,POSITION_X,POSITION_Y,POSITION_T,FRAME,RADIUS,VISIBILITY,...,ELLIPSE_MINOR,ELLIPSE_THETA,ELLIPSE_ASPECTRATIO,AREA,PERIMETER,CIRCULARITY,SOLIDITY,SHAPE_INDEX,SOURCE_FILE,Channel
0,ID2945,2945,0,209.0,145.948467,95.621082,3.0,3,8.110457,1,...,7.479732,1.179736,1.401931,206.652412,71.594489,0.50663,0.853061,4.980344,160727_k5_CFP_16-32_spots.csv,CFP
1,ID2946,2946,0,220.0,148.047821,94.433312,7.0,7,8.321153,1,...,5.863332,0.908996,2.316572,217.528855,89.493111,0.341309,0.810313,6.067799,160727_k5_CFP_16-32_spots.csv,CFP
2,ID2947,2947,0,195.0,145.259303,91.481847,11.0,11,7.834106,1,...,5.192997,1.017382,2.432088,192.809667,89.493111,0.302524,0.79918,6.445034,160727_k5_CFP_16-32_spots.csv,CFP
3,ID2948,2948,0,234.0,145.152217,91.656074,21.0,21,8.581834,1,...,6.053673,0.832374,1.992784,231.3716,85.515639,0.397584,0.855576,5.621996,160727_k5_CFP_16-32_spots.csv,CFP
4,ID2951,2951,0,219.0,143.997185,89.188898,12.0,12,8.30222,1,...,5.036212,1.023793,2.747553,216.540087,99.43679,0.275204,0.773852,6.757374,160727_k5_CFP_16-32_spots.csv,CFP


In [35]:
# Percentage of missing values per column in df_cleaned, rounded to two
# decimals, sorted from most to least missing; only the top 20 are shown.
(
    df_cleaned.isna()
    .sum()
    .div(len(df_cleaned))
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
    .head(20)
)

Channel                7.32
CONTRAST_CH1           0.00
SOURCE_FILE            0.00
SHAPE_INDEX            0.00
SOLIDITY               0.00
CIRCULARITY            0.00
PERIMETER              0.00
AREA                   0.00
ELLIPSE_ASPECTRATIO    0.00
ELLIPSE_THETA          0.00
ELLIPSE_MINOR          0.00
ELLIPSE_MAJOR          0.00
ELLIPSE_Y0             0.00
ELLIPSE_X0             0.00
SNR_CH1                0.00
LABEL                  0.00
ID                     0.00
TOTAL_INTENSITY_CH1    0.00
MAX_INTENSITY_CH1      0.00
MIN_INTENSITY_CH1      0.00
dtype: float64

In [36]:
# === Drop rows with missing/invalid Channel ===
# Rows are kept only when Channel is exactly one of these three labels; every
# other value (NaN and 'R-Y' in this dataset) is excluded from the core set.
valid_channels = {"CFP", "RFP", "YFP"}

before = len(df_cleaned)
# .copy() gives df_core its own data so later edits to it do not touch df_cleaned.
df_core = df_cleaned[df_cleaned["Channel"].isin(valid_channels)].copy()  # keep only clean rows
df_core.reset_index(drop=True, inplace=True)
after = len(df_core)

# Guard the percentage: with an empty df_cleaned, dividing by `before`
# would raise ZeroDivisionError inside the f-string below.
dropped = before - after
dropped_frac = dropped / before if before else 0.0

print(f"Dropped {dropped} rows ({dropped_frac:.2%}) "
      f"due to missing/invalid Channel (incl. NaN and 'R-Y').")
print("Channel counts in CORE:\n", df_core["Channel"].value_counts())

# (optional) save the clean core set
# df_core.to_csv("../data/processed/analysis_core.csv", index=False)


Dropped 6682 rows (10.65%) due to missing/invalid Channel (incl. NaN and 'R-Y').
Channel counts in CORE:
 Channel
RFP    27019
CFP    15246
YFP    13784
Name: count, dtype: int64


In [38]:
# Percentage of missing values per column in df_core, rounded to two
# decimals, sorted from most to least missing; only the top 20 are shown.
(
    df_core.isna()
    .sum()
    .div(len(df_core))
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
    .head(20)
)

LABEL                  0.0
CONTRAST_CH1           0.0
SOURCE_FILE            0.0
SHAPE_INDEX            0.0
SOLIDITY               0.0
CIRCULARITY            0.0
PERIMETER              0.0
AREA                   0.0
ELLIPSE_ASPECTRATIO    0.0
ELLIPSE_THETA          0.0
ELLIPSE_MINOR          0.0
ELLIPSE_MAJOR          0.0
ELLIPSE_Y0             0.0
ELLIPSE_X0             0.0
SNR_CH1                0.0
STD_INTENSITY_CH1      0.0
ID                     0.0
TOTAL_INTENSITY_CH1    0.0
MAX_INTENSITY_CH1      0.0
MIN_INTENSITY_CH1      0.0
dtype: float64

In [41]:
# Channel label frequencies in df_cleaned (NaN counted as its own bucket), first ten entries.
print(df_cleaned["Channel"].value_counts(dropna=False).iloc[:10])

Channel
RFP    27019
CFP    15246
YFP    13784
NaN     4593
R-Y     2089
Name: count, dtype: int64


In [40]:
# Channel label frequencies in df_core (NaN counted as its own bucket), first ten entries.
print(df_core["Channel"].value_counts(dropna=False).iloc[:10])

Channel
RFP    27019
CFP    15246
YFP    13784
Name: count, dtype: int64
