In [112]:
import pandas as pd 
from dataclasses import dataclass
from typing import List
from datetime import datetime


In [113]:
df_nov=pd.read_excel(r"..\Data\LineC_SessionData-individual rats_SampleData.xlsx")
df_march=pd.read_excel(r"..\Data\Snifftime data_March15.2019_example_withIndividualSniffs.xlsx")
df_sept=pd.read_excel(r"..\Data\AutomatedCage_Sept2023-24_raw data.xlsx")

In [114]:
df_sept.columns

Index(['RAT_NAME', 'ID_SAMPLE', 'LEVEL_NAME', 'RUN', 'HOLE', 'ID_BL_DOTS',
       'ID_GXP_DOTS', 'ID_BL_APOPO', 'EXECUTED', 'HIT', 'REWARD', 'SniffTime',
       'ReadTotalSnifftime', 'SESSION_DATE', 'ID_EVALUATION_SESSION',
       'STATUS_BLINDPOS', 'SAMPLE_TYPE', 'tblRAT_SESSION_REMARKS', 'REUSED',
       'tblEVALUATION_SniffThreshold', 'tblRAT_SESSION_SniffThreshold',
       'CONFIGURATION_NAME', 'TEMPERATURE', 'STATUS_KNOWNPOS', 'START_TIME',
       'END_TIME', 'DATE_INCOMING', 'tblEVALUATION_SESSION_REMARKS', 'Trainer',
       'Documenter', 'Handler', 'DOTS_NAME', 'ID_CONFIGURATION'],
      dtype='object')

In [115]:
rename_dict = {
    "RAT_NAME": "Rat_Name",
    "ID_EVALUATION_SESSION": "Evaluation_Session_ID",
    "RUN": "Run",
    "HOLE": "Hole",
    "SniffTime": "Sniff_Time",
    "tblEVALUATION_SniffThreshold": "Threshold_Time",
    'tblEVALUATION.SniffThreshold':"Threshold_Time",
    "SniffThreshold":"Threshold_Time",
    "SESSION_DATE" : "Date", 
    "ID_SESSION": "Sample_ID",
    "ID_SAMPLE": "Sample_ID",
    "age":"Age"
    # "ID_BL_DOTS": "Lab_Test"
}

In [116]:
df_sept = df_sept.rename(columns=rename_dict)

In [117]:
def _positive_sample_rat(row): 
    if row['Sniff_Time']>=row['Threshold_Time']: 
        return 1 
    else: 
        return 0

def _positive_sample_lab(row): 
    if row['ID_BL_DOTS']==0:
        return pd.NA
    elif row['ID_BL_DOTS']>1: 
        return 1
    else: 
        return 0


def false_positive(rat, lab):
    """Return 1 if rat says positive but lab test is negative, else 0"""
    return 1 if rat == 1 and lab == 0 else 0

def false_negative(rat, lab):
    """Return 1 if rat says negative but lab test is positive, else 0"""
    return 1 if rat == 0 and lab == 1 else 0

def true_positive(rat, lab):
    """Return 1 if rat says positive and lab test is positive, else 0"""
    return 1 if rat == 1 and lab == 1 else 0

def true_negative(rat, lab):
    """Return 1 if rat says negative and lab test is negative, else 0"""
    return 1 if rat == 0 and lab == 0 else 0


In [118]:
df_sept['Result_Of_Rat']=df_sept.apply(lambda x: _positive_sample_rat(x),axis=1)
df_sept['Lab_Test']=df_sept.apply(lambda x: _positive_sample_lab(x),axis=1)
df_sept["FP"] = df_sept.apply(lambda row: false_positive(row["Result_Of_Rat"], row["Lab_Test"]), axis=1)
df_sept["FN"] = df_sept.apply(lambda row: false_negative(row["Result_Of_Rat"], row["Lab_Test"]), axis=1)
df_sept["TP"]  = df_sept.apply(lambda row: true_positive(row["Result_Of_Rat"], row["Lab_Test"]), axis=1)
df_sept["TN"]  = df_sept.apply(lambda row: true_negative(row["Result_Of_Rat"], row["Lab_Test"]), axis=1)
df_sept["Age"]=""
df_sept["Weight"]=""
df_sept["Gender"]=""

In [119]:
# Select and rename columns, add empty columns, and calculate new columns for df_march
df_march_mod = df_march.rename(columns=rename_dict)

# Keep only the required columns (including those to be created)
columns_needed = [
     'Rat_Name',
     'Evaluation_Session_ID',
     'Gender',
     'Age',
     'Weight',
     'Run',
     'Hole',
     'Sniff_Time',
     'Threshold_Time',
     'Result_Of_Rat',
     'Lab_Test',
     "Date",
     'Sample_ID'
 ]

# Add empty columns for Gender and Weight
df_march_mod['Gender'] = ''
df_march_mod['Weight'] = ''
df_march_mod['Sniff_Time'] = df_march_mod['Sniff_Time'].fillna(0)

# Calculate Result_Of_Rat: 1 if Sniff_Time > Threshold_Time, else 0
df_march_mod['Result_Of_Rat'] = (df_march_mod['Sniff_Time'] > df_march_mod['Threshold_Time']).astype(int)

# Calculate Lab_Test: 1 if ID_BL_DOTS > 1, else 0 if == 1
df_march_mod['Lab_Test'] = df_march['ID_BL_DOTS'].apply(lambda x: 1 if x > 1 else (0 if x == 1 else None))

# Reorder columns to match the requested order
df_march_mod = df_march_mod[columns_needed]

# Display the first few rows to check


In [120]:
# Create FP, FN, TP, TN columns based on Lab_Test (ground truth) and Result_Of_Rat (prediction)
df_march_mod['TP'] = ((df_march_mod['Lab_Test'] == 1) & (df_march_mod['Result_Of_Rat'] == 1)).astype(int)
df_march_mod['TN'] = ((df_march_mod['Lab_Test'] == 0) & (df_march_mod['Result_Of_Rat'] == 0)).astype(int)
df_march_mod['FP'] = ((df_march_mod['Lab_Test'] == 0) & (df_march_mod['Result_Of_Rat'] == 1)).astype(int)
df_march_mod['FN'] = ((df_march_mod['Lab_Test'] == 1) & (df_march_mod['Result_Of_Rat'] == 0)).astype(int)



In [121]:
df_march_mod.columns

Index(['Rat_Name', 'Evaluation_Session_ID', 'Gender', 'Age', 'Weight', 'Run',
       'Hole', 'Sniff_Time', 'Threshold_Time', 'Result_Of_Rat', 'Lab_Test',
       'Date', 'Sample_ID', 'TP', 'TN', 'FP', 'FN'],
      dtype='object')

In [122]:
# Select and rename only the columns you want
df_nov_selected = df_nov.rename(columns=rename_dict).copy()

# Calculate Age
df_nov_selected['Age'] = ((
    pd.to_datetime(df_nov['SESSION_DATE']) - pd.to_datetime(df_nov['Birthdate'])
).dt.days / 365.25).round(2)

df_nov_selected['Run'] = ''
df_nov_selected['Hole'] = ''
# Reorder to match your desired structure
final_columns = [
    'Rat_Name', 'Evaluation_Session_ID', 'Gender', 'Age', 'Weight',
    'Run', 'Hole', 'Sniff_Time', 'Threshold_Time', 'ID_BL_DOTS',"Date"
]



df_nov_final = df_nov_selected.copy()

df_nov_final.loc[:, 'Lab_Test'] = df_nov_final['ID_BL_DOTS'].apply(lambda x: 1 if x == 1 else 0)
df_nov_final = df_nov_final.drop(columns=['ID_BL_DOTS'])

df_nov_final = df_nov_final.copy()
df_nov_final.loc[:, 'Result_Of_Rat'] = (
    df_nov_final['Sniff_Time'] >= df_nov_final['Threshold_Time']
).astype(int)

df_nov_final = df_nov_final.copy()

# FP: Rat says Positive (1), Lab says Negative (0)
df_nov_final['FP'] = ((df_nov_final['Result_Of_Rat'] == 1) & (df_nov_final['Lab_Test'] == 0)).astype(int)

# FN: Rat says Negative (0), Lab says Positive (1)
df_nov_final['FN'] = ((df_nov_final['Result_Of_Rat'] == 0) & (df_nov_final['Lab_Test'] == 1)).astype(int)

# TP: Rat says Positive (1), Lab says Positive (1)
df_nov_final['TP'] = ((df_nov_final['Result_Of_Rat'] == 1) & (df_nov_final['Lab_Test'] == 1)).astype(int)

# TN: Rat says Negative (0), Lab says Negative (0)
df_nov_final['TN'] = ((df_nov_final['Result_Of_Rat'] == 0) & (df_nov_final['Lab_Test'] == 0)).astype(int)

['Date', 'Evaluation_Session_ID', 'ID_RAT', 'Name', 'Birthdate', 'Gender', 'WEIGHT', 'TEMPERATURE', 'Sample_ID', 'ID_BL_APOPO', 'STATUS_KNOWNPOS', 'STATUS_BLINDPOS', 'REWARD', 'HIT', 'Sniff_Time', 'Threshold_Time', 'Age', 'Run', 'Hole', 'Lab_Test', 'Result_Of_Rat', 'FP', 'FN', 'TP', 'TN']


In [123]:
df_nov_final = df_nov_final.rename(columns={'Name': 'Rat_Name', 'WEIGHT': 'Weight'})
df_nov_final = df_nov_final.drop(columns=['Birthdate', 'TEMPERATURE', 'ID_BL_APOPO', 'STATUS_KNOWNPOS', 'STATUS_BLINDPOS', 'REWARD', 'HIT', ])

In [124]:
df_nov_final = df_nov_final.drop(columns=['ID_RAT'])

In [125]:
# Get column sets for each dataframe
nov_cols = set(df_nov_final.columns)
march_cols = set(df_march_mod.columns)
sept_cols = set(df_sept.columns)

# Columns common to all three
common_cols = nov_cols & march_cols & sept_cols

# Columns unique to each dataframe
nov_only = nov_cols - (march_cols | sept_cols)
march_only = march_cols - (nov_cols | sept_cols)
sept_only = sept_cols - (nov_cols | march_cols)

# Columns present in any two but not all three
nov_march = (nov_cols & march_cols) - sept_cols
nov_sept = (nov_cols & sept_cols) - march_cols
march_sept = (march_cols & sept_cols) - nov_cols

print("Columns common to all three:")
print(sorted(common_cols))

print("\nColumns only in df_nov_final:")
print(sorted(nov_only))

print("\nColumns only in df_march_mod:")
print(sorted(march_only))

print("\nColumns only in df_sept:")
print(sorted(sept_only))

print("\nColumns in df_nov_final and df_march_mod only:")
print(sorted(nov_march))

print("\nColumns in df_nov_final and df_sept only:")
print(sorted(nov_sept))

print("\nColumns in df_march_mod and df_sept only:")
print(sorted(march_sept))

Columns common to all three:
['Age', 'Date', 'Evaluation_Session_ID', 'FN', 'FP', 'Gender', 'Hole', 'Lab_Test', 'Rat_Name', 'Result_Of_Rat', 'Run', 'Sample_ID', 'Sniff_Time', 'TN', 'TP', 'Threshold_Time', 'Weight']

Columns only in df_nov_final:
[]

Columns only in df_march_mod:
[]

Columns only in df_sept:
['CONFIGURATION_NAME', 'DATE_INCOMING', 'DOTS_NAME', 'Documenter', 'END_TIME', 'EXECUTED', 'HIT', 'Handler', 'ID_BL_APOPO', 'ID_BL_DOTS', 'ID_CONFIGURATION', 'ID_GXP_DOTS', 'LEVEL_NAME', 'REUSED', 'REWARD', 'ReadTotalSnifftime', 'SAMPLE_TYPE', 'START_TIME', 'STATUS_BLINDPOS', 'STATUS_KNOWNPOS', 'TEMPERATURE', 'Trainer', 'tblEVALUATION_SESSION_REMARKS', 'tblRAT_SESSION_REMARKS', 'tblRAT_SESSION_SniffThreshold']

Columns in df_nov_final and df_march_mod only:
[]

Columns in df_nov_final and df_sept only:
[]

Columns in df_march_mod and df_sept only:
[]


In [126]:
cols_to_drop = [
    'CONFIGURATION_NAME', 'DATE_INCOMING', 'DOTS_NAME', 'Documenter', 'END_TIME', 'EXECUTED', 'HIT', 'Handler',
    'ID_BL_APOPO', 'ID_BL_DOTS', 'ID_CONFIGURATION', 'ID_GXP_DOTS', 'LEVEL_NAME', 'REUSED', 'REWARD',
    'ReadTotalSnifftime', 'SAMPLE_TYPE', 'START_TIME', 'STATUS_BLINDPOS', 'STATUS_KNOWNPOS', 'TEMPERATURE',
    'Trainer', 'tblEVALUATION_SESSION_REMARKS', 'tblRAT_SESSION_REMARKS', 'tblRAT_SESSION_SniffThreshold'
]
df_sept = df_sept.drop(columns=cols_to_drop, errors='ignore')

In [127]:
# Get column sets for each dataframe
nov_cols = set(df_nov_final.columns)
march_cols = set(df_march_mod.columns)
sept_cols = set(df_sept.columns)

# Columns common to all three
common_cols = nov_cols & march_cols & sept_cols

# Columns unique to each dataframe
nov_only = nov_cols - (march_cols | sept_cols)
march_only = march_cols - (nov_cols | sept_cols)
sept_only = sept_cols - (nov_cols | march_cols)

# Columns present in any two but not all three
nov_march = (nov_cols & march_cols) - sept_cols
nov_sept = (nov_cols & sept_cols) - march_cols
march_sept = (march_cols & sept_cols) - nov_cols

print("Columns common to all three:")
print(sorted(common_cols))

print("\nColumns only in df_nov_final:")
print(sorted(nov_only))

print("\nColumns only in df_march_mod:")
print(sorted(march_only))

print("\nColumns only in df_sept:")
print(sorted(sept_only))

print("\nColumns in df_nov_final and df_march_mod only:")
print(sorted(nov_march))

print("\nColumns in df_nov_final and df_sept only:")
print(sorted(nov_sept))

print("\nColumns in df_march_mod and df_sept only:")
print(sorted(march_sept))

Columns common to all three:
['Age', 'Date', 'Evaluation_Session_ID', 'FN', 'FP', 'Gender', 'Hole', 'Lab_Test', 'Rat_Name', 'Result_Of_Rat', 'Run', 'Sample_ID', 'Sniff_Time', 'TN', 'TP', 'Threshold_Time', 'Weight']

Columns only in df_nov_final:
[]

Columns only in df_march_mod:
[]

Columns only in df_sept:
[]

Columns in df_nov_final and df_march_mod only:
[]

Columns in df_nov_final and df_sept only:
[]

Columns in df_march_mod and df_sept only:
[]


In [128]:
columns_needed = [
    'Rat_Name',
    'Evaluation_Session_ID',
    'Gender',
    'Age',
    'Weight',
    'Run',
    'Hole',
    'Sniff_Time',
    'Threshold_Time',
    'Result_Of_Rat',
    'Lab_Test',
    "Date", 
    "TP",
    "FP",
    "TN",
    "FN", 
    "Sample_ID"
]

In [129]:
# Ensure TP, TN, FN, FP columns are integer type in all three dataframes
for df in [df_sept, df_march_mod, df_nov_final]:
    for col in ["TP", "TN", "FP", "FN"]:
        df[col] = df[col].fillna(0).astype(int)

In [130]:
final_df= pd.concat([df_sept,df_march_mod,df_nov_final])


In [131]:
# Show basic info and missing values for each column
print("DataFrame shape:", final_df.shape)
print("\nColumn data types and non-null counts:")
print(final_df.info())

print("\nMissing values per column:")
print(final_df.isnull().sum())

print("\nSummary statistics for numeric columns:")
print(final_df.describe(include=[float, int]))

print("\nSummary statistics for object columns:")
print(final_df.describe(include=[object]))

# Show unique values for categorical/object columns (first 10 unique values)
for col in final_df.select_dtypes(include=[object]).columns:
    print(f"\nColumn: {col}")
    print(final_df[col].unique()[:10])

DataFrame shape: (240223, 17)

Column data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
Index: 240223 entries, 0 to 34852
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Rat_Name               240223 non-null  object        
 1   Sample_ID              240223 non-null  int64         
 2   Run                    240223 non-null  object        
 3   Hole                   240223 non-null  object        
 4   Sniff_Time             240223 non-null  int64         
 5   Date                   240223 non-null  datetime64[ns]
 6   Evaluation_Session_ID  240223 non-null  int64         
 7   Threshold_Time         240223 non-null  int64         
 8   Result_Of_Rat          240223 non-null  int64         
 9   Lab_Test               240223 non-null  int64         
 10  FP                     240223 non-null  int64         
 11  FN                     240223 non-null  

In [132]:
final_df.head()

Unnamed: 0,Rat_Name,Sample_ID,Run,Hole,Sniff_Time,Date,Evaluation_Session_ID,Threshold_Time,Result_Of_Rat,Lab_Test,FP,FN,TP,TN,Age,Weight,Gender
0,Ella,569474,A,1,784,2023-09-01,22012,1000,0,1,0,1,0,0,,,
1,Ella,525681,A,2,724,2023-09-01,22012,1000,0,1,0,1,0,0,,,
2,Ella,548001,A,3,2599,2023-09-01,22012,0,1,1,0,0,1,0,,,
3,Ella,525331,B,1,2576,2023-09-01,22012,1000,1,1,0,0,1,0,,,
4,Ella,525587,B,2,755,2023-09-01,22012,1000,0,1,0,1,0,0,,,


In [133]:
# Show basic info and missing values for each column
print("DataFrame shape:", final_df.shape)
print("\nColumn data types and non-null counts:")
print(final_df.info())

print("\nMissing values per column:")
print(final_df.isnull().sum())

print("\nSummary statistics for numeric columns:")
print(final_df.describe(include=[float, int]))

print("\nSummary statistics for object columns:")
print(final_df.describe(include=[object]))

# Show unique values for categorical/object columns (first 10 unique values)
for col in final_df.select_dtypes(include=[object]).columns:
    print(f"\nColumn: {col}")
    print(final_df[col].unique()[:10])

DataFrame shape: (240223, 17)

Column data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
Index: 240223 entries, 0 to 34852
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Rat_Name               240223 non-null  object        
 1   Sample_ID              240223 non-null  int64         
 2   Run                    240223 non-null  object        
 3   Hole                   240223 non-null  object        
 4   Sniff_Time             240223 non-null  int64         
 5   Date                   240223 non-null  datetime64[ns]
 6   Evaluation_Session_ID  240223 non-null  int64         
 7   Threshold_Time         240223 non-null  int64         
 8   Result_Of_Rat          240223 non-null  int64         
 9   Lab_Test               240223 non-null  int64         
 10  FP                     240223 non-null  int64         
 11  FN                     240223 non-null  

In [134]:
bieber_stats = final_df[final_df['Rat_Name'] == 'Bieber'][['TP', 'FP']].sum()
print(f"Bieber - True Positives: {bieber_stats['TP']}, False Positives: {bieber_stats['FP']}")

Bieber - True Positives: 1729, False Positives: 12098


In [135]:
import numpy as np



In [136]:
final_df['Age'] = final_df['Age'].replace('', pd.NA)
final_df['Weight'] = final_df['Weight'].replace('', pd.NA)
final_df['Weight'] = final_df['Weight'].replace(np.nan, pd.NA)

In [137]:
# Show basic info and missing values for each column
print("DataFrame shape:", final_df.shape)
print("\nColumn data types and non-null counts:")
print(final_df.info())

print("\nMissing values per column:")
print(final_df.isnull().sum())

print("\nSummary statistics for numeric columns:")
print(final_df.describe(include=[float, int]))

print("\nSummary statistics for object columns:")
print(final_df.describe(include=[object]))

# Show unique values for categorical/object columns (first 10 unique values)
for col in final_df.select_dtypes(include=[object]).columns:
    print(f"\nColumn: {col}")
    print(final_df[col].unique()[:10])

DataFrame shape: (240223, 17)

Column data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
Index: 240223 entries, 0 to 34852
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Rat_Name               240223 non-null  object        
 1   Sample_ID              240223 non-null  int64         
 2   Run                    240223 non-null  object        
 3   Hole                   240223 non-null  object        
 4   Sniff_Time             240223 non-null  int64         
 5   Date                   240223 non-null  datetime64[ns]
 6   Evaluation_Session_ID  240223 non-null  int64         
 7   Threshold_Time         240223 non-null  int64         
 8   Result_Of_Rat          240223 non-null  int64         
 9   Lab_Test               240223 non-null  int64         
 10  FP                     240223 non-null  int64         
 11  FN                     240223 non-null  

In [138]:
rows_with_zero_threshold_and_positive_sniff = final_df[(final_df['Threshold_Time'] == 0) & (final_df['Sniff_Time'] > 0)]
print(f"Number of rows where Threshold_Time == 0 and Sniff_Time > 0: {len(rows_with_zero_threshold_and_positive_sniff)}")
rows_with_zero_threshold_and_positive_sniff.head()

Number of rows where Threshold_Time == 0 and Sniff_Time > 0: 136467


Unnamed: 0,Rat_Name,Sample_ID,Run,Hole,Sniff_Time,Date,Evaluation_Session_ID,Threshold_Time,Result_Of_Rat,Lab_Test,FP,FN,TP,TN,Age,Weight,Gender
2,Ella,548001,A,3,2599,2023-09-01,22012,0,1,1,0,0,1,0,,,
5,Ella,525635,B,3,2590,2023-09-01,22012,0,1,1,0,0,1,0,,,
8,Ella,525442,C,3,2600,2023-09-01,22012,0,1,1,0,0,1,0,,,
11,Chilleta,548001,A,3,3094,2023-09-01,22012,0,1,1,0,0,1,0,,,
14,Chilleta,525635,B,3,3091,2023-09-01,22012,0,1,1,0,0,1,0,,,


In [139]:
# Impute or remove rows with Threshold_Time == 0
def impute_or_remove_threshold(df):
    # Find rats with at least one Threshold_Time > 0
    rat_thresholds = df[df['Threshold_Time'] > 0].groupby('Rat_Name')['Threshold_Time'].agg('first').to_dict()
    
    # Function to impute or mark for removal
    def fix_threshold(row):
        if row['Threshold_Time'] == 0:
            rat = row['Rat_Name']
            if rat in rat_thresholds:
                return rat_thresholds[rat]
            else:
                return pd.NA  # Mark for removal
        else:
            return row['Threshold_Time']
    
    df['Threshold_Time'] = df.apply(fix_threshold, axis=1)
    # Remove rows where Threshold_Time is still NA (i.e., rat never had >0 threshold)
    df = df[df['Threshold_Time'].notna()]
    # Convert Threshold_Time to int if possible
    df['Threshold_Time'] = df['Threshold_Time'].astype(int)
    return df

final_df = impute_or_remove_threshold(final_df)

In [140]:
zero_threshold_count = (final_df['Threshold_Time'] == 0).sum()
print(f"Number of rows with Threshold_Time == 0: {zero_threshold_count}")

Number of rows with Threshold_Time == 0: 0


In [141]:
# Recompute Result_Of_Rat: 1 if Sniff_Time > Threshold_Time, else 0
final_df['Result_Of_Rat'] = (final_df['Sniff_Time'] > final_df['Threshold_Time']).astype(int)

# Recompute confusion matrix metrics
final_df['TP'] = ((final_df['Lab_Test'] == 1) & (final_df['Result_Of_Rat'] == 1)).astype(int)
final_df['TN'] = ((final_df['Lab_Test'] == 0) & (final_df['Result_Of_Rat'] == 0)).astype(int)
final_df['FP'] = ((final_df['Lab_Test'] == 0) & (final_df['Result_Of_Rat'] == 1)).astype(int)
final_df['FN'] = ((final_df['Lab_Test'] == 1) & (final_df['Result_Of_Rat'] == 0)).astype(int)

In [142]:
# Show basic info and missing values for each column
print("DataFrame shape:", final_df.shape)
print("\nColumn data types and non-null counts:")
print(final_df.info())

print("\nMissing values per column:")
print(final_df.isnull().sum())

print("\nSummary statistics for numeric columns:")
print(final_df.describe(include=[float, int]))

print("\nSummary statistics for object columns:")
print(final_df.describe(include=[object]))

# Show unique values for categorical/object columns (first 10 unique values)
for col in final_df.select_dtypes(include=[object]).columns:
    print(f"\nColumn: {col}")
    print(final_df[col].unique()[:10])

DataFrame shape: (240223, 17)

Column data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
Index: 240223 entries, 0 to 34852
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Rat_Name               240223 non-null  object        
 1   Sample_ID              240223 non-null  int64         
 2   Run                    240223 non-null  object        
 3   Hole                   240223 non-null  object        
 4   Sniff_Time             240223 non-null  int64         
 5   Date                   240223 non-null  datetime64[ns]
 6   Evaluation_Session_ID  240223 non-null  int64         
 7   Threshold_Time         240223 non-null  int64         
 8   Result_Of_Rat          240223 non-null  int64         
 9   Lab_Test               240223 non-null  int64         
 10  FP                     240223 non-null  int64         
 11  FN                     240223 non-null  

In [143]:
# Show basic info and missing values for each column
print("DataFrame shape:", final_df.shape)
print("\nColumn data types and non-null counts:")
print(final_df.info())

print("\nMissing values per column:")
print(final_df.isnull().sum())

print("\nSummary statistics for numeric columns:")
print(final_df.describe(include=[float, int]))

print("\nSummary statistics for object columns:")
print(final_df.describe(include=[object]))

# Show unique values for categorical/object columns (first 10 unique values)
for col in final_df.select_dtypes(include=[object]).columns:
    print(f"\nColumn: {col}")
    print(final_df[col].unique()[:10])

DataFrame shape: (240223, 17)

Column data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
Index: 240223 entries, 0 to 34852
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Rat_Name               240223 non-null  object        
 1   Sample_ID              240223 non-null  int64         
 2   Run                    240223 non-null  object        
 3   Hole                   240223 non-null  object        
 4   Sniff_Time             240223 non-null  int64         
 5   Date                   240223 non-null  datetime64[ns]
 6   Evaluation_Session_ID  240223 non-null  int64         
 7   Threshold_Time         240223 non-null  int64         
 8   Result_Of_Rat          240223 non-null  int64         
 9   Lab_Test               240223 non-null  int64         
 10  FP                     240223 non-null  int64         
 11  FN                     240223 non-null  

In [144]:
final_df.to_excel(r"..\Data\Report_data_final.xlsx")

In [111]:
num_unique_rats = final_df['Rat_Name'].nunique()
print(f"Number of different rat names in final_df: {num_unique_rats}")

Number of different rat names in final_df: 36
