In [None]:
# Mount Google Drive at /content/drive; the dataset paths read later in this
# notebook live under that directory.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np


# 1. Loading the Cleaned Archival and Primary Datasets

try:
    # Import both the archival dataset (N=105) and the primary survey dataset (N=45).
    # Both files must be present under the mounted Google Drive folder; otherwise
    # an error is reported and execution stops.
    archival_df = pd.read_csv('/content/drive/MyDrive/MS THESIS/cleaned_archival_n105.csv')
    primary_df = pd.read_csv('/content/drive/MyDrive/MS THESIS/cleaned_primary_data_n45.csv')
except FileNotFoundError:
    print("Error: Files not found. Ensure paths are correct and files are uploaded.")
    # Re-raise so the cell stops at the real cause. Without this, execution would
    # continue and fail on the first use of `archival_df` with a misleading
    # NameError (or silently reuse stale frames left over in the kernel).
    raise


# 2. Standardizing Column Names Across Datasets
# Both sources must share identical headers before they can be stacked into a
# single hybrid dataset. Most archival headers only need their spaces replaced
# with underscores to match the primary dataset format.
archival_renames = {
    header: header.replace(' ', '_')
    for header in (
        'Daily Steps',
        'Sleep Duration',
        'Stress Level',
        'Heart Rate',
        'BMI Category',
        'Quality of Sleep',
    )
}
# 'Physical Activity Level' is the one exception: it is mapped to the shared
# name 'Activity_Value' (original note: to bridge mins/days).
archival_renames['Physical Activity Level'] = 'Activity_Value'
archival_df = archival_df.rename(columns=archival_renames)

# The primary dataset may carry the activity column under either of two
# original names; the first one found (checked in this order) is renamed to
# match the archival set. If neither is present, the frame is left unchanged.
legacy_activity_col = next(
    (name for name in ('Activity_Frequency', 'Activity_Level') if name in primary_df.columns),
    None,
)
if legacy_activity_col is not None:
    primary_df = primary_df.rename(columns={legacy_activity_col: 'Activity_Value'})

# Full column set of the hybrid dataset, in output order (used for merging)
cols = [
    'Occupation',
    'Age',
    'Gender',
    'Daily_Steps',
    'Activity_Value',
    'Sleep_Duration',
    'Quality_of_Sleep',
    'Stress_Level',
    'Heart_Rate',
    'BMI_Category',
    'IBS_C_Risk',
]


# 3. Merging into hybrid dataset (N=150)
# Restrict each source to the shared columns, then stack archival rows first
# and primary rows second under a fresh 0..N-1 index.
archival_subset = archival_df[cols]
primary_subset = primary_df[cols]
hybrid_df = pd.concat([archival_subset, primary_subset], ignore_index=True)


# 4. Initial Quality Check of the Hybrid Dataset
# Gather the audit figures first, then print them in a fixed order.
dataset_shape = hybrid_df.shape
record_count = len(hybrid_df)
missing_per_column = hybrid_df.isnull().sum()

print("HYBRID DATA INTEGRITY AUDIT")
print(f"Final Dataset Shape: {dataset_shape}")
print(f"Total Combined Records (N): {record_count}")
print(f"Total Missing Values:\n{missing_per_column}")
print("-" * 45)

# Standardize BMI categories for professional reporting purposes:
# the label 'Normal Weight' is folded into 'Normal'.
bmi_labels = hybrid_df['BMI_Category']
hybrid_df['BMI_Category'] = bmi_labels.replace({'Normal Weight': 'Normal'})


# 5. Categorical Variable Summary
# For each categorical column, print a table of category counts and their
# percentage share (rounded to one decimal place).
print("\nDATA FOR CATEGORICAL VARIABLES")

# Derive the sample size shown in the column header from the data itself so the
# label cannot drift out of sync with the dataset (it reads "N=150" for the
# current 150-row hybrid dataset).
total_label = f"Total (N={len(hybrid_df)})"

for col in ['Gender', 'Occupation', 'BMI_Category', 'IBS_C_Risk']:
    # Count once and derive the percentages from the same counts; this matches
    # value_counts(normalize=True), which divides by the sum of the counts.
    counts = hybrid_df[col].value_counts()
    percents = (counts / counts.sum() * 100).round(1)

    summary = pd.DataFrame({total_label: counts, 'Frequency (%)': percents})
    print(f"\n{col} Distribution:\n{summary}")


# 6. Continuous Variable Summary
print("\n" + "-"*45)
print("DATA FOR CONTINUOUS VARIABLES")
continuous_vars = [
    'Age',
    'Activity_Value',
    'Daily_Steps',
    'Sleep_Duration',
    'Quality_of_Sleep',
    'Stress_Level',
    'Heart_Rate',
]

# Descriptive statistics: keep only the mean, standard deviation, min, and max
# rows of describe(), then transpose so each variable becomes one row.
described = hybrid_df[continuous_vars].describe()
stats = described.loc[['mean', 'std', 'min', 'max']].T

# Build the "Mean ± SD" column used in dissertation tables, one decimal each
stats['Mean ± SD'] = [
    f"{mean_value:.1f} ± {sd_value:.1f}"
    for mean_value, sd_value in zip(stats['mean'], stats['std'])
]

print(stats[['Mean ± SD', 'min', 'max']])


# 7. Exporting the Final Master Hybrid Dataset
# The CSV is written without the index column, to a path relative to the
# current working directory; the same name is echoed in the success message.
output_filename = 'final_master_hybrid_n150.csv'
hybrid_df.to_csv(output_filename, index=False)
print(f"\nSuccess: Master Hybrid dataset saved as '{output_filename}'")

HYBRID DATA INTEGRITY AUDIT
Final Dataset Shape: (150, 11)
Total Combined Records (N): 150
Total Missing Values:
Occupation          0
Age                 0
Gender              0
Daily_Steps         0
Activity_Value      0
Sleep_Duration      0
Quality_of_Sleep    0
Stress_Level        0
Heart_Rate          0
BMI_Category        0
IBS_C_Risk          0
dtype: int64
---------------------------------------------

DATA FOR CATEGORICAL VARIABLES

Gender Distribution:
                   Total (N=150)  Frequency (%)
Gender                                         
Female                        90           60.0
Male                          56           37.3
Prefer not to say              4            2.7

Occupation Distribution:
                   Total (N=150)  Frequency (%)
Occupation                                     
Engineer                      71           47.3
Accountant                    45           30.0
Software Engineer             22           14.7
Manager                   