# Bellabeat Case Study — Phase 1 & 2

### Notebook 01: Data Understanding & Cleaning

In this notebook, we will:
- Load the raw 18 CSV files
- Generate a Data Dictionary
- Clean & Merge key datasets into a single master file (`bellabeat_clean.csv`)

This corresponds to **Phase 1 & 2** of the roadmap.

In [1]:
# Import libraries
import os
import pandas as pd
import re


## Step 1: Set Paths

In [2]:
RAW_DATA_PATH = r"D:/Projects/Bellabeat/Data/Raw/"
PROCESSED_PATH = r"D:/Projects/Bellabeat/Data/Processed/"

# Ensure processed folder exists
os.makedirs(PROCESSED_PATH, exist_ok=True)

## Step 2: Generate Data Dictionary (Phase 1)

In [3]:
dictionary_records = []

for file in os.listdir(RAW_DATA_PATH):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(RAW_DATA_PATH, file))
        for col in df.columns:
            dictionary_records.append({
                "file_name": file,
                "column_name": col,
                "dtype": str(df[col].dtype),
                "non_null_count": df[col].notnull().sum(),
                "null_count": df[col].isnull().sum(),
                "unique_values": df[col].nunique()
            })

data_dict = pd.DataFrame(dictionary_records)
data_dict.to_csv(os.path.join(PROCESSED_PATH, "bellabeat_data_dictionary.csv"), index=False)
data_dict.head(10)

Unnamed: 0,file_name,column_name,dtype,non_null_count,null_count,unique_values
0,dailyActivity_merged.csv,Id,int64,940,0,33
1,dailyActivity_merged.csv,ActivityDate,object,940,0,31
2,dailyActivity_merged.csv,TotalSteps,int64,940,0,842
3,dailyActivity_merged.csv,TotalDistance,float64,940,0,615
4,dailyActivity_merged.csv,TrackerDistance,float64,940,0,613
5,dailyActivity_merged.csv,LoggedActivitiesDistance,float64,940,0,19
6,dailyActivity_merged.csv,VeryActiveDistance,float64,940,0,333
7,dailyActivity_merged.csv,ModeratelyActiveDistance,float64,940,0,211
8,dailyActivity_merged.csv,LightActiveDistance,float64,940,0,491
9,dailyActivity_merged.csv,SedentaryActiveDistance,float64,940,0,9


**Output:** `bellabeat_data_dictionary.csv` created in `Processed` folder.

In [4]:
# Function to convert column names to snake_case
def to_snake_case(name: str) -> str:
    name = name.strip()
    name = re.sub(r'(?<!^)(?=[A-Z])', '_', name)   # insert _ before capital letters
    name = name.replace(" ", "_").replace("-", "_")
    return name.lower()

## Step 3: Load Key Datasets

In [5]:
daily_activity = pd.read_csv(os.path.join(RAW_DATA_PATH, "dailyActivity_merged.csv"))
sleep_day = pd.read_csv(os.path.join(RAW_DATA_PATH, "sleepDay_merged.csv"))
weight_log = pd.read_csv(os.path.join(RAW_DATA_PATH, "weightLogInfo_merged.csv"))

# Apply column cleaning
daily_activity.columns = [to_snake_case(c) for c in daily_activity.columns]
sleep_day.columns = [to_snake_case(c) for c in sleep_day.columns]
weight_log.columns = [to_snake_case(c) for c in weight_log.columns]

daily_activity.head()

Unnamed: 0,id,activity_date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,very_active_minutes,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


## Step 4: Convert Dates & Remove Duplicates

In [6]:
daily_activity["activity_date"] = pd.to_datetime(daily_activity["activity_date"])
sleep_day["sleep_day"] = pd.to_datetime(sleep_day["sleep_day"])
weight_log["date"] = pd.to_datetime(weight_log["date"])

sleep_day = sleep_day.drop_duplicates()
weight_log = weight_log.drop_duplicates()


  sleep_day["sleep_day"] = pd.to_datetime(sleep_day["sleep_day"])
  weight_log["date"] = pd.to_datetime(weight_log["date"])


## Step 5: Merge Datasets (Phase 2)

In [7]:
merged = daily_activity.merge(sleep_day, how="left", left_on=["id", "activity_date"], right_on=["id", "sleep_day"])
merged = merged.merge(weight_log, how="left", left_on=["id", "activity_date"], right_on=["id", "date"])

# Drop redundant columns
merged = merged.drop(columns=["sleep_day", "date"], errors="ignore")

print("Shape:", merged.shape)
merged.head()

Shape: (940, 24)


Unnamed: 0,id,activity_date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,...,calories,total_sleep_records,total_minutes_asleep,total_time_in_bed,weight_kg,weight_pounds,fat,b_m_i,is_manual_report,log_id
0,1503960366,2016-04-12,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,...,1985,1.0,327.0,346.0,,,,,,
1,1503960366,2016-04-13,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,...,1797,2.0,384.0,407.0,,,,,,
2,1503960366,2016-04-14,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,...,1776,,,,,,,,,
3,1503960366,2016-04-15,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,...,1745,1.0,412.0,442.0,,,,,,
4,1503960366,2016-04-16,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,...,1863,2.0,340.0,367.0,,,,,,


## Step 6: Save Clean Dataset

In [None]:
output_file = os.path.join(PROCESSED_PATH, "bellabeat_clean.csv")
merged.to_csv(output_file, index=False)
print(f" Clean dataset saved at {output_file}")

✅ Clean dataset saved at D:/Projects/Bellabeat/Data/Processed/bellabeat_clean.csv
