# SQL database and Python working environment

```
# Create an isolated virtual environment in the .venv directory
python -m venv .venv
```

```
# pandas reads the CSV files, sqlalchemy builds the engine, and pyodbc is the
# DBAPI driver required by the mssql+pyodbc connection URL used by the pipeline.
pip install pandas sqlalchemy pyodbc
```

```
CREATE DATABASE [myFitnessApp];
GO

USE [myFitnessApp];
GO

-- One row per recorded activity. Rows are appended by the Python ETL pipeline,
-- whose transform step renames the CSV columns to the names used here.
-- The table is created explicitly in the dbo schema because the verification
-- query at the end of this document reads from [dbo].[FitnessData].
CREATE TABLE [dbo].[FitnessData] (
    AthleteId        VARCHAR(50)  NOT NULL,  -- prefix of the source CSV file name
    ActivityId       BIGINT       NOT NULL,  -- 'id' column of the CSV
    [Type]           VARCHAR(50)  NOT NULL,
    [Date]           DATETIME     NOT NULL,
    Distance         FLOAT        NOT NULL,  -- kilometers (ETL divides meters by 1000)
    MovingTime       FLOAT        NOT NULL,  -- minutes (ETL divides seconds by 60)
    [Name]           VARCHAR(255) NOT NULL,
    AvgHR            INT          NULL,
    -- NOTE(review): the ETL rounds the percent columns to one decimal before
    -- loading; an INT column cannot keep that fraction -- confirm this is intended.
    IntensityPercent INT          NULL,
    AvgAltitude      FLOAT        NOT NULL,
    AvgHRPercent     INT          NULL,
    ElapsedTime      FLOAT        NOT NULL,  -- minutes (ETL divides seconds by 60)
    HRRc             INT          NULL,
    kcal             INT          NOT NULL,
    MaxAltitude      FLOAT        NOT NULL,
    MaxHR            INT          NULL,
    MaxHRPercent     INT          NULL,
    Pace             VARCHAR(10)  NOT NULL,  -- text such as '5:33/km'
    -- Composite primary key: an activity is identified per athlete. The
    -- constraint is named so that violations and migrations are easy to find.
    CONSTRAINT PK_FitnessData PRIMARY KEY (AthleteId, ActivityId)
);
GO
```

# Building the ETL data pipeline

- Extract: Read CSV files containing fitness activity data
- Transform: Clean, reformat, and adjust columns to match the database schema
- Load: Insert the transformed data into the SQL Server database

## Extract

In [1]:
import pandas as pd

In [2]:
# Extract step: Read the CSV file
def extract_data(file_path):
    """Load one activities CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    frame = pd.read_csv(file_path)
    # Progress message so each processed file is visible in the run output.
    print(f"Data extracted successfully from {file_path}.")
    return frame

## Transform

In [3]:
# Function to format pace
def convert_pace(speed_m_per_s):
    """
    Convert speed in meters per second (m/s) to pace in min:sec/km.

    Args:
    speed_m_per_s (float): Speed in meters per second. May be None, NaN,
        zero or negative for activities without a usable speed.

    Returns:
    str: Pace in the format 'min:sec/km' (for example '5:33/km'). When the
        speed is missing, NaN, zero or negative no pace can be computed and
        the placeholder '--:--/km' is returned instead of raising.
    """
    no_pace = "--:--/km"

    try:
        speed = float(speed_m_per_s)
    except (TypeError, ValueError):
        # None or a non-numeric value: there is nothing to convert.
        return no_pace

    # `not speed > 0` is also True for NaN, so this single test rejects
    # NaN, zero (division by zero) and negative speeds (nonsense pace).
    if not speed > 0:
        return no_pace

    # Seconds needed to cover one kilometer, rounded to whole seconds first so
    # that 59.5+ seconds carries into the minutes instead of printing ':60'.
    total_seconds = round(1000 / speed)
    minutes, seconds = divmod(total_seconds, 60)

    # Format the output as 'min:sec/km'
    return f"{minutes}:{seconds:02d}/km"

# Transform step
def transform_data(df, athlete_id):
    """Reshape one athlete's raw activity data to match the FitnessData table.

    Steps, in order:
      * 'Moving Time' and 'Elapsed time' are converted from seconds to minutes.
      * 'Distance' is converted from meters to kilometers.
      * 'Pace' (a speed in m/s) is converted to a 'min:sec/km' string.
      * Numeric columns are rounded (1 decimal for the percent and altitude
        columns, 2 decimals for distance and the time columns).
      * An athlete id column is added and the columns are renamed to the
        names used by the SQL Server table.

    Args:
        df (pandas.DataFrame): Raw data as read from the CSV file. It is not
            modified; the transformation is applied to a copy.
        athlete_id (str): Identifier stored in the 'AthleteId' column of
            every row.

    Returns:
        pandas.DataFrame: The transformed copy.
    """
    # Work on a copy so the caller's frame keeps its original columns and the
    # function can safely be called again on the same input.
    df = df.copy()

    # Convert time from seconds to minutes
    df['Moving Time'] = df['Moving Time'] / 60
    df['Elapsed time'] = df['Elapsed time'] / 60

    # Convert distance from meters to kilometers
    df['Distance'] = df['Distance'] / 1000

    # Format pace as text. From here on 'Pace' holds strings, so it must not
    # be part of the numeric rounding below.
    df['Pace'] = df['Pace'].apply(convert_pace)

    # Round the numeric columns: column name -> number of decimals
    decimals_by_column = {
        'Intensity': 1,
        'Avg Altitude': 1,
        'Avg HR%': 1,
        'Max HR%': 1,
        'Max Altitude': 1,
        'Distance': 2,
        'Moving Time': 2,
        'Elapsed time': 2,
    }
    for column, decimals in decimals_by_column.items():
        df[column] = df[column].round(decimals)

    # Add athlete_id column
    df['athlete_id'] = athlete_id
    print(f"Transformed data for athlete {athlete_id}.")

    # Rename columns to match SQL Server table
    df = df.rename(columns={
        'athlete_id': 'AthleteId',
        'id': 'ActivityId',
        'Intensity': 'IntensityPercent',
        'Moving Time': 'MovingTime',
        'Elapsed time': 'ElapsedTime',
        'Avg Altitude': 'AvgAltitude',
        'Avg HR%': 'AvgHRPercent',
        'Max HR%': 'MaxHRPercent',
        'Max Altitude': 'MaxAltitude',
        'Max HR': 'MaxHR',
        'Avg HR': 'AvgHR'
    })

    return df

## Load

In [4]:
# Load step
def load_data_to_sql(df, engine):
    """Append the transformed activities to the FitnessData table.

    Args:
        df (pandas.DataFrame): Transformed data; must contain an 'ActivityId'
            column, which is used for the success message.
        engine: SQLAlchemy engine (or connection) passed to DataFrame.to_sql.

    Returns:
        None. A database error is reported with print() and not re-raised, so
        a failing file does not stop the caller.
    """
    # Imported here because the except clause below needs the name and no
    # other part of this code imports it; without it any database failure
    # would surface as a NameError instead of the intended message.
    from sqlalchemy.exc import SQLAlchemyError

    try:
        df.to_sql('FitnessData', con=engine, if_exists='append', index=False)
        # Print the IDs of activities added
        added_ids = df['ActivityId'].tolist()
        print(f"Activities with the following IDs have been added: {added_ids}")
    except SQLAlchemyError as e:
        print(f"Error occurred while inserting data: {e}")

# SQL database connection

In [5]:
# config.py
# Settings used by the pipeline. The two *_NAME values are placeholders and
# must be replaced before running.
SERVER_NAME = "YOUR_SERVER_NAME"  # interpolated as the host part of the connection URL
DATABASE_NAME = "YOUR_DATABASE_NAME"  # database to load into (the setup script creates myFitnessApp)
DATA_FOLDER = "data/"  # directory scanned for *_activities.csv files

# db_connection.py

from sqlalchemy import create_engine
from config import SERVER_NAME, DATABASE_NAME
def connect_to_db():
    """Build the SQLAlchemy engine for the configured SQL Server database.

    The URL uses the mssql+pyodbc dialect with SERVER_NAME and DATABASE_NAME
    from the configuration. The driver value in the URL is a placeholder that
    has to be replaced with the name of an installed ODBC driver.

    Returns:
        The engine object returned by sqlalchemy.create_engine.
    """
    return create_engine(
        f'mssql+pyodbc://{SERVER_NAME}/{DATABASE_NAME}?driver=[YOUR_DRIVER_NAME]'
    )

## ETL pipeline function

In [7]:
import os

In [8]:
# Main ETL function
def etl_pipeline():
    """Run extract, transform and load for every athlete CSV in DATA_FOLDER.

    A file is processed only when its name ends with "_activities.csv" and is
    exactly 22 characters long, i.e. a 7-character athlete id such as
    'i123456' followed by that 15-character suffix. All other directory
    entries are skipped. One database engine is shared by all files.
    """
    db_engine = connect_to_db()

    for csv_name in os.listdir(DATA_FOLDER):
        # Guard clause: ignore anything that does not look like an athlete export.
        has_suffix = csv_name.endswith("_activities.csv")
        if not has_suffix or len(csv_name) != 22:
            continue

        # Everything before "_activities" is the athlete id (e.g. 'i123456').
        athlete = csv_name.partition("_activities")[0]
        csv_path = os.path.join(DATA_FOLDER, csv_name)

        raw_frame = extract_data(csv_path)                   # Extract
        shaped_frame = transform_data(raw_frame, athlete)    # Transform
        load_data_to_sql(shaped_frame, db_engine)            # Load

# Run the ETL pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    etl_pipeline()

# Running the ETL data pipeline

```
-- Switch to the target database and confirm which database is active.
USE [myFitnessApp];
SELECT DB_NAME() AS CurrentDatabase;
GO

-- Ad hoc check of the loaded rows, most recent activity first.
SELECT *
FROM [dbo].[FitnessData]
ORDER BY [Date] DESC;
```