# Data cleaning and assembling

## Assembling the 12 datasets into a temporary table

_Each dataset contains one month of ride data, starting with 2021/09 and ending with 2022/08._

In [None]:
-- Stack the twelve monthly trip tables (2021-09 through 2022-08) into a single
-- local temp table that the cleaning steps below modify in place.
-- Dropping any leftover copy first lets this cell be re-run in the same session;
-- without it, SELECT ... INTO fails because the target already exists.
DROP TABLE IF EXISTS #temporary_table_1;

-- UNION ALL appends the monthly tables as-is and skips the sort/dedup pass that
-- plain UNION performs over the whole combined row set.
-- NOTE(review): assumes no row is repeated within or across the monthly tables;
-- switch back to UNION if deduplication is actually wanted -- TODO confirm.
SELECT *
INTO #temporary_table_1
FROM (
    SELECT *
    FROM [Cyclistic].[dbo].[202109-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202110-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202111-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202112-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202201-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202202-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202203-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202204-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202205-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202206-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202207-divvy-tripdata]
    UNION ALL
    SELECT *
    FROM [Cyclistic].[dbo].[202208-divvy-tripdata]
) AS primary_table;

## Deleting the rows where the trip duration (in minutes) is less than or equal to 0

In [None]:
-- Remove trips whose start-to-end difference in minutes is zero or negative.
-- The filter is applied directly to each row, so the table is scanned once.
-- DATEDIFF(minute, ...) counts the minute boundaries crossed, not elapsed time:
-- a trip that starts and ends within the same clock minute yields 0 and is
-- removed as well. Rows where DATEDIFF returns NULL are not matched here.
DELETE
FROM #temporary_table_1
WHERE DATEDIFF(minute, [started_at], [ended_at]) <= 0;

## Deleting the ride\_id column (useless in our case)

In [None]:
-- Drop only ride_id at this point.
-- start_station_id and end_station_id must stay for now: the NULL filter and
-- the TRIM update further down still reference them, and they are removed in
-- their own ALTER TABLE step afterwards. Dropping them here makes those later
-- statements fail with an invalid column name.
ALTER TABLE #temporary_table_1
DROP COLUMN ride_id;

## Deleting the rows containing NULL values

In [None]:
-- Keep only fully populated rows: a row is deleted as soon as any one of the
-- columns listed below is NULL.
DELETE FROM #temporary_table_1
WHERE
    -- ride attributes
       rideable_type IS NULL
    OR member_casual IS NULL
    -- timestamps
    OR started_at IS NULL
    OR ended_at IS NULL
    -- start station and coordinates
    OR start_station_name IS NULL
    OR start_station_id IS NULL
    OR start_lat IS NULL
    OR start_lng IS NULL
    -- end station and coordinates
    OR end_station_name IS NULL
    OR end_station_id IS NULL
    OR end_lat IS NULL
    OR end_lng IS NULL;

## Trimming leading and trailing spaces from the columns

In [None]:
-- Strip leading and trailing spaces from each listed column, in place.
-- There is no WHERE clause, so every row of the temp table is updated.
-- NOTE(review): TRIM expects character data; presumably the lat/lng columns
-- are stored as text in this table -- TODO confirm.
UPDATE #temporary_table_1
SET
    -- ride attributes
    rideable_type = TRIM(rideable_type),
    member_casual = TRIM(member_casual),
    -- start station and coordinates
    start_station_name = TRIM(start_station_name),
    start_station_id = TRIM(start_station_id),
    start_lat = TRIM(start_lat),
    start_lng = TRIM(start_lng),
    -- end station and coordinates
    end_station_name = TRIM(end_station_name),
    end_station_id = TRIM(end_station_id),
    end_lat = TRIM(end_lat),
    end_lng = TRIM(end_lng);
GO

## Deleting a few other columns

In [None]:
-- Remove both station id columns from the temp table in a single statement.
ALTER TABLE #temporary_table_1
    DROP COLUMN
        end_station_id,
        start_station_id;

## Creating the final table for the analysis

In [None]:
-- Copy every remaining column and row of the cleaned temp table into a new
-- regular (non-temp) table named twelve_months_trip.
-- SELECT ... INTO creates its target, so the table must not already exist.
SELECT cleaned.*
INTO twelve_months_trip
FROM #temporary_table_1 AS cleaned;