**Task Description**

Download and store all files for New York’s CitiBike trips in 2022

In [4]:
import requests
import zipfile
import os
import pandas as pd

def download_file(url, save_path):
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                
def unzip_file(zip_path, extract_folder):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

def main():
    base_url = "https://s3.amazonaws.com/tripdata/"
    # List of filenames for different months and years
    filenames = [
"202201-citibike-tripdata.csv.zip",
        "202202-citibike-tripdata.csv.zip",
        "202203-citibike-tripdata.csv.zip",
        "202204-citibike-tripdata.csv.zip",
        "202205-citibike-tripdata.csv.zip",
        "202206-citbike-tripdata.csv.zip",
        "202207-citbike-tripdata.csv.zip",
        "202208-citibike-tripdata.csv.zip",
        "202209-citibike-tripdata.csv.zip",
        "202210-citibike-tripdata.csv.zip",
        "202211-citibike-tripdata.csv.zip",
        "202212-citibike-tripdata.csv.zip"
    ]
    save_folder = "./data1/"  # Specify the folder where you want to save the downloaded files.

    for file_name in filenames:
        full_url = base_url + file_name
        save_path = os.path.join(save_folder, file_name)

        download_file(full_url, save_path)
        print(f"File {file_name} downloaded successfully to {save_path}.")

        # Unzip the downloaded file and save the extracted contents to a new folder
        extract_folder = os.path.join(save_folder, file_name.split(".")[0])  # Use the filename without the extension as the extraction folder
        os.makedirs(extract_folder, exist_ok=True)  # Create the folder if it doesn't exist
        unzip_file(save_path, extract_folder)
        print(f"File {file_name} extracted to {extract_folder}.")

if __name__ == "__main__":
    main()


File 202201-citibike-tripdata.csv.zip downloaded successfully to ./data1/202201-citibike-tripdata.csv.zip.
File 202201-citibike-tripdata.csv.zip extracted to ./data1/202201-citibike-tripdata.


**4.	In a new notebook, import all necessary libraries, read in your data, and join it. Hint: what’s the most effective way to import and join data in such a format?**

In [4]:
main_directory = 'CitiBiketrip'

folder_list = os.listdir(main_directory)
print(folder_list)

['202201-citibike-tripdata', '202202-citibike-tripdata', '202203-citibike-tripdata', '202204-citibike-tripdata', '202205-citibike-tripdata', '202206-citbike-tripdata', '202207-citbike-tripdata', '202208-citibike-tripdata', '202209-citibike-tripdata', '202210-citibike-tripdata', '202211-citibike-tripdata', '202212-citibike-tripdata']


In [20]:
dataframes = []

# Walk through the main directory and its subdirectories to find CSV files.
for root, dirs, files in os.walk(main_directory):
    dirs[:] = [d for d in dirs if not d.lower().endswith('_macosx')]
    
    # Skip processing if the current directory path contains any directory with name "_macosx"
    if "_macosx" in root.lower():
        continue
        
    for file in files:
        #print(file)
        if file.endswith('.csv'):
            csv_file_path = os.path.join(root, file)
            df = pd.read_csv(csv_file_path)
            dataframes.append(df)

# Combine all the DataFrames into a single DataFrame
combined_data = pd.concat(dataframes, ignore_index=True)

combined_data.to_csv('Combined_CitiBiketripData.csv', index=False)



202201-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202202-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202203-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202204-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202205-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202206-citbike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202207-citbike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202208-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202209-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202210-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202211-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


202212-citibike-tripdata.csv


  df = pd.read_csv(csv_file_path)


In [None]:

data=pd.read_csv("Combined_CitiBiketripData.csv")
data.tail(20)

In [16]:
data.shape

(469864, 34)

In [9]:
data.shape

(30689921, 13)

**Task Description**

5.Leave a comment that describes how the code you wrote in point 4 works.

**Comment**

The script utilizes the Pandas library and the os module to combine multiple CSV files into a single DataFrame and then export the merged data to a new CSV file. Here's a breakdown of the code's functionality:

Import necessary libraries:

pandas and os libraries are used to handle data frame and to manage file operations 

A variable named main_directory is set to represent the folder path where the script will search for CSV files to combine which stores the names of all files and directories within the main_directory then Prepare a list for storing DataFrames named "dataframes"

os.walk will traverse through the main and its sub directories to find a file ending with ".csv" extension whose content are read and saved in a dataframe using pandas
Now will concatenates all dataframe in one single dataframe by using pd.concat () function and used a parameter ignore_index=True ensures a continuous index in the merged DataFrame.

**Merge the weather data with the New York CitiBike data set and export it to a CSV file.**

**Approach**

Due to the large size of the datasets, we will adopt a two-step approach to merge the Citibike trip data with weather data. First, we will merge the Citibike trip data with weather data on a monthly basis. This means that we will combine the trip data and weather data for each month separately. Then, in the second step, we will merge all the monthly data to create the final merged dataset.

In [None]:
# Define the directory path where all the 12 months data are located
data_directory = ".\\CitiBiketrip\\"

# List of months 
months = ['202201', '202202', '202203', '202204', '202205', '202206', '202207', '202208', '202209', '202210', '202211', '202212']

# Loop through each month's data, merge with weather data, and save them separately
for month in months:
    # Load Citibike trip data for the current month
    df1 = pd.read_csv(data_directory + month + '-citibike-tripdata.csv')
    
    # Load weather data for the current month
    df2 = pd.read_csv('weather_data.csv')
    columns_to_drop = [7, 8, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45]
    df2 = weather_data.drop(weather_data.columns[columns_to_drop], axis=1)

    # Convert the 'Date' column to a pandas datetime type for correct merging
    df1['Date'] = pd.to_datetime(df1['started_at']).dt.date
    df2['Date'] = pd.to_datetime(df2['DATE']).dt.date

    # Merge the dataframes based on the 'Date' column using an inner join
    merged_df = pd.merge(df1, df2, on='Date', how='inner')

    # Save the merged dataframe to a CSV file for the current month
    merged_df.to_csv(data_directory + month + '-merged_data.csv', index=False)


In [43]:
merged_df.shape

(1649652, 33)

In [44]:
df1.shape

(1649652, 14)

In [10]:
merged_df.head(20)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT08
0,55262E4365A955A2,classic_bike,2022-01-18 08:23:52,2022-01-18 08:28:18,Boerum Pl\t& Pacific St,4488.09,Clinton St & Joralemon St,4605.04,40.688489,-73.99116,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
1,75EAB4C9619AB463,classic_bike,2022-01-18 16:45:52,2022-01-18 16:56:03,W 49 St & 8 Ave,6747.06,Amsterdam Ave & W 73 St,7260.09,40.762272,-73.987882,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
2,582C4FB96E010416,classic_bike,2022-01-18 08:45:24,2022-01-18 08:54:11,E 88 St & 1 Ave,7235.13,2 Ave & E 72 St,6925.09,40.778301,-73.948813,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
3,4FAA081EE6A9B319,electric_bike,2022-01-18 18:12:01,2022-01-18 18:17:15,Clermont Ave & Lafayette Ave,4461.01,Emerson Pl & Myrtle Ave,4683.02,40.687645,-73.969689,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
4,AA6CE6206341D731,electric_bike,2022-01-18 13:14:24,2022-01-18 13:25:50,Court St & State St,4488.08,Emerson Pl & Myrtle Ave,4683.02,40.690238,-73.992031,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
5,0B215E18DCAB5559,electric_bike,2022-01-18 10:48:27,2022-01-18 10:53:31,Crescent St & 35 Ave,6688.01,Queens Plaza North & Crescent St,6429.01,40.759628,-73.932146,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
6,C3C9A7074153C0FE,classic_bike,2022-01-18 16:54:44,2022-01-18 16:57:08,31 Ave & 30 St,6857.09,31 St & Broadway,6789.04,40.7647,-73.924031,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
7,A09D2C3867EA5E5D,electric_bike,2022-01-18 08:40:28,2022-01-18 08:57:04,W 22 St & 10 Ave,6306.06,John St & William St,5065.04,40.74692,-74.004519,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
8,A67290B3D88CFD27,classic_bike,2022-01-18 08:06:28,2022-01-18 08:10:48,34 St & 35 Ave,6605.08,31 St & Broadway,6789.04,40.756933,-73.926223,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,
9,77FDAF01894C026F,classic_bike,2022-01-18 21:26:51,2022-01-18 21:28:50,Smith St & 9 St,4077.04,10 St & 2 Ave,3922.02,40.674696,-73.997858,...,0.0,1.8,3.3,-0.5,310,300,14.8,19.2,,


In [56]:
main_directory = 'Merged'

folder_list = os.listdir(main_directory)
print(folder_list)

['merged_data_01.csv', 'merged_data_02.csv', 'merged_data_03.csv', 'merged_data_04.csv', 'merged_data_05.csv', 'merged_data_06.csv', 'merged_data_07.csv', 'merged_data_08.csv', 'merged_data_09.csv', 'merged_data_10.csv', 'merged_data_11.csv', 'merged_data_12.csv']


In [None]:
dataframes = []

# Walk through the main directory and its subdirectories to find CSV files.
for root, dirs, files in os.walk(main_directory):
    dirs[:] = [d for d in dirs if not d.lower().endswith('_macosx')]
    
    # Skip processing if the current directory path contains "_macosx"
    if "_macosx" in root.lower():
        continue
        
    for file in files:
        print(file)
        if file.endswith('.csv'):
            csv_file_path = os.path.join(root, file)
            df = pd.read_csv(csv_file_path)
            dataframes.append(df)

# Combine all the DataFrames into a single DataFrame
combined_data = pd.concat(dataframes, ignore_index=True)

combined_data.to_csv('Combined_DataWeatherFinal.csv', index=False)



merged_data_01.csv


  df = pd.read_csv(csv_file_path)


merged_data_02.csv


  df = pd.read_csv(csv_file_path)


merged_data_03.csv


  df = pd.read_csv(csv_file_path)


merged_data_04.csv


  df = pd.read_csv(csv_file_path)


merged_data_05.csv


  df = pd.read_csv(csv_file_path)


merged_data_06.csv


  df = pd.read_csv(csv_file_path)


merged_data_07.csv


  df = pd.read_csv(csv_file_path)


merged_data_08.csv
