# sourcing data with an API

# Importing libraries and concatenating

In [1]:
import pandas as pd 
import numpy as np
import os 
import requests
import json
from datetime import datetime
import zipfile


In [2]:
# Creating a list with all files in the folder using a list compehension

folderpath = r"C:\Users\obiki\Citibike-tripdata\citibike-extracted\Citibike-extract"

# Step 1: List all files (CSV and ZIP) in the folder and subfolders
filepaths = []
for root, dirs, files in os.walk(folderpath):
    for name in files:
        if name.lower().endswith(".csv") or name.lower().endswith(".zip"):
            filepaths.append(os.path.join(root, name))

dfs = []

# Step 2: Read CSVs and CSVs inside ZIPs
for f in filepaths:
    if f.lower().endswith(".csv"):
        try:
            df_tmp = pd.read_csv(f, encoding="utf-8", dtype=str)
        except UnicodeDecodeError:
            df_tmp = pd.read_csv(f, encoding="latin1", dtype=str)
        except pd.errors.EmptyDataError:
            print(f"⚠️ Skipping empty file: {f}")
            continue
        dfs.append(df_tmp)

    elif f.lower().endswith(".zip"):
        with zipfile.ZipFile(f, "r") as z:
            for filename in z.namelist():
                if filename.lower().endswith(".csv"):
                    try:
                        with z.open(filename) as csvfile:
                            try:
                                df_tmp = pd.read_csv(csvfile, encoding="utf-8", dtype=str)
                            except UnicodeDecodeError:
                                df_tmp = pd.read_csv(csvfile, encoding="latin1", dtype=str)
                    except pd.errors.EmptyDataError:
                        print(f"⚠️ Skipping empty file in zip: {filename}")
                        continue
                    dfs.append(df_tmp)
      

In [3]:
# Read and concatenate all files simultaneously

# Combine only if we have dataframes
if dfs:
    df = pd.concat(dfs, ignore_index=True)
    print("✅ Combined DataFrame shape:", df.shape)
else:
    print("❌ No valid CSV files found.")

✅ Combined DataFrame shape: (29838806, 13)


In [4]:
filepaths

['C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202201-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202202-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202203-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202204-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202205-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202206-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\\2022-citibike-tripdata\\202207-citibike-tripdata.zip',
 'C:\\Users\\obiki\\Citibike-tripdata\\citibike-extracted\\Citibike-extract\

# my code works by looking through the folder for all CSV and zip files, reading each CSV into a temporary table while handling different text encodings and skipping empty files, collecting all these tables into a list, and then joining them into one big table. The most effective way to import and join data in this format is to extract zip files first and work with the plain CSVs, read the files as text to avoid errors from mixed data types, skip empty or corrupted files, and, if the files are very large, read them in smaller chunks and combine the chunks at the end to avoid memory issues.

In [5]:
df.shape

(29838806, 13)

In [6]:
df.head

<bound method NDFrame.head of                    ride_id  rideable_type               started_at  \
0         63AF72AB3CD47753   classic_bike  2022-01-13 21:36:47.689   
1         9C0DAD8C1E0EA571   classic_bike  2022-01-16 17:56:23.889   
2         9576DDD8920974F5  electric_bike  2022-01-18 07:10:04.799   
3         962A466CC3AC6781   classic_bike  2022-01-22 12:10:10.225   
4         C2585407BA0FE3E9   classic_bike  2022-01-08 16:35:16.497   
...                    ...            ...                      ...   
29838801  1F223EDAFF420AE3  electric_bike  2022-12-01 20:26:45.847   
29838802  CFA5C560ACB73B8E   classic_bike  2022-12-26 13:46:34.237   
29838803  11C8C5E0DB947B07   classic_bike  2022-12-01 05:56:14.903   
29838804  5B9B083C534A5964   classic_bike  2022-12-02 11:54:15.871   
29838805  91C286C462F89A50   classic_bike  2022-12-18 13:35:22.574   

                         ended_at                start_station_name  \
0         2022-01-13 21:46:02.024                   5 Ave 

In [7]:
df.tail

<bound method NDFrame.tail of                    ride_id  rideable_type               started_at  \
0         63AF72AB3CD47753   classic_bike  2022-01-13 21:36:47.689   
1         9C0DAD8C1E0EA571   classic_bike  2022-01-16 17:56:23.889   
2         9576DDD8920974F5  electric_bike  2022-01-18 07:10:04.799   
3         962A466CC3AC6781   classic_bike  2022-01-22 12:10:10.225   
4         C2585407BA0FE3E9   classic_bike  2022-01-08 16:35:16.497   
...                    ...            ...                      ...   
29838801  1F223EDAFF420AE3  electric_bike  2022-12-01 20:26:45.847   
29838802  CFA5C560ACB73B8E   classic_bike  2022-12-26 13:46:34.237   
29838803  11C8C5E0DB947B07   classic_bike  2022-12-01 05:56:14.903   
29838804  5B9B083C534A5964   classic_bike  2022-12-02 11:54:15.871   
29838805  91C286C462F89A50   classic_bike  2022-12-18 13:35:22.574   

                         ended_at                start_station_name  \
0         2022-01-13 21:46:02.024                   5 Ave 

# ## Get weather data using NOAA's API

In [8]:
# Defining my NOAA token

Token = 'lwtCnXGzTRjoLxZJTYgPLccskXigRocL' 

In [9]:
# Getting the API 

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-12-31', headers={'token':Token})

In [10]:
# Loading the api response as a json

d = json.loads(r.text)

In [11]:
d

{'metadata': {'resultset': {'offset': 1, 'count': 365, 'limit': 1000}},
 'results': [{'date': '2022-01-01T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 116},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 114},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 14},
  {'date': '2022-01-04T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': -27},
  {'date': '2022-01-05T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 32},
  {'date': '2022-01-06T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 49},
  {'date': '2022-01-07T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attribut

In [12]:
# Securing all items in the response that correspond to TAVG

avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']

In [13]:
# Getting only the date field from all average temperature readings

dates_temp = [item['date'] for item in avg_temps]

In [14]:
# Getting the temperature from all average temperature readings

temps = [item['value'] for item in avg_temps]

In [15]:
temps

[116,
 114,
 14,
 -27,
 32,
 49,
 7,
 -25,
 14,
 16,
 -54,
 -19,
 40,
 48,
 -67,
 -80,
 39,
 18,
 32,
 51,
 -60,
 -59,
 -7,
 -2,
 36,
 -23,
 -42,
 1,
 -48,
 -71,
 -34,
 -17,
 23,
 64,
 58,
 -28,
 -48,
 5,
 41,
 28,
 63,
 86,
 118,
 28,
 -43,
 -47,
 16,
 116,
 99,
 1,
 -26,
 41,
 56,
 144,
 11,
 8,
 -11,
 24,
 8,
 26,
 77,
 56,
 -19,
 31,
 96,
 182,
 87,
 32,
 40,
 72,
 51,
 -23,
 53,
 116,
 136,
 82,
 139,
 131,
 143,
 104,
 107,
 72,
 57,
 96,
 104,
 59,
 -15,
 -12,
 26,
 99,
 131,
 72,
 70,
 76,
 97,
 86,
 91,
 128,
 114,
 86,
 88,
 134,
 154,
 163,
 157,
 157,
 83,
 79,
 76,
 94,
 106,
 146,
 133,
 128,
 104,
 115,
 116,
 82,
 104,
 129,
 136,
 121,
 124,
 120,
 172,
 148,
 100,
 104,
 135,
 154,
 161,
 171,
 160,
 179,
 188,
 204,
 197,
 178,
 155,
 154,
 212,
 278,
 210,
 174,
 162,
 164,
 202,
 204,
 207,
 233,
 283,
 178,
 203,
 195,
 219,
 200,
 222,
 218,
 237,
 244,
 218,
 218,
 205,
 242,
 239,
 233,
 203,
 252,
 217,
 173,
 205,
 221,
 191,
 192,
 219,
 258,
 270,
 243,
 22

In [16]:
# Putting the results in a dataframe

df_temp = pd.DataFrame()

In [17]:
# Getting only date and cast it to date time; converting temperature from tenths of Celsius to normal Celsius

df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0 for v in temps]

In [18]:
df_temp.tail()

Unnamed: 0,date,avgTemp
360,2022-12-27,-0.7
361,2022-12-28,3.4
362,2022-12-29,6.4
363,2022-12-30,9.3
364,2022-12-31,8.2


In [19]:
df_temp.head()

Unnamed: 0,date,avgTemp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2


In [20]:
df.dtypes

ride_id               object
rideable_type         object
started_at            object
ended_at              object
start_station_name    object
start_station_id      object
end_station_name      object
end_station_id        object
start_lat             object
start_lng             object
end_lat               object
end_lng               object
member_casual         object
dtype: object

In [21]:
df['started_at'] = pd.to_datetime(df['started_at'], format='ISO8601')

In [22]:
df['date'] = pd.to_datetime(df['started_at'], format='%Y-%m-%d').dt.date

In [23]:
df['date'] = pd.to_datetime(df['date'])

In [24]:
df_temp.head()

Unnamed: 0,date,avgTemp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2


In [25]:
%%time
df_merged = df.merge(df_temp, how = 'left', on = 'date', indicator = True)

CPU times: total: 2min 12s
Wall time: 2min 37s


In [26]:
df_merged.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp,_merge
0,63AF72AB3CD47753,classic_bike,2022-01-13 21:36:47.689,2022-01-13 21:46:02.024,5 Ave & E 63 St,6904.06,Broadway & W 51 St,6779.04,40.766368,-73.971518,40.76228826,-73.98336183,member,2022-01-13,4.0,both
1,9C0DAD8C1E0EA571,classic_bike,2022-01-16 17:56:23.889,2022-01-16 18:03:50.269,Grand Army Plaza & Plaza St West,4010.15,Bedford Ave & Montgomery St,3736.03,40.6729679,-73.97087984,40.665816,-73.956934,member,2022-01-16,-8.0,both
2,9576DDD8920974F5,electric_bike,2022-01-18 07:10:04.799,2022-01-18 07:20:54.450,W 20 St & 10 Ave,6306.01,Broadway & W 51 St,6779.04,40.745686,-74.005141,40.76228826,-73.98336183,member,2022-01-18,1.8,both
3,962A466CC3AC6781,classic_bike,2022-01-22 12:10:10.225,2022-01-22 12:20:06.899,W 54 St & 9 Ave,6920.03,10 Ave & W 28 St,6459.04,40.76584941,-73.98690506,40.75066386,-74.00176802,member,2022-01-22,-5.9,both
4,C2585407BA0FE3E9,classic_bike,2022-01-08 16:35:16.497,2022-01-08 16:45:33.279,Sharon St & Olive St,5323.05,Driggs Ave & Lorimer St,5481.04,40.715353,-73.93856,40.72179134,-73.9504154,casual,2022-01-08,-2.5,both


In [27]:
df_merged['_merge'].value_counts(dropna = False)

_merge
both          29838166
left_only          640
right_only           0
Name: count, dtype: int64

In [28]:
df_merged.to_csv('newyork_data.csv')

In [29]:
df.shape

(29838806, 14)

In [30]:
df.head

<bound method NDFrame.head of                    ride_id  rideable_type              started_at  \
0         63AF72AB3CD47753   classic_bike 2022-01-13 21:36:47.689   
1         9C0DAD8C1E0EA571   classic_bike 2022-01-16 17:56:23.889   
2         9576DDD8920974F5  electric_bike 2022-01-18 07:10:04.799   
3         962A466CC3AC6781   classic_bike 2022-01-22 12:10:10.225   
4         C2585407BA0FE3E9   classic_bike 2022-01-08 16:35:16.497   
...                    ...            ...                     ...   
29838801  1F223EDAFF420AE3  electric_bike 2022-12-01 20:26:45.847   
29838802  CFA5C560ACB73B8E   classic_bike 2022-12-26 13:46:34.237   
29838803  11C8C5E0DB947B07   classic_bike 2022-12-01 05:56:14.903   
29838804  5B9B083C534A5964   classic_bike 2022-12-02 11:54:15.871   
29838805  91C286C462F89A50   classic_bike 2022-12-18 13:35:22.574   

                         ended_at                start_station_name  \
0         2022-01-13 21:46:02.024                   5 Ave & E 63 St   