# Exploring the NYC taxi data

In Project 2, you will work on the [NYC taxi trip data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). Every month, the city of New York publishes open data which contains a record of every taxi ride taken that month in the city.

The function `get_taxi_data()` is provided for you in `utils.py` to easily download and read data for a particular month and type of taxi. You should use it in your project.

Open `utils.py` in VSCode, study it carefully, and try the example below. If you are not sure how it works, ask a tutor!

In [28]:
import pandas as pd

# Import the function get_taxi_data() from utils.py
from utils import get_taxi_data

In [29]:
# Example: get yellow taxi data for January 2022
cols_to_read = ['tpep_pickup_datetime',
                'tpep_dropoff_datetime',
                'passenger_count',
                'trip_distance',
                'fare_amount']

# Download the data and get the specified columns, save the file locally
df = get_taxi_data('2022', '06', 'yellow', columns=cols_to_read, save=True)
df.head()

File not in current folder; trying to download data...


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount
0,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.0,32.0
1,2022-06-01 00:44:40,2022-06-01 01:01:48,1.0,4.2,14.0
2,2022-06-01 00:23:07,2022-06-01 00:39:50,1.0,9.49,26.0
3,2022-06-01 00:25:53,2022-06-01 00:57:06,2.0,12.1,37.0
4,2022-06-01 00:23:58,2022-06-01 00:33:43,0.0,1.8,9.0


In [3]:
# Now, get the data only for those 3 columns.
# We have the file already saved from the previous command, so this should be faster!
cols_to_read = ['tpep_pickup_datetime',
                'tpep_dropoff_datetime',
                'trip_distance']

# We also don't need to save this as it's a subset of the file we already have.
df = get_taxi_data('2022', '01', 'yellow', columns=cols_to_read)
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance
0,2022-01-01 00:35:40,2022-01-01 00:53:29,3.8
1,2022-01-01 00:33:43,2022-01-01 00:42:07,2.1
2,2022-01-01 00:53:21,2022-01-01 01:02:19,0.97
3,2022-01-01 00:25:21,2022-01-01 00:35:23,1.09
4,2022-01-01 00:36:48,2022-01-01 01:14:20,4.3


In [4]:
# Now, I want the same data, but I need a new column 'total_amount' which is not in my current file.
cols_to_read = ['fare_amount',
                'total_amount']

# The function tries to get the columns from the existing data file,
# but can't find them, so it automatically re-downloads the data.
df = get_taxi_data('2022', '01', 'yellow', columns=cols_to_read)
df.head()

File is in current folder, but may not contain all required columns.
Re-downloading data...


Unnamed: 0,fare_amount,total_amount
0,14.5,21.95
1,8.0,13.3
2,7.5,10.56
3,8.0,11.8
4,23.5,30.3


In [177]:
cols_to_read = ['tpep_pickup_datetime',
                'tpep_dropoff_datetime',
                'passenger_count',
                'trip_distance',
                'fare_amount','payment_type']

# Download the data and get the specified columns, save the file locally
df = get_taxi_data('2022', '01', 'yellow', columns=cols_to_read, save=True)
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount,payment_type
0,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,14.5,1
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,8.0,1
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,7.5,1
3,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,8.0,2
4,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,23.5,1


In [178]:
popular_payment_method = df['payment_type'].value_counts().idxmax()
print(popular_payment_method)

1


As it is seem from the results the most popular payment method is 1 in January 2022 for the yellow taxi

In [180]:
import numpy as np
print(df.isnull().sum())
df['passenger_count'].isnull().sum()
df['passenger_count'].fillna(np.mean(df['passenger_count']), inplace=True) #replace NA values with mean values in 'passenger_count' column
df['passenger_count'].isnull().sum()


tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
fare_amount              0
payment_type             0
dtype: int64


0

In [181]:
from utils import get_taxi_data

In [182]:
df.nunique()

tpep_pickup_datetime     1423522
tpep_dropoff_datetime    1424266
passenger_count               11
trip_distance               4305
fare_amount                 6403
payment_type                   6
dtype: int64

In [10]:
df.describe()

Unnamed: 0,passenger_count,trip_distance,fare_amount,payment_type
count,2463931.0,2463931.0,2463931.0,2463931.0
mean,1.389453,5.372751,12.94648,1.194449
std,0.9686008,547.8714,255.8149,0.5001778
min,0.0,0.0,-480.0,0.0
25%,1.0,1.04,6.5,1.0
50%,1.0,1.74,9.0,1.0
75%,1.389453,3.13,14.0,1.0
max,9.0,306159.3,401092.3,5.0


In [183]:
weather_data = pd.read_csv('Hourly weather june NYC.csv')
weather_data

Unnamed: 0,time,temperature_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),windspeed_180m (km/h)
0,2022-06-08T00:00,19.9,20.0,0.0,,,0,42.5
1,2022-06-08T01:00,19.7,20.4,0.3,,,0,38.2
2,2022-06-08T02:00,19.8,20.9,0.3,,,0,36.3
3,2022-06-08T03:00,20.0,21.4,0.0,,,0,39.8
4,2022-06-08T04:00,20.1,21.8,0.2,,,0,36.0
...,...,...,...,...,...,...,...,...
547,2022-06-30T19:00,24.4,24.5,0.0,0.0,,0,27.0
548,2022-06-30T20:00,23.2,24.1,0.0,0.0,,0,24.7
549,2022-06-30T21:00,22.7,24.2,0.0,0.0,,0,21.6
550,2022-06-30T22:00,22.3,24.2,0.0,0.0,,0,20.7


In [184]:
import pandas as pd

# Import the function get_taxi_data() from utils.py
from utils import get_taxi_data

In [185]:
# Example: get yellow taxi data for June 2022
cols_to_read = ['tpep_pickup_datetime',
                'tpep_dropoff_datetime',
                'passenger_count',
                'trip_distance',
                'fare_amount']

# Download the data and get the specified columns, save the file locally
df = get_taxi_data('2022', '06', 'yellow', columns=cols_to_read, save=True)
df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount
0,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.00,32.00
1,2022-06-01 00:44:40,2022-06-01 01:01:48,1.0,4.20,14.00
2,2022-06-01 00:23:07,2022-06-01 00:39:50,1.0,9.49,26.00
3,2022-06-01 00:25:53,2022-06-01 00:57:06,2.0,12.10,37.00
4,2022-06-01 00:23:58,2022-06-01 00:33:43,0.0,1.80,9.00
...,...,...,...,...,...
3558119,2022-06-30 23:45:51,2022-06-30 23:51:48,,0.00,9.20
3558120,2022-06-30 23:25:00,2022-06-30 23:40:00,,5.01,18.86
3558121,2022-06-30 23:29:00,2022-06-30 23:37:00,,1.55,10.03
3558122,2022-06-30 23:24:15,2022-06-30 23:50:19,,5.30,24.34


In [176]:
df['new_formatted_date']

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [159]:
data= df['new_formatted_date']

# resample your data to hourly frequency by taking the sum of all values within each hour. 
new_data = data.resample('H').sum()
new_data


TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'RangeIndex'

In [152]:
df['new_formatted_date'] = df['tpep_pickup_datetime'].dt.strftime('%d/%m/%y %H:%M')
df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount,time,new_formatted_date
0,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.00,32.00,2022-06-01 00:25:41,01/06/22 00:25
1,2022-06-01 00:44:40,2022-06-01 01:01:48,1.0,4.20,14.00,2022-06-01 00:44:40,01/06/22 00:44
2,2022-06-01 00:23:07,2022-06-01 00:39:50,1.0,9.49,26.00,2022-06-01 00:23:07,01/06/22 00:23
3,2022-06-01 00:25:53,2022-06-01 00:57:06,2.0,12.10,37.00,2022-06-01 00:25:53,01/06/22 00:25
4,2022-06-01 00:23:58,2022-06-01 00:33:43,0.0,1.80,9.00,2022-06-01 00:23:58,01/06/22 00:23
...,...,...,...,...,...,...,...
3558119,2022-06-30 23:45:51,2022-06-30 23:51:48,,0.00,9.20,2022-06-30 23:45:51,30/06/22 23:45
3558120,2022-06-30 23:25:00,2022-06-30 23:40:00,,5.01,18.86,2022-06-30 23:25:00,30/06/22 23:25
3558121,2022-06-30 23:29:00,2022-06-30 23:37:00,,1.55,10.03,2022-06-30 23:29:00,30/06/22 23:29
3558122,2022-06-30 23:24:15,2022-06-30 23:50:19,,5.30,24.34,2022-06-30 23:24:15,30/06/22 23:24


In [186]:
df['pickup_hour']=df['tpep_pickup_datetime'].dt.hour
df['dropoff_hour']=df['tpep_dropoff_datetime'].dt.hour

In [118]:
df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount,pickup_hour,dropoff_hour
0,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.00,32.00,0,0
1,2022-06-01 00:44:40,2022-06-01 01:01:48,1.0,4.20,14.00,0,1
2,2022-06-01 00:23:07,2022-06-01 00:39:50,1.0,9.49,26.00,0,0
3,2022-06-01 00:25:53,2022-06-01 00:57:06,2.0,12.10,37.00,0,0
4,2022-06-01 00:23:58,2022-06-01 00:33:43,0.0,1.80,9.00,0,0
...,...,...,...,...,...,...,...
3558119,2022-06-30 23:45:51,2022-06-30 23:51:48,,0.00,9.20,23,23
3558120,2022-06-30 23:25:00,2022-06-30 23:40:00,,5.01,18.86,23,23
3558121,2022-06-30 23:29:00,2022-06-30 23:37:00,,1.55,10.03,23,23
3558122,2022-06-30 23:24:15,2022-06-30 23:50:19,,5.30,24.34,23,23


In [188]:
concate_data = pd.concat([df,weather_data])
concate_data

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount,pickup_hour,dropoff_hour,time,temperature_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),windspeed_180m (km/h)
0,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.00,32.0,0.0,0.0,,,,,,,,
1,2022-06-01 00:44:40,2022-06-01 01:01:48,1.0,4.20,14.0,0.0,1.0,,,,,,,,
2,2022-06-01 00:23:07,2022-06-01 00:39:50,1.0,9.49,26.0,0.0,0.0,,,,,,,,
3,2022-06-01 00:25:53,2022-06-01 00:57:06,2.0,12.10,37.0,0.0,0.0,,,,,,,,
4,2022-06-01 00:23:58,2022-06-01 00:33:43,0.0,1.80,9.0,0.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,NaT,NaT,,,,,,2022-06-30T19:00,24.4,24.5,0.0,0.0,,0.0,27.0
548,NaT,NaT,,,,,,2022-06-30T20:00,23.2,24.1,0.0,0.0,,0.0,24.7
549,NaT,NaT,,,,,,2022-06-30T21:00,22.7,24.2,0.0,0.0,,0.0,21.6
550,NaT,NaT,,,,,,2022-06-30T22:00,22.3,24.2,0.0,0.0,,0.0,20.7


# Example: get yellow taxi data for January 2022
cols_to_read = ['tpep_pickup_datetime',
                'tpep_dropoff_datetime',
                'passenger_count',
                'trip_distance',
                'fare_amount','payment_type']

# Download the data and get the specified columns, save the file locally
df = get_taxi_data('2022', '01', 'green', columns=cols_to_read, save=True)
df.head()

Now, choose another month, a type of vehicle, use `get_taxi_data()` to obtain the data, and start exploring the dataset!

---

## Important tips about memory usage

Some of the data files are very heavy (several gigabytes!). Depending on your computer's RAM (memory), you may not be able to read entire data files at once, in a single data frame.

### Specify `columns`

The `columns` input argument is provided for you to select which columns you want to include in your dataframe. You should always specify which columns you need when you read data, to avoid loading unnecessary data into memory.

### Save your processed data into CSV files

To create your report, you will be selecting specific parts of the data, and likely performing some cleaning and/or aggregation on this data. You may wish to save your data at intermediate steps of your processing into CSV files, so that you can load these directly the next time you start your notebook (instead of having to re-do all the processing every time you restart Jupyter).

---

In [21]:
import requests
import matplotlib.pyplot as plt
from dateutil.parser import parse

# Task 1

# Request information using online URL builder interface

#  print(r.json())

# NOTE: doing this with Pandas is probably a lot easier! Revisit this in a couple of weeks...
# Convenience function for extracting data from a JSON request
def get_weather_data(r, frequency, variables):
    '''
    Returns only the required data from a JSON request.
    The result is a dictionary in the form:
    {'units': {variables[0]: unit, variables[1]: unit, ...},
     'variables[0]': [.., .., .., ..],
     'variables[1]': [.., .., .., ..],
     ...}
    Input:
        r (Request): the request object
        frequency (str): 'hourly' or 'daily'.
        variables (list): a list of the required variable names.
    Output:
        weather_dict (dict): a dictionary containing only the required data.
    '''
    # Parse the JSON data to a dictionary
    data = r.json()

    # Start an empty dictionary, populate the timestamps parsed as datetime objects
    weather_dict = {'timestamps': [parse(t) for t in data[frequency]['time']]}

    # Start a dictionary item to store the units
    weather_dict['units'] = {var: data[f'{frequency}_units'][var] for var in variables}

    # Extract the data, add to dictionary
    for var in variables:
        weather_dict[var] = data[frequency][var]

    # Return the data
    return weather_dict


# Convenience function to plot the results
def display_weather_data(weather_dict, same_plot=False):
    '''
    Plots the required weather data.
    Input:
        weather_dict (dict): output from get_weather_data().
        same_plot (bool, default False): display on the same graph or not.
    Output: figure and axes.
    '''
    # Get list of variables
    variables = list(weather_dict.keys())
    variables = [var for var in variables if var not in ['timestamps', 'units']]
    num_vars = len(variables)

    # Total number of subplots
    num_plots = 1 if same_plot else num_vars

    # Create figure and axes
    fig, ax = plt.subplots(num_plots, 1)

    # Plot each variable over time (except units and time itself)
    for i in range(num_vars):
        # Current axis
        if same_plot:
            current_ax = ax
        else:
            current_ax = ax[i]

        current_ax.plot(weather_dict['timestamps'], weather_dict[variables[i]])

        # Format the axes
        ylabel = f'{variables[i]} ({weather_dict["units"][variables[i]]})'
        current_ax.set(xlabel='Date', ylabel=ylabel)
        current_ax.xaxis_date()

    return fig, ax


# Display the results of the request
weather_dict = get_weather_data(r, 'hourly', ['temperature_2m','precipitation'])
fig, ax = display_weather_data(weather_dict)
plt.show()



# Task 1: simplified

# Request information using online URL builder interface
r = requests.get('https://api.open-meteo.com/v1/forecast?latitude=40.71&longitude=-74.01&hourly=temperature_2m,apparent_temperature,precipitation&start_date=2022-06-08&end_date=2022-06-30')

# Parse the JSON data to a dictionary
data = r.json()

# Get the time, temperature, and precipitation
timestamps = data['daily']['time']
temperature = data['daily']['temperature_2m']
precipitation = data['daily']['precipitation']

# Plot the data
fig, ax = plt.subplots(2, 1)

# Plot temperature and precipitation over time
ax[0].plot(timestamps, temperature)
ax[1].plot(timestamps, precipitation)

# Format the axes
ylabels = ['Temperature (C)', 'Precipitation (mm)']

for i in range(2):
    # Set axis labels
    ax[i].set(xlabel='Date', ylabel=ylabels[i])

    # Only display x-tick every 24 hours
    ax[i].set_xticks(ax[i].get_xticks()[::24])

plt.show()



# Task 2

# Create a dictionary for the parameters I need
frequency = 'daily'
params_dict = {'timezone': 'Europe/London',
               'latitude': 55.9,
               'longitude': -3.2,
               frequency: ['temperature_2m_max' ,'temperature_2m_min']}

# Make a request by passing these parameters
r = requests.get('https://api.open-meteo.com/v1/forecast', params=params_dict)

# Display the results
weather_dict = get_weather_data(r, frequency, params_dict[frequency])
fig, ax = display_weather_data(weather_dict, same_plot=True)
plt.show()



# Task 3

def weather_forecast(city_name):
    '''
    Retrieves and displays a weather forecast for city_name.
    '''
    params_dict = {'name': city_name, 'count': 1}
    city_info = requests.get('https://geocoding-api.open-meteo.com/v1/search', params=params_dict).json()

    # Extract the first result in the list
    city_info = city_info['results'][0]

    # Get latitude, longitude, and time zone
    latitude, longitude = city_info['latitude'], city_info['longitude']
    time_zone = city_info['timezone']

    # Create a dictionary for the parameters I need
    frequency = 'hourly'
    params_dict = {'timezone': time_zone,
                   'latitude': latitude,
                   'longitude': longitude,
                   frequency: ['cloudcover' ,'temperature_2m']}

    # Display the weather forecast
    r = requests.get('https://api.open-meteo.com/v1/forecast', params=params_dict)
    weather_dict = get_weather_data(r, frequency, params_dict[frequency])
    fig, ax = display_weather_data(weather_dict)
    ax[0].set_title(f'Weather forecast in {city_name}')
    plt.show()

weather_forecast('NYC')

KeyError: 'hourly'