In [1]:
import pynvml
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
print(f'Total memory: {info.total / 1024**3:.2f} GB')
print(f'Free memory: {info.free / 1024**3:.2f} GB')
print(f'Used memory: {info.used / 1024**3:.2f} GB')
import dask_cudf
import cupy as cp
import rmm
import cudf
import gc


Total memory: 8.00 GB
Free memory: 7.60 GB
Used memory: 0.40 GB


In [2]:
def check_gpu():
    try:
        # Initialize RMM to manage GPU memory
        rmm.reinitialize()

        # Check if cuDF can create a DataFrame
        df = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
        print("cuDF DataFrame created successfully:")
        print(df)

        # If we reach this point, GPU is being used correctly
        print("GPU is available and cuDF is functioning properly.")

    except Exception as e:
        print("An error occurred while checking the GPU with cuDF:")
        print(e)

# Run the check
check_gpu()

cuDF DataFrame created successfully:
   a  b
0  1  4
1  2  5
2  3  6
GPU is available and cuDF is functioning properly.


In [3]:
import cudf.pandas
cudf.pandas.install()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
taxi_busyness_data= dask_cudf.read_csv('2020_Yellow_Taxi_Trip_Data_20240522.csv', blocksize="100MB")
info = pynvml.nvmlDeviceGetMemoryInfo(handle)

print(f'Used memory: {info.used / 1024**3:.2f} GB')

Used memory: 0.52 GB


In [4]:
taxi_busyness_data.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge'],
      dtype='object')

In [5]:
taxi_busyness_data = taxi_busyness_data[["VendorID","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","PULocationID","DOLocationID"]]
print(taxi_busyness_data.columns)
rows = taxi_busyness_data.shape[0].compute()
print(rows)
print(f'Used memory: {info.used / 1024**3:.2f} GB')

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'PULocationID', 'DOLocationID'],
      dtype='object')
24648499
Used memory: 0.52 GB


In [6]:


for column in taxi_busyness_data.columns:
    print(f"Value counts for {column}:")
    value_counts = taxi_busyness_data[column].value_counts().compute()
    print(value_counts)
    print("\n")
    # print(f"Unique values for {column}:")
    
    # Replace null values with a placeholder or drop them
    # taxi_busyness_data[column] = taxi_busyness_data[column].fillna(-1)  # Adjust based on data type and context
    # print("hi")
    
    # unique_values = taxi_busyness_data[column].unique_count()
    # print(unique_values)
    # print("\n")

taxi_busyness_data.nunique().compute()
info = pynvml.nvmlDeviceGetMemoryInfo(handle)

print(f'Used memory: {info.used / 1024**3:.2f} GB')
gc.collect()

Value counts for VendorID:
VendorID
2    15908366
1     7930565
Name: count, dtype: int64


Value counts for tpep_pickup_datetime:
tpep_pickup_datetime
01/09/2020 04:53:00 PM    21
01/06/2020 08:13:00 AM    20
01/09/2020 06:13:00 PM    20
01/09/2020 07:51:00 AM    18
01/09/2020 08:15:00 AM    18
                          ..
12/31/2020 12:59:46 PM     1
12/31/2020 12:59:51 PM     1
12/31/2020 12:59:52 PM     1
12/31/2020 12:59:53 PM     1
12/31/2020 12:59:54 PM     1
Name: count, Length: 11775710, dtype: int64


Value counts for tpep_dropoff_datetime:
tpep_dropoff_datetime
01/12/2020 12:00:00 AM    69
01/11/2020 12:00:00 AM    67
02/23/2020 12:00:00 AM    64
01/03/2020 12:00:00 AM    59
01/02/2020 12:00:00 AM    58
                          ..
12/31/2020 12:59:48 PM     1
12/31/2020 12:59:49 PM     1
12/31/2020 12:59:53 PM     1
12/31/2020 12:59:55 PM     1
12/31/2020 12:59:58 PM     1
Name: count, Length: 11776107, dtype: int64


Value counts for passenger_count:
passenger_count
1    1

234

In [7]:
taxi_busyness_data.head(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,PULocationID,DOLocationID
0,1,01/01/2020 12:28:15 AM,01/01/2020 12:33:03 AM,1,238,239
1,1,01/01/2020 12:35:39 AM,01/01/2020 12:43:04 AM,1,239,238
2,1,01/01/2020 12:47:41 AM,01/01/2020 12:53:52 AM,1,238,238
3,1,01/01/2020 12:55:23 AM,01/01/2020 01:00:14 AM,1,238,151
4,2,01/01/2020 12:01:58 AM,01/01/2020 12:04:16 AM,1,193,193


In [8]:
print(cudf.__version__)

24.04.01


In [9]:
pandas_dol=taxi_busyness_data["DOLocationID"].compute().to_pandas()
pandas_pul=taxi_busyness_data["PULocationID"].compute().to_pandas()
unique_in_pul = pandas_pul[~pandas_pul.isin(pandas_dol)].unique()
print(pandas_pul.shape,pandas_dol.shape)
# Find unique elements in pandas_dol that are not in pandas_pul
unique_in_dol = pandas_dol[~pandas_dol.isin(pandas_pul)].unique()
print(unique_in_pul)

print(unique_in_pul.shape,unique_in_dol.shape)
print("Unique elements in pandas_pul but not in pandas_dol:", unique_in_pul)
print("Unique elements in pandas_dol but not in pandas_pul:", unique_in_dol)

(24648499,) (24648499,)
[]
(0,) (1,)
Unique elements in pandas_pul but not in pandas_dol: []
Unique elements in pandas_dol but not in pandas_pul: [104]


In [10]:

# Compute the Dask cuDF DataFrame to bring it into memory as a cuDF DataFrame
computed_df = taxi_busyness_data.compute()

# Access the first element in the 'tpep_dropoff_datetime' column
first_element = computed_df["tpep_dropoff_datetime"].iloc[0]
print(first_element)
# Get the type of the first element
element_type = type(first_element)

print(f"Type of the first element in 'tpep_dropoff_datetime': {element_type}")
info = pynvml.nvmlDeviceGetMemoryInfo(handle)

print(f'Used memory: {info.used / 1024**3:.2f} GB')
gc.collect()

01/01/2020 12:33:03 AM
Type of the first element in 'tpep_dropoff_datetime': <class 'str'>
Used memory: 4.92 GB


3437

In [11]:
for column in taxi_busyness_data.columns:
    non_null_count = taxi_busyness_data[column].dropna().shape[0].compute()
    print(f"Non-null count for column {column}: {rows - non_null_count}")

info = pynvml.nvmlDeviceGetMemoryInfo(handle)

print(f'Used memory: {info.used / 1024**3:.2f} GB')

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /opt/conda/conda-bld/work/include/rmm/mr/device/cuda_memory_resource.hpp:60: cudaErrorMemoryAllocation out of memory

In [10]:
taxi_filled_na = taxi_busyness_data.fillna({"passenger_count": 1})

type(taxi_filled_na)
gc.collect()

831

In [None]:
import rmm
gc.collect()
# Clear RMM cache
rmm.reinitialize(pool_allocator=True, managed_memory=False)

In [6]:
non_null_count = taxi_filled_na["passenger_count"].dropna().shape[0].compute()
print(f"Non-null count for column passenger_count: {rows - non_null_count}")

NameError: name 'taxi_filled_na' is not defined

In [None]:
max_value = taxi_filled_na['passenger_count'].max().compute()
print("Maximum value in column 'passenger_count':", max_value)

taxi_filled_na.head(5)

Maximum value in column 'passenger_count': 9


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,PULocationID,DOLocationID
0,1,01/01/2020 12:28:15 AM,01/01/2020 12:33:03 AM,1,238,239
1,1,01/01/2020 12:35:39 AM,01/01/2020 12:43:04 AM,1,239,238
2,1,01/01/2020 12:47:41 AM,01/01/2020 12:53:52 AM,1,238,238
3,1,01/01/2020 12:55:23 AM,01/01/2020 01:00:14 AM,1,238,151
4,2,01/01/2020 12:01:58 AM,01/01/2020 12:04:16 AM,1,193,193


In [None]:
# Example: Creating a cuDF DataFrame
gdf = cudf.DataFrame({
    'a': [1, 2, None, 4],
    'b': [None, 2, 3, 4]
})

# Convert cuDF DataFrame to Dask-cuDF DataFrame
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# Replace NA values
ddf_filled = ddf.fillna({'a': 0, 'b': -1})

# Compute the result
result = ddf_filled.compute()

print(result)


     a    b
0  1.0 -1.0
1  2.0  2.0
2  0.0  3.0
3  4.0  4.0


In [None]:
del taxi_busyness_data
gc.collect()
# Clear RMM cache
rmm.reinitialize(pool_allocator=True, managed_memory=False)

MemoryError: std::bad_alloc: out_of_memory: RMM failure at:/opt/conda/conda-bld/work/include/rmm/mr/device/pool_memory_resource.hpp:424: Maximum pool size exceeded

In [12]:
taxi_busyness_data['tpep_pickup_datetime'] = cudf.to_datetime(taxi_busyness_data['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p')
# taxi_busyness_data['tpep_dropoff_datetime'] = cudf.to_datetime(taxi_busyness_data['tpep_dropoff_datetime'], format='%m/%d/%Y %I:%M:%S %p').to_cupy()

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /opt/conda/conda-bld/work/include/rmm/mr/device/cuda_memory_resource.hpp:60: cudaErrorMemoryAllocation out of memory

In [14]:
taxi_filled_na.to_csv('saved_csvs/taxi_trip.csv', single_file=True)

['/home/eamonn-walsh/Documents/Summer-Project/saved_csvs/taxi_trip.csv']

In [None]:
l_o_business= dask_cudf.read_csv('Legally_Operating_Businesses_20240527.csv', blocksize="100MB")
arrest_data = dask_cudf.read_csv('NYPD_Arrest_Data__Year_to_Date__20240529.csv', blocksize="100MB")
mta_station_data = dask_cudf.read_csv('MTA_Subway_Stations.csv', blocksize="100MB")
subway_surfers = dask_cudf.read_csv('MTA_Subway_Hourly_Ridership__Beginning_February_2022_20240531.csv', blocksize="100MB") 

In [None]:
l_o_business.columns

Index(['DCA License Number', 'License Type', 'License Expiration Date',
       'License Status', 'License Creation Date', 'Industry', 'Business Name',
       'Business Name 2', 'Address Building', 'Address Street Name',
       'Secondary Address Street Name', 'Address City', 'Address State',
       'Address ZIP', 'Contact Phone Number', 'Address Borough',
       'Borough Code', 'Community Board', 'Council District', 'BIN', 'BBL',
       'NTA', 'Census Tract', 'Detail', 'Longitude', 'Latitude', 'Location'],
      dtype='object')

In [None]:
arrest_data.columns

Index(['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC',
       'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT',
       'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'X_COORD_CD',
       'Y_COORD_CD', 'Latitude', 'Longitude', 'New Georeferenced Column'],
      dtype='object')

In [None]:
mta_station_data.columns

Index(['GTFS Stop ID', 'Station ID', 'Complex ID', 'Division', 'Line',
       'Stop Name', 'Borough', 'Daytime Routes', 'Structure', 'GTFS Latitude',
       'GTFS Longitude', 'North Direction Label', 'South Direction Label',
       'ADA', 'ADA Northbound', 'ADA Southbound', 'ADA Notes', 'Georeference'],
      dtype='object')

In [None]:
subway_surfers.columns

Index(['transit_timestamp', 'transit_mode', 'station_complex_id',
       'station_complex', 'borough', 'payment_method', 'fare_class_category',
       'ridership', 'transfers', 'latitude', 'longitude', 'Georeference'],
      dtype='object')