<a href="https://colab.research.google.com/github/CheolsoonIm/Android-Cheat-sheet/blob/master/pandasVsSpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the January 2019 NYC yellow-taxi trip file (gzip-compressed CSV)
# into the working directory so the cells below can read it by name.

import os
import urllib.request

url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-01.csv.gz"
file_path = "yellow_tripdata_2019-01.csv.gz"

if os.path.exists(file_path):
    # Re-running the notebook should not fetch the archive again.
    print("Dataset already present, skipping download.")
else:
    # Fetch under a temporary name and rename only after the transfer has
    # finished, so an interrupted download never leaves a truncated file
    # under the final name (which the existence check above would accept).
    partial_path = file_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)
    print("Dataset downloaded!")


Dataset downloaded!


In [2]:
import pandas as pd
import time

# Pandas benchmark: time the load, a row filter, and a column mean.
# Intervals are measured with time.perf_counter(), the clock meant for
# elapsed-time measurement: it is monotonic and high-resolution, whereas
# time.time() follows the wall clock and can jump if the system time changes.

# Load dataset
start_time = time.perf_counter()
df = pd.read_csv("yellow_tripdata_2019-01.csv.gz", compression='gzip')
load_time = time.perf_counter() - start_time
print(f"Pandas: Data loaded in {load_time:.2f} seconds")

# Filter trips longer than 10 miles
start_time = time.perf_counter()
long_trips = df[df['trip_distance'] > 10]
filter_time = time.perf_counter() - start_time
print(f"Pandas: Filter operation took {filter_time:.2f} seconds")

# Calculate average trip distance (over all trips, not only the long ones)
start_time = time.perf_counter()
avg_distance = df['trip_distance'].mean()
agg_time = time.perf_counter() - start_time
print(f"Pandas: Aggregation took {agg_time:.2f} seconds")
print(f"Pandas: Average trip distance is {avg_distance:.2f} miles")



Pandas: Data loaded in 34.62 seconds
Pandas: Filter operation took 0.16 seconds
Pandas: Aggregation took 0.02 seconds
Pandas: Average trip distance is 2.80 miles


In [3]:
# Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
import time

# Spark benchmark: the same load / filter / mean steps as the pandas cell.
# Intervals are measured with time.perf_counter(), the clock meant for
# elapsed-time measurement (monotonic, high resolution), instead of the
# wall-clock time.time().

# to initialize SparkSession (reuses an existing session if one is running)
spark = SparkSession.builder.appName("NYC_Taxi_Comparison").getOrCreate()

# to load dataset
# inferSchema=True makes Spark scan the data to work out column types, so this
# "load" timing includes that extra pass.
start_time = time.perf_counter()
df = spark.read.csv("yellow_tripdata_2019-01.csv.gz", header=True, inferSchema=True)
load_time = time.perf_counter() - start_time
print(f"Spark: Data loaded in {load_time:.2f} seconds")

# NOTE(review): df is never cached, so the two timings below presumably each
# include re-reading and re-parsing the CSV - confirm before comparing them
# directly with the in-memory pandas numbers.

# to filter trips longer than 10 miles
start_time = time.perf_counter()
long_trips = df.filter(df['trip_distance'] > 10)
long_trips_count = long_trips.count()  # to trigger computation
filter_time = time.perf_counter() - start_time
print(f"Spark: Filter operation took {filter_time:.2f} seconds")

# To calculate average trip distance
start_time = time.perf_counter()
# collect() brings the one-row, one-column result to the driver; [0][0] unwraps it.
avg_distance = df.select(mean('trip_distance')).collect()[0][0]
agg_time = time.perf_counter() - start_time
print(f"Spark: Aggregation took {agg_time:.2f} seconds")
print(f"Spark: Average trip distance is {avg_distance:.2f} miles")

Spark: Data loaded in 56.13 seconds
Spark: Filter operation took 18.98 seconds
Spark: Aggregation took 17.65 seconds
Spark: Average trip distance is 2.80 miles


Pandas: Data loaded in 11.69 seconds
Pandas: Filter operation took 0.15 seconds
Pandas: Aggregation took 0.02 seconds
Pandas: Average trip distance is 2.80 miles
Spark: Data loaded in 19.65 seconds
Spark: Filter operation took 8.17 seconds
Spark: Aggregation took 9.04 seconds
Spark: Average trip distance is 2.80 miles




In [4]:
# To compare memory usage

# Install the required library first if it is missing (uncomment to run):
# %pip install psutil

In [6]:
# To monitor memory usage in Pandas
import pandas as pd
import psutil
import time
import os

# Function to get memory usage in MB
def get_memory_usage():
    """Return the current process's resident set size (RSS) in MB.

    The byte count reported by psutil is divided by 1024 * 1024, so "MB"
    here means 1,048,576 bytes.
    """
    bytes_per_mb = 1024 * 1024
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / bytes_per_mb
# Measure elapsed time and resident-memory (RSS) change for each pandas step.
# Every step runs exactly once in this cell: loading the file a second time
# would take the baseline while the previous frame is still resident and make
# the reported deltas unreliable.
# Intervals use time.perf_counter(), the monotonic high-resolution clock meant
# for elapsed-time measurement.

# To track memory before loading
mem_before = get_memory_usage()

# To load dataset
start_time = time.perf_counter()
df = pd.read_csv("yellow_tripdata_2019-01.csv.gz", compression='gzip')
load_time = time.perf_counter() - start_time
mem_after_load = get_memory_usage()

# To filter trips longer than 10 miles
start_time = time.perf_counter()
long_trips = df[df['trip_distance'] > 10]
filter_time = time.perf_counter() - start_time
mem_after_filter = get_memory_usage()

# To calculate average trip distance
start_time = time.perf_counter()
avg_distance = df['trip_distance'].mean()
agg_time = time.perf_counter() - start_time
mem_after_agg = get_memory_usage()

# To print results
# Each "Memory" figure is the RSS difference across that step; it is negative
# when the process's RSS shrank during the step.
# The messages are built from adjacent string literals rather than a backslash
# continuation inside the literal, which would embed the source indentation in
# the printed text.
print(f"Pandas: Data loaded in {load_time:.2f} seconds, "
      f"Memory: {mem_after_load - mem_before:.2f} MB")
print(f"Pandas: Filter operation took {filter_time:.2f} seconds, "
      f"Memory: {mem_after_filter - mem_after_load:.2f} MB")
print(f"Pandas: Aggregation took {agg_time:.2f} seconds, "
      f"Memory: {mem_after_agg - mem_after_filter:.2f} MB")



Pandas: Data loaded in 22.93 seconds,       Memory: 2969.20 MB
Pandas: Filter operation took 0.51 seconds,       Memory: -2866.45 MB
Pandas: Aggregation took 0.02 seconds,       Memory: 0.00 MB


In [7]:
# TODO: monitor memory usage in Spark (not implemented yet)

In [8]:
# TODO: repeat the comparison with a bigger dataset (> 1 GB?)