In [7]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

#Step-1 : Load the dataset into a Pandas DataFrame
df = pd.read_csv('taxi_zones.csv')

#Display the first few rows of the dataset
print("Sample of the Dataset:")
print(df.head())

#Step-2 : Volume - How large is the data?

print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
#deep=True also counts the bytes held by object (string) columns
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

#Step-3 : Variety - what kinds of columns does the data hold?
print(df.dtypes)
#info must be *called*: print(df.info) only shows the bound method's repr.
#info() writes its summary to stdout itself and returns None, so no print().
df.info()


Sample of the Dataset:
              X              Y  OBJECTID  Shape_Leng  Shape_Area  \
0  9.352230e+05  190535.052575         1    0.116357    0.000782   
1  1.032516e+06  167292.493195         2    0.433470    0.004866   
2  1.025883e+06  254779.600631         3    0.084341    0.000314   
3  9.906188e+05  203105.532318         4    0.043567    0.000112   
4  9.314680e+05  139837.478389         5    0.092146    0.000498   

                      zone  LocationID        borough   Latitude  Longitude  
0           Newark Airport           1            EWR  35.475594 -83.290731  
1              Jamaica Bay           2         Queens  35.742300 -90.562330  
2  Allerton/Pelham Gardens           3          Bronx  33.293157 -86.767961  
3            Alphabet City           4      Manhattan  40.724545 -73.979050  
4            Arden Heights           5  Staten Island  39.809279 -75.486587  
Number of Rows: 263
Number of Columns: 10
Memory Usage: 0.05 MB
X             float64
Y             

In [None]:
#Step-4 : Velocity - simulate a stream by emitting small random batches over time
for i in range(5):                        #Stream 5 batches
  sample_data = df.sample(2)              #Each batch is 2 random rows of df
  #Braces (not parentheses) so the 1-based batch number is interpolated
  print(f"\nBatch {i+1}:\n", sample_data)
  time.sleep(2)                           #Wait for 2 seconds to simulate streaming


Batch (i+1):
                  X              Y  OBJECTID  Shape_Leng  Shape_Area  \
67   984266.581407  211937.944079        68    0.049337    0.000111   
216  996215.257198  195492.216142       217    0.055391    0.000115   

                   zone  LocationID    borough   Latitude  Longitude  
67         East Chelsea          68  Manhattan  42.391760 -71.032830  
216  South Williamsburg         217   Brooklyn  28.414451 -81.442849  

Batch (i+1):
                 X              Y  OBJECTID  Shape_Leng  Shape_Area  \
202  1.057297e+06  179660.443813       203    0.189938    0.000615   
92   1.028064e+06  208639.269516        93    0.170998    0.000594   

                             zone  LocationID borough   Latitude   Longitude  
202                      Rosedale         203  Queens  35.468183 -118.779649  
92   Flushing Meadows-Corona Park          93  Queens  39.112510 -100.358750  

Batch (i+1):
                 X              Y  OBJECTID  Shape_Leng  Shape_Area  \
111  9.985

In [None]:
#Install PySpark in Google Colab
!pip install pyspark



In [2]:
#Import PySpark session
from pyspark.sql import SparkSession

#Obtain a SparkSession for this notebook (getOrCreate returns the existing
#session if one is already active, otherwise it builds a new one)
spark = (
    SparkSession.builder
    .appName("Big Data Tools Overview")
    .getOrCreate()
)

#Small in-memory demo data: one (name, age) tuple per row
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]
columns = ["Name", "Age"]

#NOTE(review): this rebinds `df`, which the first cell uses for the pandas
#taxi-zones frame - re-run that cell before using pandas-only methods on df
df = spark.createDataFrame(data, schema=columns)

#Print the rows as a text table
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [9]:
from collections import Counter

#Toy corpus: each string plays the role of one input record
data = ["Big Data ecosystem", "Data architecture", "Big Data storage"]

#Map phase: emit a (word, 1) pair for every whitespace-separated token
mapped = [(token, 1) for record in data for token in record.split()]

#Reduce phase: sum the emitted values per word
reduced = Counter()
for token, emitted in mapped:
  reduced[token] = reduced[token] + emitted

#Report the totals in first-seen order
for word, count in reduced.items():
  print(f"{word}: {count}")

Big: 2
Data: 3
ecosystem: 1
architecture: 1
storage: 1
