<a href="https://colab.research.google.com/github/EdiNel0407/us-ie-big-data-technologies/blob/main/day3/day3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# --- Repair/setup (safe to run anytime) ---
# 1) ensure PySpark is installed
try:
    import pyspark  # noqa
except ImportError:
    !pip -q install pyspark==3.5.1

# 2) start (or reuse) Spark and functions alias F
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

try:
    spark  # noqa
except NameError:
    spark = SparkSession.builder.appName("bdt").getOrCreate()

print("Spark OK ->", spark.version)


Spark OK -> 3.5.1


In [6]:
import pandas as pd
from pathlib import Path

if not Path("rows.csv").exists():
    !wget -q -O rows.csv "https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv"

pdf = pd.read_csv("rows.csv", low_memory=False)
nyc_small = pdf.sample(frac=0.2, replace=False, random_state=1)[[
    "CRASH DATE",
    "CONTRIBUTING FACTOR VEHICLE 1",
    "BOROUGH",
    "VEHICLE TYPE CODE 1",
    "NUMBER OF PERSONS INJURED",
]]

sdf = spark.createDataFrame(nyc_small.fillna("").astype(str)).cache()
_ = sdf.count()
sdf.printSchema()


root
 |-- CRASH DATE: string (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 1: string (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- VEHICLE TYPE CODE 1: string (nullable = true)
 |-- NUMBER OF PERSONS INJURED: string (nullable = true)



In [1]:
!wget -q -O rows.csv "https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv"
import pandas as pd
pd.read_csv("rows.csv").head()


  pd.read_csv("rows.csv").head()


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,11/01/2023,1:29,BROOKLYN,11230.0,40.62179,-73.970024,"(40.62179, -73.970024)",OCEAN PARKWAY,AVENUE K,,...,Unspecified,Unspecified,,,4675373,Moped,Sedan,Sedan,,
3,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
4,09/21/2022,13:21,,,,,,BROOKLYN BRIDGE,,,...,Unspecified,,,,4566131,Station Wagon/Sport Utility Vehicle,,,,


In [2]:
!wget -q -O users_v.csv https://storage.googleapis.com/bdt-beam/users_v.csv
!ls -lh users_v.csv && head -n 5 users_v.csv



-rw-r--r-- 1 root root 141K Nov  4  2021 users_v.csv
user_id,name,gender,age,address,date_joined
1,Anthony Wolf,male,73,New Rachelburgh-VA-49583,2019/03/13
2,James Armstrong,male,56,North Jillianfort-UT-86454,2020/11/06
3,Cody Shaw,male,75,North Anne-SC-53799,2004/05/29
4,Sierra Hamilton,female,76,New Angelafurt-ME-46190,2005/08/26


In [7]:
!wget -q -O rows.csv "https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv"
!ls -lh rows.csv && head -n 3 rows.csv


-rw-r--r-- 1 root root 444M Sep 16 22:46 rows.csv
CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage,Unspecified,,,,4455765,Sedan,Sedan,,,
03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,1,0,0,0,0,0,1,0,Pavement Slippery,,,,,4513547,Sedan,,,,


In [8]:
# Five dates with the most crashes in the sample.
# "CRASH DATE" is stored as an MM/dd/yyyy string, so it is parsed into a date
# column (CRASH_DATE) before counting rows per day.
sdf.withColumn("CRASH_DATE", F.to_date(sdf["CRASH DATE"], "MM/dd/yyyy")) \
   .groupBy("CRASH_DATE") \
   .count() \
   .sort(F.col("count").desc()) \
   .show(5, truncate=False)


+----------+-----+
|CRASH_DATE|count|
+----------+-----+
|2018-11-15|240  |
|2017-12-15|226  |
|2014-01-21|225  |
|2016-09-30|187  |
|2018-03-02|186  |
+----------+-----+
only showing top 5 rows



In [9]:
# Crash counts per borough, largest first.
# Rows whose BOROUGH is the empty string are left out.
(
    sdf.where(sdf["BOROUGH"] != "")
       .groupBy("BOROUGH")
       .count()
       .sort(F.col("count").desc())
       .show(5, truncate=False)
)


+-------------+-----+
|BOROUGH      |count|
+-------------+-----+
|BROOKLYN     |97615|
|QUEENS       |81855|
|MANHATTAN    |67995|
|BRONX        |45287|
|STATEN ISLAND|12820|
+-------------+-----+



In [10]:
# Five most frequent primary contributing factors for crashes in Queens.
# Empty values and any casing of "unspecified" are excluded before counting.
(
    sdf.where(F.col("BOROUGH") == "QUEENS")
       .where(F.col("CONTRIBUTING FACTOR VEHICLE 1") != "")
       .where(F.lower(F.col("CONTRIBUTING FACTOR VEHICLE 1")) != "unspecified")
       .groupBy("CONTRIBUTING FACTOR VEHICLE 1")
       .count()
       .withColumnRenamed("count", "freq")
       .sort(F.col("freq").desc())
       .show(5, truncate=False)
)


+------------------------------+-----+
|CONTRIBUTING FACTOR VEHICLE 1 |freq |
+------------------------------+-----+
|Driver Inattention/Distraction|17655|
|Failure to Yield Right-of-Way |7067 |
|Backing Unsafely              |3988 |
|Following Too Closely         |3019 |
|Passing or Lane Usage Improper|2304 |
+------------------------------+-----+
only showing top 5 rows



In [11]:
# Top three (vehicle type, borough) pairs by average number of persons injured.
# The injured count is a string column here, so it is cast to int first.
# No minimum group size is applied, so a pair backed by a single crash can
# rank first.
(
    sdf.withColumn("NUM_INJ", sdf["NUMBER OF PERSONS INJURED"].astype("int"))
       .groupBy("VEHICLE TYPE CODE 1", "BOROUGH")
       .agg(F.mean("NUM_INJ").alias("avg_injuries"))
       .sort(F.col("avg_injuries").desc())
       .show(3, truncate=False)
)


+-------------------+--------+------------+
|VEHICLE TYPE CODE 1|BOROUGH |avg_injuries|
+-------------------+--------+------------+
|FRONT              |BROOKLYN|12.0        |
|TOWER              |BRONX   |5.0         |
|Amb                |BROOKLYN|4.0         |
+-------------------+--------+------------+
only showing top 3 rows

