In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ArrestAnalysis") \
    .config("spark.master", "local") \
    .getOrCreate()


24/02/25 19:10:25 WARN Utils: Your hostname, Aknars-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.20 instead (on interface en0)
24/02/25 19:10:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/25 19:10:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Assuming the data is already uploaded in CSV format and accessible either in HDFS or the local file system
path = "NYPD_Arrest_Data__Year_to_Date__20240225.csv"


In [3]:
# Reading data into an RDD
rdd = spark.sparkContext.textFile(path)

# Skipping the title
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)


In [4]:
def parseLine(line):
    fields = line.split(',')
    try:
        age_group = fields[11]  # Assuming the age group is in the 12th column
        arrest_boro = fields[8]  # Assuming the area of arrest is in the 9th column
        offense_type = fields[5]  # Assuming the type of crime is in the 6th column
        return offense_type, (arrest_boro, age_group, 1)
    except IndexError:
        return ()


In [5]:
# Data processing
parsedData = rdd.map(parseLine).filter(lambda x: x)

# Aggregating data by crime type and counting cases by boroughs and age groups
offenseSummary = parsedData.map(lambda x: (x[0], x[1][2])).reduceByKey(lambda x, y: x + y)


In [6]:
# Conclusion: Identifying the top 5 most frequent crimes
topOffenses = offenseSummary.takeOrdered(5, key=lambda x: -x[1])
print("Top 5 crimes by number of arrests:")
for offense in topOffenses:
    print(offense)


Top 5 crimes by number of arrests:
('ASSAULT 3 & RELATED OFFENSES', 28055)
('"', 23820)
('UNCLASSIFIED"', 15498)
('348', 11925)
('105', 11071)
