Data preparation

In [33]:
import csv

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Location of the crimes CSV, relative to the notebook's working directory.
CRIMES_CSV_PATH = "sample_data/Crimes_-_2001_to_Present.csv"


def parse_csv_line(line):
    """Split one line of CSV text into a list of field strings.

    The standard ``csv`` reader is used instead of ``str.split(",")`` so that
    a comma inside a double-quoted field stays part of that field rather than
    shifting every later column by one. A blank line yields an empty list.
    """
    return next(csv.reader([line]), [])


# Raw text lines get their own name; `data_rdd` is reserved for parsed rows.
raw_lines_rdd = sc.textFile(CRIMES_CSV_PATH)
split_data_rdd = raw_lines_rdd.map(parse_csv_line)

# The first parsed row is the column header; drop every row equal to it.
header = split_data_rdd.first()
data_rdd = split_data_rdd.filter(lambda row: row != header)

# Preview a handful of parsed rows as a sanity check.
for row in data_rdd.take(5):
    print(row)


['11037294', 'JA371270', '03/18/2015 12:00:00 PM', '0000X W WACKER DR', '1153', 'DECEPTIVE PRACTICE', 'FINANCIAL IDENTITY THEFT OVER $ 300', 'BANK', 'false', 'false', '0111', '001', '42', '32', '11', '', '', '2015', '08/01/2017 03:52:26 PM', '', '', '']
['11646293', 'JC213749', '12/20/2018 03:00:00 PM', '023XX N LOCKWOOD AVE', '1154', 'DECEPTIVE PRACTICE', 'FINANCIAL IDENTITY THEFT $300 AND UNDER', 'APARTMENT', 'false', 'false', '2515', '025', '36', '19', '11', '', '', '2018', '04/06/2019 04:04:43 PM', '', '', '']
['11645836', 'JC212333', '05/01/2016 12:25:00 AM', '055XX S ROCKWELL ST', '1153', 'DECEPTIVE PRACTICE', 'FINANCIAL IDENTITY THEFT OVER $ 300', '', 'false', 'false', '0824', '008', '15', '63', '11', '', '', '2016', '04/06/2019 04:04:43 PM', '', '', '']
['11645959', 'JC211511', '12/20/2018 04:00:00 PM', '045XX N ALBANY AVE', '2820', 'OTHER OFFENSE', 'TELEPHONE THREAT', 'RESIDENCE', 'false', 'false', '1724', '017', '33', '14', '08A', '', '', '2018', '04/06/2019 04:04:43 PM', '',

Analysis


In [34]:
# Count the occurrences of each primary type.
# Field position of the primary type within a parsed row.
PRIMARY_TYPE_COL = 5

# Rows too short to carry the field are bucketed as 'Unknown' instead of
# raising IndexError, matching the fallback used by the later cells.
primary_types_rdd = data_rdd.map(
    lambda row: row[PRIMARY_TYPE_COL] if len(row) > PRIMARY_TYPE_COL else 'Unknown'
)
primary_type_counts = primary_types_rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

# Get the top 10 most frequent primary types; negating the count makes
# takeOrdered (which returns the smallest keys) return the largest counts.
top_primary_types = primary_type_counts.takeOrdered(10, key=lambda x: -x[1])

print("Top 10 Primary Types:")
for primary_type, count in top_primary_types:
    print(primary_type, ":", count)


Top 10 Primary Types:
THEFT : 2751
BATTERY : 2358
CRIMINAL DAMAGE : 1542
DECEPTIVE PRACTICE : 1162
ASSAULT : 1027
MOTOR VEHICLE THEFT : 795
OTHER OFFENSE : 785
NARCOTICS : 691
BURGLARY : 539
ROBBERY : 526


In [35]:
# Field positions within a parsed row, and the year under study.
YEAR_COL = 17
LOCATION_DESC_COL = 7
TARGET_YEAR = '2023'

# Keep only rows long enough to carry a year field whose value is the target year.
crimes_2023_rdd = data_rdd.filter(
    lambda row: len(row) > YEAR_COL and row[YEAR_COL] == TARGET_YEAR
)

# Every surviving row has more than YEAR_COL fields, so the location
# description index is always present and needs no length check. A blank
# description is reported as 'Unknown' rather than as an empty label.
location_description_rdd = crimes_2023_rdd.map(
    lambda row: (row[LOCATION_DESC_COL] or 'Unknown', 1)
)

# Count the occurrences of each location description.
crime_count_by_location = location_description_rdd.reduceByKey(lambda x, y: x + y)

# Sort by count, highest first; ties are broken alphabetically by location
# so the listing is identical from run to run.
sorted_crime_count_by_location = crime_count_by_location.sortBy(lambda x: (-x[1], x[0]))

# Collect the results to the driver (one entry per distinct location).
location_counts = sorted_crime_count_by_location.collect()

# Print the count of crimes for each unique location.
for location, count in location_counts:
    print(f"Location: {location}, Crime Count: {count}")


Location: STREET, Crime Count: 1457
Location: APARTMENT, Crime Count: 982
Location: RESIDENCE, Crime Count: 655
Location: SIDEWALK, Crime Count: 262
Location: PARKING LOT / GARAGE (NON RESIDENTIAL), Crime Count: 199
Location: SMALL RETAIL STORE, Crime Count: 131
Location: ALLEY, Crime Count: 118
Location: RESTAURANT, Crime Count: 93
Location: GAS STATION, Crime Count: 72
Location: OTHER (SPECIFY), Crime Count: 70
Location: RESIDENCE - YARD (FRONT / BACK), Crime Count: 69
Location: RESIDENCE - GARAGE, Crime Count: 66
Location: DEPARTMENT STORE, Crime Count: 65
Location: VEHICLE NON-COMMERCIAL, Crime Count: 62
Location: COMMERCIAL / BUSINESS OFFICE, Crime Count: 59
Location: RESIDENCE - PORCH / HALLWAY, Crime Count: 59
Location: PARK PROPERTY, Crime Count: 52
Location: GROCERY FOOD STORE, Crime Count: 51
Location: SCHOOL - PUBLIC GROUNDS, Crime Count: 43
Location: SCHOOL - PUBLIC BUILDING, Crime Count: 41
Location: BAR OR TAVERN, Crime Count: 38
Location: CTA TRAIN, Crime Count: 28
Locat

In [38]:
# Field position of the district within a parsed row.
DISTRICT_COL = 11

# Emit one (district, 1) pair per crime. Rows with no district value are
# skipped: bucketing them under a placeholder key would add a fake
# "district" to the per-district mean and standard deviation below.
crime_counts_by_district = (
    data_rdd
    .filter(lambda row: len(row) > DISTRICT_COL and row[DISTRICT_COL] != '')
    .map(lambda row: (row[DISTRICT_COL], 1))
)

# Reduce by key to count the total number of crimes in each district.
total_crimes_by_district = crime_counts_by_district.reduceByKey(lambda x, y: x + y)

crime_counts = total_crimes_by_district.map(lambda x: x[1])

# Compute both statistics from a single stats() pass, so the upstream
# shuffle runs once instead of once per statistic.
# stdev() is the population standard deviation (sampleStdev() is the sample one).
district_stats = crime_counts.stats()
mean_crime_count = district_stats.mean()
std_dev_crime_count = district_stats.stdev()

print("Mean Crime Count per District:", mean_crime_count)
print("Standard Deviation of Crime Count per District:", std_dev_crime_count)


Mean Crime Count per District: 353.57894736842104
Standard Deviation of Crime Count per District: 309.65758946727186
