# Team 3 Big Data Project

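## Project Overview

The opening cells of this notebook combine the CSV files stored in the sub-folders of `data/` into a single file (`data/combined_pandas.csv`), load that file into a pandas DataFrame, and fill in missing values.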


In [1]:
import os
import pandas as pd

In [3]:
data_folder = "data/"
output_path = "data/combined_pandas.csv"


def find_csv_files(root):
    """Return the sorted paths of every ``.csv`` file one level below ``root``.

    Only sub-folders of ``root`` are searched. Plain files sitting directly in
    ``root`` are skipped, which matters because the combined output is written
    into ``data/`` itself: without the check a re-run would call
    ``os.listdir`` on that file and fail with ``NotADirectoryError``.

    Args:
        root: Folder whose immediate sub-folders contain the CSV files.

    Returns:
        A list of file paths, sorted by folder name and then by file name so
        the row order of the combined file is reproducible. The list is empty
        when ``root`` is not a directory or holds no CSV files.
    """
    if not os.path.isdir(root):
        return []

    found = []
    # os.listdir() returns entries in arbitrary order, hence the sorting.
    for folder in sorted(os.listdir(root)):
        folder_path = os.path.join(root, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".csv"):
                found.append(os.path.join(folder_path, file))
    return found


def combine_csv_files(paths, destination):
    """Concatenate the CSV files in ``paths`` and write them to ``destination``.

    Args:
        paths: CSV file paths; each file's first row is used as its header.
        destination: Path of the single CSV file to write (without the index).

    Returns:
        The combined DataFrame, with a fresh 0..n-1 index.

    Raises:
        Whatever ``pd.read_csv`` / ``DataFrame.to_csv`` raise. Errors are not
        swallowed: the next cell reads ``destination``, so a failure here must
        stop the notebook instead of letting it load a stale or missing file.
    """
    frames = [pd.read_csv(path, header=0) for path in paths]
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(destination, index=False)
    return combined


csv_files = find_csv_files(data_folder)

if csv_files:
    combined_df = combine_csv_files(csv_files, output_path)
    print(f"Combined CSV saved to {output_path}")
else:
    print(f"No CSV files found under {data_folder}")

Combined CSV saved to data/combined_pandas.csv


In [2]:
# Reload the combined dataset from disk; the first row of the file holds the column names.
df = pd.read_csv(
    "data/combined_pandas.csv",
    header=0,
)
df

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.492876,51.422716,On or near Trajectus Way,E01014399,Bath and North East Somerset 001A,Anti-social behaviour,,
1,,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.510372,51.423406,On or near Durley Lane,E01014399,Bath and North East Somerset 001A,Anti-social behaviour,,
2,,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.491616,51.424619,On or near Julius Place,E01014399,Bath and North East Somerset 001A,Anti-social behaviour,,
3,2eff3ea72aa91fdd06a5fbbb1fb79cb592a6cd5725b4b0...,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.511967,51.412987,On or near Heathfield Close,E01014399,Bath and North East Somerset 001A,Burglary,Status update unavailable,
4,cd58673ba2ea0e0c8aefc2395205efb6ef79a64922a430...,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.513901,51.418814,On or near Stockwood Hill,E01014399,Bath and North East Somerset 001A,Criminal damage and arson,Status update unavailable,
...,...,...,...,...,...,...,...,...,...,...,...,...
212074,e94ec744f2eb831e7155d55fb31f2ae16f748a4324c94d...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No Location,,,Other crime,Under investigation,
212075,3996680500fa7f7cc35ac4e0cba8a89feacba39f049a61...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No Location,,,Other crime,Under investigation,
212076,15062da79af0d04d8ad7d7178910726ebf5e40744d9bcd...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No Location,,,Other crime,Under investigation,
212077,04f2a5313893fea06ceae7b22e8d17f78a0dffcb69e52b...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No Location,,,Other crime,Under investigation,


In [4]:
# Handle Missing Values
# Every step below only fills cells that are NaN, so re-running this cell
# leaves an already-cleaned frame unchanged.

# Crime ID: rows without an ID get a generated one based on their 1-based row
# position. The generated IDs are built on df's own index so fillna() lines
# them up with the right rows whatever the index looks like, and they are
# strings because the real IDs are strings: filling with integers would leave
# a mixed int/str column that cannot be sorted and changes type when the
# frame is written to CSV and read back.
generated_ids = pd.Series(range(1, len(df) + 1), index=df.index).astype(str)
df['Crime ID'] = df['Crime ID'].fillna(generated_ids)

# Longitude / Latitude: replace missing coordinates with the column mean.
# NOTE(review): every row without coordinates (e.g. Location == "No Location")
# ends up on the same mean point, which can skew maps and distance-based
# analysis; confirm this is intended before relying on these coordinates.
for coordinate in ('Longitude', 'Latitude'):
    df[coordinate] = df[coordinate].fillna(df[coordinate].mean())

# Crime type / Last outcome category: label missing values as 'Other'.
for category in ('Crime type', 'Last outcome category'):
    df[category] = df[category].fillna('Other')

df

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,1,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.492876,51.422716,On or near Trajectus Way,E01014399,Bath and North East Somerset 001A,Anti-social behaviour,Other,
1,2,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.510372,51.423406,On or near Durley Lane,E01014399,Bath and North East Somerset 001A,Anti-social behaviour,Other,
2,3,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.491616,51.424619,On or near Julius Place,E01014399,Bath and North East Somerset 001A,Anti-social behaviour,Other,
3,2eff3ea72aa91fdd06a5fbbb1fb79cb592a6cd5725b4b0...,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.511967,51.412987,On or near Heathfield Close,E01014399,Bath and North East Somerset 001A,Burglary,Status update unavailable,
4,cd58673ba2ea0e0c8aefc2395205efb6ef79a64922a430...,2024-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.513901,51.418814,On or near Stockwood Hill,E01014399,Bath and North East Somerset 001A,Criminal damage and arson,Status update unavailable,
...,...,...,...,...,...,...,...,...,...,...,...,...
212074,e94ec744f2eb831e7155d55fb31f2ae16f748a4324c94d...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.677138,51.338961,No Location,,,Other crime,Under investigation,
212075,3996680500fa7f7cc35ac4e0cba8a89feacba39f049a61...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.677138,51.338961,No Location,,,Other crime,Under investigation,
212076,15062da79af0d04d8ad7d7178910726ebf5e40744d9bcd...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.677138,51.338961,No Location,,,Other crime,Under investigation,
212077,04f2a5313893fea06ceae7b22e8d17f78a0dffcb69e52b...,2025-05,Avon and Somerset Constabulary,Avon and Somerset Constabulary,-2.677138,51.338961,No Location,,,Other crime,Under investigation,
