# Advanced Database Systems - NTUA - 2023


## Project Scope

## 

### Contributors

Dimitris Vasios 03119404

Thodoris - Angelos Mexis 03118408

### Script to enable the cluster
**Cluster Specification**

Namenode

Datanodes

In [1]:
# Pyspark Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, DateType, DoubleType, TimestampType, TimestampNTZType
from pyspark.sql.functions import col, count, asc, desc
from pyspark.sql.functions import year, month, count, dense_rank, row_number, to_date
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, DateType
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Other Libraries
import subprocess as sp
import pandas as pd

In [None]:
# Download necessary data for the project.
# check_call raises CalledProcessError when the script exits with a non-zero
# status, so a failed download stops the notebook instead of being ignored.
# On success it still returns 0, so the displayed cell value is unchanged.
sp.check_call(['bash', '../scripts/import_data.sh'])

In [None]:
# Start Cluster.
# check_call raises CalledProcessError when the script exits with a non-zero
# status, so a cluster that failed to start is reported here rather than
# surfacing later as an unrelated error. On success it still returns 0.
sp.check_call(['bash', '../scripts/cluster_initiate.sh'])

In [None]:
# Combine Primary Data into one csv.
# Both files are read with dtype=str so pandas copies the values through
# verbatim. With default type inference an integer column that contains blanks
# is loaded as float64 and written back as e.g. "510.0", numeric-looking values
# lose leading zeros, and mixed-type columns raise a DtypeWarning.
# Blank cells are still parsed as missing and written out as empty fields.
crime_data_2010_2019 = pd.read_csv('../data/primary/crime_data_2010_2019.csv', dtype=str)
crime_data_2020_present = pd.read_csv('../data/primary/crime_data_2020_present.csv', dtype=str)

# Stack the two periods and renumber the rows; the index itself is not written.
crime_data = pd.concat([crime_data_2010_2019, crime_data_2020_present], ignore_index=True)
crime_data.to_csv('../data/primary/crime_data.csv', index=False)

In [None]:
# Load Data to HDFS.
# check_call raises CalledProcessError when the script exits with a non-zero
# status, so a failed upload stops the notebook instead of being ignored.
# On success it still returns 0, so the displayed cell value is unchanged.
sp.check_call(['bash', '../scripts/load_data_to_hdfs.sh'])

In [2]:
# Start a Spark Session (getOrCreate returns the existing session if there is one)
sc = (
    SparkSession.builder
    .appName("Standard Query")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/06 18:17:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/06 18:17:55 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# Crime Data Schema
# Column names in schema order. Every field is declared as a string here.
crime_data_columns = [
    "DR_NO",
    "Date Rptd",
    "DATE OCC",
    "TIME OCC",
    "AREA",
    "AREA NAME",
    "Rpt Dist No",
    "Part 1-2",
    "Crm Cd",
    "Crm Cd Desc",
    "Mocodes",
    "Vict Age",
    "Vict Sex",
    "Vict Descent",
    "Premis Cd",
    "Premis Desc",
    "Weapon Used Cd",
    "Weapon Desc",
    "Status",
    "Status Desc",
    "Crm Cd 1",
    "Crm Cd 2",
    "Crm Cd 3",
    "Crm Cd 4",
    "LOCATION",
    "Cross Street",
    "LAT",
    "LON",
]

# One string-typed field per column name.
crime_data_schema = StructType(
    [StructField(column_name, StringType()) for column_name in crime_data_columns]
)

# Pattern shared by both date columns
crime_date_pattern = 'MM/dd/yyyy hh:mm:ss a'

# Read the combined CSV from HDFS with the string schema, then convert the
# date, age and coordinate columns to their proper types in one pipeline.
crime_data_df = (
    sc.read.format('csv')
    .options(header='true')
    .schema(crime_data_schema)
    .load("hdfs://okeanos-master:54310/user/data/primary/crime_data.csv")
    .withColumn('Date Rptd', to_date('Date Rptd', crime_date_pattern))
    .withColumn('DATE OCC', to_date('DATE OCC', crime_date_pattern))
    .withColumn('Vict Age', col('Vict Age').cast('int'))
    .withColumn('LAT', col('LAT').cast('double'))
    .withColumn('LON', col('LON').cast('double'))
)

# Report the number of rows and the column types after the casts.
# count() triggers a Spark job over the whole dataset.
rows = crime_data_df.count()
print(f"Crime Data Total Rows : {rows}")
# dtypes lists every column, so the label describes the whole list.
print(f"Column dtypes: {crime_data_df.dtypes}")




Crime Data Total Rows : 2993433
Date Rptd: [('DR_NO', 'string'), ('Date Rptd', 'date'), ('DATE OCC', 'date'), ('TIME OCC', 'string'), ('AREA', 'string'), ('AREA NAME', 'string'), ('Rpt Dist No', 'string'), ('Part 1-2', 'string'), ('Crm Cd', 'string'), ('Crm Cd Desc', 'string'), ('Mocodes', 'string'), ('Vict Age', 'int'), ('Vict Sex', 'string'), ('Vict Descent', 'string'), ('Premis Cd', 'string'), ('Premis Desc', 'string'), ('Weapon Used Cd', 'string'), ('Weapon Desc', 'string'), ('Status', 'string'), ('Status Desc', 'string'), ('Crm Cd 1', 'string'), ('Crm Cd 2', 'string'), ('Crm Cd 3', 'string'), ('Crm Cd 4', 'string'), ('LOCATION', 'string'), ('Cross Street', 'string'), ('LAT', 'double'), ('LON', 'double')]


                                                                                

In [8]:
# ---- QUERY 1 | DATAFRAME API-----
# For every year, the months with the three highest crime counts,
# based on the date each crime was reported.

# Keep only the report date column
crime_data_date = crime_data_df.select('Date Rptd')

# Extract year and month from the 'Date Rptd' column
crime_data_year_month = crime_data_date.withColumn('Year', year('Date Rptd')) \
                                       .withColumn('Month', month('Date Rptd'))

# Calculate counts for each year and month
counts = crime_data_year_month.groupBy('Year', 'Month').agg(count('*').alias('crimetotal'))

# Within each year, order the months by descending crime count
partitioned = Window.partitionBy('Year').orderBy(counts['crimetotal'].desc())

# Add a rank column to the DataFrame. dense_rank gives tied months the same
# rank, so a year with ties can keep more than three rows.
ranked_df = counts.withColumn('rnk', dense_rank().over(partitioned))

# Filter the top 3 ranks for each year and sort explicitly: without a final
# orderBy the row order of the result is not guaranteed. Month breaks ties
# between equally ranked rows so the output is deterministic.
top3_df = ranked_df.filter('rnk <= 3').orderBy('Year', 'rnk', 'Month')

top3 = top3_df.withColumnRenamed('rnk', '#')

# Show the results
top3.show(50)



+----+-----+----------+---+
|Year|Month|crimetotal|  #|
+----+-----+----------+---+
|2010|    3|     17595|  1|
|2010|    7|     17520|  2|
|2010|    5|     17338|  3|
|2011|    8|     17139|  1|
|2011|    5|     17050|  2|
|2011|    3|     16951|  3|
|2012|    8|     17696|  1|
|2012|   10|     17477|  2|
|2012|    5|     17391|  3|
|2013|    8|     17329|  1|
|2013|    7|     16714|  2|
|2013|    5|     16671|  3|
|2014|    7|     14059|  1|
|2014|   10|     14031|  2|
|2014|    9|     13799|  3|
|2015|    8|     18951|  1|
|2015|   10|     18916|  2|
|2015|    7|     18528|  3|
|2016|    8|     19779|  1|
|2016|   10|     19615|  2|
|2016|    7|     19262|  3|
|2017|   10|     20400|  1|
|2017|    8|     20086|  2|
|2017|    7|     19997|  3|
|2018|    5|     20248|  1|
|2018|    7|     19972|  2|
|2018|   10|     19814|  3|
|2019|    7|     19338|  1|
|2019|    8|     19074|  2|
|2019|    3|     18932|  3|
|2020|    1|     18488|  1|
|2020|    2|     17436|  2|
|2020|    7|     172

                                                                                

In [None]:
# # Query 1
# # query_1 = "SELECT 'Date Rptd' as date_rptd from "
# crime_data_df.createOrReplaceTempView("crime_data")
# crime_data_query_1 = sc.sql("select * from crime_data limit 100")
# crime_data_df.dtypes


In [None]:
# df = pd.read_csv('../data/primary/crime_data.csv', nrows=1000)
# df.dtypes