# Advanced Database Systems - NTUA - 2023


## Project Scope

## 

### Contributors

Dimitris Vasios el19404

Thodoris Mexis 

### Script to enable the cluster
**Cluster Specification**

Namenode

Datanodes

In [14]:
# Library Imports
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, DateType, DoubleType, TimestampType, TimestampNTZType
from pyspark.sql.functions import col, count, to_timestamp
import subprocess as sp
import pandas as pd

In [None]:
# Download necessary data for the project
sp.call(['bash', '../scripts/import_data.sh'])

In [None]:
# Start Cluster
sp.call(['bash', '../scripts/cluster_initiate.sh'])

In [2]:
# Combine Primary Data into one csv
crime_data_2010_2019 = pd.read_csv('../data/primary/crime_data_2010_2019.csv')
crime_data_2020_present = pd.read_csv('../data/primary/crime_data_2020_present.csv')
crime_data = pd.concat([crime_data_2010_2019, crime_data_2020_present], ignore_index=True)
crime_data.to_csv('../data/primary/crime_data.csv')

In [6]:
# Load Data to HDFS
sp.call(['bash', '../scripts/load_data_to_hdfs.sh'])

put: `/user/data/secondary/revgecoding.csv': File exists
put: `/user/data/secondary/LAPD_Police_Stations.csv': File exists
put: `/user/data/secondary/median_household_incomes/LA_income_2015.csv': File exists
put: `/user/data/secondary/median_household_incomes/LA_income_2021.csv': File exists
put: `/user/data/secondary/median_household_incomes/LA_income_2019.csv': File exists
put: `/user/data/secondary/median_household_incomes/LA_income_2017.csv': File exists
put: `/user/data/text.txt': File exists
put: `/user/data/primary/crime_data_2020_present.csv': File exists
put: `/user/data/primary/crime_data_2010_2019.csv': File exists


1

In [3]:
# Start a Spark Session
sc = SparkSession \
    .builder \
    .appName("Standard Query") \
    .getOrCreate() 

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/03 23:03:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/03 23:03:36 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [30]:
# Crime Data Schema
crime_data_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

crime_data_df = sc.read.format('csv') \
    .options(header='true') \
    .schema(crime_data_schema) \
    .load("hdfs://okeanos-master:54310/user/data/primary/crime_data_2010_2019.csv")

crime_data_df = crime_data_df.withColumn("detimestamp",to_timestamp("Date Rptd", 'yyyy/MM/dd HH:mm:ss'))
# df = df.withColumn("timestamp", to_timestamp("timestamp_string", "yyyy-MM-dd HH:mm:ss"))

rows = crime_data_df.count()
print(f"Crime Data Total Rows : {rows}")
print(f"Date Rptd: {crime_data_df.dtypes}")




Crime Data Total Rows : 2135775
Date Rptd: [('DR_NO', 'int'), ('Date Rptd', 'string'), ('DATE OCC', 'string'), ('TIME OCC', 'int'), ('AREA', 'int'), ('AREA NAME', 'string'), ('Rpt Dist No', 'int'), ('Part 1-2', 'int'), ('Crm Cd', 'int'), ('Crm Cd Desc', 'string'), ('Mocodes', 'string'), ('Vict Age', 'int'), ('Vict Sex', 'string'), ('Vict Descent', 'string'), ('Premis Cd', 'int'), ('Premis Desc', 'string'), ('Weapon Used Cd', 'int'), ('Weapon Desc', 'string'), ('Status', 'string'), ('Status Desc', 'string'), ('Crm Cd 1', 'int'), ('Crm Cd 2', 'int'), ('Crm Cd 3', 'int'), ('Crm Cd 4', 'int'), ('LOCATION', 'string'), ('Cross Street', 'string'), ('LAT', 'double'), ('LON', 'double'), ('timestamp', 'timestamp')]


                                                                                

In [31]:
# Query 1
# query_1 = "SELECT 'Date Rptd' as date_rptd from "
crime_data_df.createOrReplaceTempView("crime_data")
crime_data_query_1 = sc.sql("select * from crime_data limit 100")
crime_data_df.dtypes


[('DR_NO', 'int'),
 ('Date Rptd', 'string'),
 ('DATE OCC', 'string'),
 ('TIME OCC', 'int'),
 ('AREA', 'int'),
 ('AREA NAME', 'string'),
 ('Rpt Dist No', 'int'),
 ('Part 1-2', 'int'),
 ('Crm Cd', 'int'),
 ('Crm Cd Desc', 'string'),
 ('Mocodes', 'string'),
 ('Vict Age', 'int'),
 ('Vict Sex', 'string'),
 ('Vict Descent', 'string'),
 ('Premis Cd', 'int'),
 ('Premis Desc', 'string'),
 ('Weapon Used Cd', 'int'),
 ('Weapon Desc', 'string'),
 ('Status', 'string'),
 ('Status Desc', 'string'),
 ('Crm Cd 1', 'int'),
 ('Crm Cd 2', 'int'),
 ('Crm Cd 3', 'int'),
 ('Crm Cd 4', 'int'),
 ('LOCATION', 'string'),
 ('Cross Street', 'string'),
 ('LAT', 'double'),
 ('LON', 'double'),
 ('timestamp', 'timestamp')]

In [43]:
df = pd.read_csv('../data/primary/crime_data.csv', nrows=1000)
df.dtypes

Unnamed: 0          int64
DR_NO               int64
Date Rptd          object
DATE OCC           object
TIME OCC            int64
AREA              float64
AREA NAME          object
Rpt Dist No         int64
Part 1-2            int64
Crm Cd              int64
Crm Cd Desc        object
Mocodes            object
Vict Age            int64
Vict Sex           object
Vict Descent       object
Premis Cd         float64
Premis Desc        object
Weapon Used Cd    float64
Weapon Desc        object
Status             object
Status Desc        object
Crm Cd 1          float64
Crm Cd 2          float64
Crm Cd 3          float64
Crm Cd 4          float64
LOCATION           object
Cross Street       object
LAT               float64
LON               float64
AREA              float64
dtype: object