In [1]:
# Prerequisite (run once in this kernel's environment): %pip install mysql-connector-python

In [2]:
import pyspark
import mysql.connector
from pyspark.sql import SparkSession
import os
# Point this process at the local Hadoop installation.
os.environ['HADOOP_HOME'] = 'C:/hadoop/hadoop-3.3.6'
hadoop_bin = os.path.join(os.environ["HADOOP_HOME"], "bin")

# Append Hadoop's bin directory to PATH only when it is not already listed, so
# re-running this cell does not keep growing PATH with duplicate entries.
# A missing PATH variable is treated as empty instead of raising KeyError.
current_path = os.environ.get("PATH", "")
if hadoop_bin not in current_path.split(os.pathsep):
    os.environ["PATH"] = current_path + os.pathsep + hadoop_bin if current_path else hadoop_bin

In [3]:
# Connect to MySQL
# The password is read from the MYSQL_PASSWORD environment variable so the real
# credential does not have to be stored in the notebook; the previous literal is
# kept as the fallback so an existing local setup keeps working unchanged.
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password=os.environ.get("MYSQL_PASSWORD", "mysql"),
)
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS EV_Population_Data")
print("Database 'EV_Population_Data' created (if it did not exist).")

# Make the new database the default schema for the statements that follow.
cursor.execute("USE EV_Population_Data")

# Create the table 'model_year' if it doesn't exist
create_table_query = """
CREATE TABLE IF NOT EXISTS model_year (
    model_year VARCHAR(4) UNIQUE NOT NULL
)
"""
cursor.execute(create_table_query)
print("Table 'model_year' created.")

# Create the table 'make' if it doesn't exist
create_table = """
CREATE TABLE IF NOT EXISTS make (
    make VARCHAR(20) UNIQUE NOT NULL
)
"""
cursor.execute(create_table)
# Report the table that was actually created (this message used to say 'model_year').
print("Table 'make' created.")

# Create the table 'agg_cities' if it doesn't exist
query = """
CREATE TABLE IF NOT EXISTS agg_cities (
       city VARCHAR(20) UNIQUE NOT NULL,
       electric_cars INT NOT NULL
)
"""
cursor.execute(query)
print("Table 'agg_cities' created")


Database 'EV_Population_Data' created (if it did not exist).
Table 'model_year' created.
Table 'model_year' created.
Table 'agg_cities' created


In [4]:
# Build the Spark session for this notebook, or reuse one that already exists.
# The application is named "model_year" and runs against the local[*] master.
spark = (
    SparkSession.builder
    .appName("model_year")
    .master("local[*]")
    .getOrCreate()
)

In [5]:
# Read the Parquet file
# The location can be overridden through the EV_RAW_DATA_PATH environment variable
# so the notebook is not tied to one machine; the default is the original path.
path = os.environ.get(
    "EV_RAW_DATA_PATH",
    "C:/Users/boris/Documents/DataEngenier/Spark/final_project/output/raw-data",
)

df = spark.read.parquet(path)

# Rename "Model Year" to 'model_year' so the SQL below can reference the column
# without having to quote the space in its name.
model_year = df.withColumnRenamed("Model Year", "model_year")

# Expose the renamed DataFrame to Spark SQL as the view 'electric_cars'.
model_year.createOrReplaceTempView("electric_cars")

# Distinct model years after 1990, cast to strings and sorted ascending.
model_year_unique = spark.sql("SELECT DISTINCT CAST(model_year AS STRING) FROM electric_cars WHERE model_year > 1990 ORDER BY model_year")

model_year_unique.show(5)


+----------+
|model_year|
+----------+
|      1997|
|      1998|
|      1999|
|      2000|
|      2002|
+----------+
only showing top 5 rows



In [6]:
# Distinct values of the Make column in the 'electric_cars' view; preview five rows.
distinct_make_sql = "SELECT DISTINCT Make FROM electric_cars"
make_unique = spark.sql(distinct_make_sql)
make_unique.show(n=5)

+--------------------+
|                Make|
+--------------------+
|WHEEGO ELECTRIC CARS|
|             PORSCHE|
|             HYUNDAI|
|                FIAT|
|               LUCID|
+--------------------+
only showing top 5 rows



In [7]:
# Count the rows of the 'electric_cars' view per City; preview five rows.
agg_cities = spark.sql(
    "SELECT City, COUNT(*) AS electric_cars "
    "FROM electric_cars "
    "GROUP BY City"
)
agg_cities.show(n=5)

+---------+-------------+
|     City|electric_cars|
+---------+-------------+
|   Bingen|            3|
|Wahkiacus|            2|
|  Hanover|            1|
| Tumwater|          539|
|  Edmonds|         1590|
+---------+-------------+
only showing top 5 rows



In [10]:

# Load the three Spark result sets into their MySQL tables, then release every resource.
cursor = conn.cursor()

try:
    model_year_list = [row[0] for row in model_year_unique.collect()]  # Convert PySpark DataFrame to Python list

    # Insert the unique model years. INSERT IGNORE skips values that already exist
    # in the UNIQUE column, and executemany sends the rows as one batch instead of
    # issuing one execute() per value. The comprehension variable is deliberately
    # not named `model_year`, so the DataFrame of that name is not overwritten.
    insert_query = "INSERT IGNORE INTO model_year (model_year) VALUES (%s)"
    cursor.executemany(insert_query, [(year,) for year in model_year_list])

    # Commit the changes to the database
    conn.commit()
    print("Inserted unique model year values into 'model_year' table.")

    make_list = [row[0] for row in make_unique.collect()]  # Convert PySpark DataFrame to Python list

    # Insert the unique make values
    insert_query = "INSERT IGNORE INTO make (make) VALUES (%s)"
    cursor.executemany(insert_query, [(make_name,) for make_name in make_list])

    # Commit the changes to the database
    conn.commit()
    print("Inserted unique make values into 'make' table.")

    # NOTE(review): str() turns a NULL City into the text 'None' - confirm the
    # source data has no NULL cities, or filter them out in the aggregation query.
    agg_cities_list = [(str(row[0]), int(row[1])) for row in agg_cities.collect()]

    # Insert each city with its electric car count.
    # NOTE(review): INSERT IGNORE keeps the stored count for a city that already
    # exists, so re-running against changed data does not refresh electric_cars.
    insert_query = "INSERT IGNORE INTO agg_cities (city, electric_cars) VALUES (%s, %s)"
    cursor.executemany(insert_query, agg_cities_list)

    # Commit the changes to the database
    conn.commit()
    print("Inserted unique city and electric car values into 'agg_cities' table.")
finally:
    # Clean up and close connections, even when one of the inserts above failed.
    cursor.close()
    conn.close()
    spark.stop()

Inserted unique model year values into 'model_year' table.
Inserted unique make values into 'make' table.
Inserted unique city and electric car values into 'agg_cities' table.
