In [20]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.1.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.1                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [2 InRelease gpgv 1,581 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Hit:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic

In [21]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2022-11-04 20:36:03--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.1’


2022-11-04 20:36:03 (5.57 MB/s) - ‘postgresql-42.2.9.jar.1’ saved [914037/914037]



In [82]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PlayersETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [83]:
# Read in data from GCS Buckets
from pyspark import SparkFiles
url="https://storage.googleapis.com/big-data-bowl/players.csv"
spark.sparkContext.addFile(url)
player_data_df = spark.read.csv(SparkFiles.get("players.csv"), sep=",", header=True, inferSchema=True)

# Show DataFrame
player_data_df.show()

+-----+------+------+----------+---------------+----------------+------------------+
|nflId|height|weight| birthDate|    collegeName|officialPosition|       displayName|
+-----+------+------+----------+---------------+----------------+------------------+
|25511|   6-4|   225|1977-08-03|       Michigan|              QB|         Tom Brady|
|28963|   6-5|   240|1982-03-02|      Miami, O.|              QB|Ben Roethlisberger|
|29550|   6-4|   328|1982-01-22|       Arkansas|               T|      Jason Peters|
|29851|   6-2|   225|1983-12-02|     California|              QB|     Aaron Rodgers|
|30078|   6-2|   228|1982-11-24|        Harvard|              QB|  Ryan Fitzpatrick|
|30842|   6-6|   267|1984-05-19|           UCLA|              TE|    Marcedes Lewis|
|30869|   6-7|   330|1981-12-12|Louisiana State|               T|  Andrew Whitworth|
|33084|   6-4|   217|1985-05-17| Boston College|              QB|         Matt Ryan|
|33107|   6-4|   315|1985-08-30|  Virginia Tech|               T|

In [84]:
from pyspark.sql.functions import regexp_replace

#Replace NA values in birthDate with blank values
player_data_df.withColumn('birthDate', regexp_replace('birthDate', 'NA', '')) \
  .show(truncate=False)


+-----+------+------+----------+---------------+----------------+------------------+
|nflId|height|weight|birthDate |collegeName    |officialPosition|displayName       |
+-----+------+------+----------+---------------+----------------+------------------+
|25511|6-4   |225   |1977-08-03|Michigan       |QB              |Tom Brady         |
|28963|6-5   |240   |1982-03-02|Miami, O.      |QB              |Ben Roethlisberger|
|29550|6-4   |328   |1982-01-22|Arkansas       |T               |Jason Peters      |
|29851|6-2   |225   |1983-12-02|California     |QB              |Aaron Rodgers     |
|30078|6-2   |228   |1982-11-24|Harvard        |QB              |Ryan Fitzpatrick  |
|30842|6-6   |267   |1984-05-19|UCLA           |TE              |Marcedes Lewis    |
|30869|6-7   |330   |1981-12-12|Louisiana State|T               |Andrew Whitworth  |
|33084|6-4   |217   |1985-05-17|Boston College |QB              |Matt Ryan         |
|33107|6-4   |315   |1985-08-30|Virginia Tech  |T               |

In [85]:
from pyspark.sql.functions import coalesce, to_date

def to_date_(col, formats=("MM/dd/yyyy", "yyyy-MM-dd")):
    return coalesce(*[to_date(col, f) for f in formats])

player_data_df.withColumn("birthDate", to_date_("birthDate")).show()



+-----+------+------+----------+---------------+----------------+------------------+
|nflId|height|weight| birthDate|    collegeName|officialPosition|       displayName|
+-----+------+------+----------+---------------+----------------+------------------+
|25511|   6-4|   225|1977-08-03|       Michigan|              QB|         Tom Brady|
|28963|   6-5|   240|1982-03-02|      Miami, O.|              QB|Ben Roethlisberger|
|29550|   6-4|   328|1982-01-22|       Arkansas|               T|      Jason Peters|
|29851|   6-2|   225|1983-12-02|     California|              QB|     Aaron Rodgers|
|30078|   6-2|   228|1982-11-24|        Harvard|              QB|  Ryan Fitzpatrick|
|30842|   6-6|   267|1984-05-19|           UCLA|              TE|    Marcedes Lewis|
|30869|   6-7|   330|1981-12-12|Louisiana State|               T|  Andrew Whitworth|
|33084|   6-4|   217|1985-05-17| Boston College|              QB|         Matt Ryan|
|33107|   6-4|   315|1985-08-30|  Virginia Tech|               T|

In [86]:
player_data_df.dtypes

[('nflId', 'int'),
 ('height', 'string'),
 ('weight', 'int'),
 ('birthDate', 'string'),
 ('collegeName', 'string'),
 ('officialPosition', 'string'),
 ('displayName', 'string')]

Postgres Setup

In [87]:
# Store environment variable
from getpass import getpass
password = getpass('Enter database password')

# Configure settings for Cloud SQL
mode = "append"
jdbc_url="jdbc:postgresql://34.72.136.99:5432/big-data-bowl"
config = {"user":"postgres", 
          "password": password, 
          "driver":"org.postgresql.Driver"}


Enter database password··········


In [88]:
# Write DataFrame to players table in Cloud SQL
player_data_df.write.jdbc(url=jdbc_url, table='players', mode=mode, properties=config)