In [1]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (named "EVPop"); getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("EVPop")
    .getOrCreate()
)

In [2]:
# Read the EV population CSV into a Spark DataFrame.
#   header=True      -> the first row supplies the column names
#   inferSchema=True -> Spark scans the data to pick each column's type
#   delimiter=','    -> fields are comma-separated
# Both flags are passed as real booleans so the option types are consistent.
dfEV = (
    spark.read
    .options(header=True, inferSchema=True, delimiter=',')
    .csv('EVPopulation.csv')
)
# Preview the leading rows; long cell values are truncated in the printed table.
dfEV.show()

+----------+-----------+--------------+-----+-----------+----------+-------------+---------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|VIN (1-10)|     County|          City|State|Postal Code|Model Year|         Make|    Model|Electric Vehicle Type|Clean Alternative Fuel Vehicle (CAFV) Eligibility|Electric Range|Base MSRP|Legislative District|DOL Vehicle ID|    Vehicle Location|    Electric Utility|2020 Census Tract|
+----------+-----------+--------------+-----+-----------+----------+-------------+---------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|WDC0G5EB7K|     Louisa|       Bumpass|   VA|      23024|      2019|MERCEDES-BENZ|GLC-CLASS| Plug-in Hybrid El...|                            

In [3]:
# Print the column names, inferred types and nullability of the loaded frame.
dfEV.printSchema()

root
 |-- VIN (1-10): string (nullable = true)
 |-- County: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: integer (nullable = true)
 |-- Model Year: integer (nullable = true)
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Electric Vehicle Type: string (nullable = true)
 |-- Clean Alternative Fuel Vehicle (CAFV) Eligibility: string (nullable = true)
 |-- Electric Range: integer (nullable = true)
 |-- Base MSRP: integer (nullable = true)
 |-- Legislative District: integer (nullable = true)
 |-- DOL Vehicle ID: integer (nullable = true)
 |-- Vehicle Location: string (nullable = true)
 |-- Electric Utility: string (nullable = true)
 |-- 2020 Census Tract: long (nullable = true)



In [5]:
# Project a single column and print its first rows.
(
    dfEV
    .select('County')
    .show()
)

+-----------+
|     County|
+-----------+
|     Louisa|
|   Kittitas|
|     Chelan|
|  Snohomish|
|   Thurston|
|     Kitsap|
|       King|
|     Skagit|
|     Kitsap|
|  Snohomish|
|     Yakima|
|  Snohomish|
|Walla Walla|
|     Kitsap|
|     Kitsap|
|Walla Walla|
|       King|
|        Lee|
|  Snohomish|
|    Stevens|
+-----------+
only showing top 20 rows



In [6]:
# Project two columns (passed as separate arguments) and print the first rows.
(
    dfEV
    .select('County', 'City')
    .show()
)

+-----------+--------------+
|     County|          City|
+-----------+--------------+
|     Louisa|       Bumpass|
|   Kittitas|      Cle Elum|
|     Chelan|        Chelan|
|  Snohomish|     Snohomish|
|   Thurston|      Tumwater|
|     Kitsap|      Kingston|
|       King|       Seattle|
|     Skagit|  Mount Vernon|
|     Kitsap|  Port Orchard|
|  Snohomish|       Bothell|
|     Yakima|        Tieton|
|  Snohomish|       Edmonds|
|Walla Walla|   Walla Walla|
|     Kitsap|  Port Orchard|
|     Kitsap|  Port Orchard|
|Walla Walla|   Walla Walla|
|       King|        Vashon|
|        Lee|Smiths Station|
|  Snohomish|       Edmonds|
|    Stevens|          Rice|
+-----------+--------------+
only showing top 20 rows



In [7]:
# Value counts: number of rows per Make, largest group first.
(
    dfEV
    .groupBy('Make')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+----------+-----+
|      Make|count|
+----------+-----+
|     TESLA|52674|
|    NISSAN|12839|
| CHEVROLET|10273|
|      FORD| 6072|
|       BMW| 4756|
|       KIA| 4561|
|    TOYOTA| 4445|
|VOLKSWAGEN| 2719|
|     VOLVO| 2535|
|      AUDI| 2374|
|  CHRYSLER| 1826|
|   HYUNDAI| 1538|
|      JEEP| 1176|
|    RIVIAN|  992|
|   PORSCHE|  821|
|      FIAT|  809|
|     HONDA|  788|
|      MINI|  671|
|  POLESTAR|  585|
|MITSUBISHI|  581|
+----------+-----+
only showing top 20 rows



In [11]:
# Value counts: number of rows per State, largest group first.
(
    dfEV
    .groupBy('State')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+-----+------+
|State| count|
+-----+------+
|   WA|114312|
|   CA|    74|
|   VA|    38|
|   MD|    27|
|   TX|    16|
|   CO|     9|
|   NC|     9|
|   AZ|     7|
|   GA|     7|
|   NV|     7|
|   SC|     6|
|   FL|     6|
|   IL|     6|
|   CT|     6|
|   DC|     5|
|   NY|     5|
|   LA|     4|
|   OR|     4|
|   NJ|     4|
|   NE|     4|
+-----+------+
only showing top 20 rows



In [15]:
# Keep only the rows whose State is 'WA' and store them in a new DataFrame.
df = dfEV.where(dfEV['State'] == 'WA')

# Sanity check: after filtering, 'WA' should be the only State left.
(
    df
    .groupBy('State')
    .count()
    .show()
)

+-----+------+
|State| count|
+-----+------+
|   WA|114312|
+-----+------+

