In [1]:
# Install PySpark into the kernel's own environment (%pip rather than !pip),
# pinned to a fixed release so that re-runs are reproducible.
%pip install -q pyspark==3.5.1

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=235795671d1386ec564b6eed65d3242dda88a1c025763043b423f6dda7a9bf4f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, count, col
import matplotlib.pyplot as plt
from pyspark.sql.functions import to_date, date_format
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the CSV using its first row as the header. No schema is supplied and
# schema inference is not enabled, so every column is loaded as a string.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/content/School_Updated.csv")
)

# Inspect the column types, first as (name, type) pairs, then as a tree.
print(df.dtypes)
df.printSchema()

[('School ID', 'string'), ('District Name', 'string'), ('School Name', 'string'), ('Organization Type', 'string'), ('Organization Code', 'string'), ('Address', 'string'), ('Town', 'string'), ('Zipcode', 'string'), ('Phone', 'string'), ('Student Open Date', 'string'), ('State', 'string'), ('geometry', 'string')]
root
 |-- School ID: string (nullable = true)
 |-- District Name: string (nullable = true)
 |-- School Name: string (nullable = true)
 |-- Organization Type: string (nullable = true)
 |-- Organization Code: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Student Open Date: string (nullable = true)
 |-- State: string (nullable = true)
 |-- geometry: string (nullable = true)



In [4]:
# Parse the raw "Student Open Date" text into a proper date column, then
# derive the calendar year from the parsed value.
df = (
    df
    .withColumn("Student_Open_Date", to_date(col("Student Open Date"), "MM-dd-yyyy"))
    .withColumn("Open_Year", year(col("Student_Open_Date")))
)

# Preview the DataFrame with the two new columns appended.
df.show()

+---------+--------------------+--------------------+--------------------+-----------------+--------------------+----------+-------+------------+-----------------+-----+--------------------+-----------------+---------+
|School ID|       District Name|         School Name|   Organization Type|Organization Code|             Address|      Town|Zipcode|       Phone|Student Open Date|State|            geometry|Student_Open_Date|Open_Year|
+---------+--------------------+--------------------+--------------------+-----------------+--------------------+----------+-------+------------+-----------------+-----+--------------------+-----------------+---------+
|        0|Vernon School Dis...| Skinner Road School|      Public Schools|          1461011|      90 Skinner Rd.|    Vernon|   6066|860-870-6180|       01-07-1984|   CT|POINT (-72.491502...|       1984-01-07|     1984|
|        1|Department of Soc...|Department of Soc...|      State Agencies|          3320015| 25 Sigourney Street|  Hartford|

In [5]:
# Print the schema again to check the types of the columns added in the previous cell.
df.printSchema()

root
 |-- School ID: string (nullable = true)
 |-- District Name: string (nullable = true)
 |-- School Name: string (nullable = true)
 |-- Organization Type: string (nullable = true)
 |-- Organization Code: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Student Open Date: string (nullable = true)
 |-- State: string (nullable = true)
 |-- geometry: string (nullable = true)
 |-- Student_Open_Date: date (nullable = true)
 |-- Open_Year: integer (nullable = true)



In [6]:
# Remove the original text version of the open date from the DataFrame.
raw_date_column = "Student Open Date"
df = df.drop(raw_date_column)

# Inspect the resulting schema, then preview the remaining columns.
df.printSchema()
df.show()

root
 |-- School ID: string (nullable = true)
 |-- District Name: string (nullable = true)
 |-- School Name: string (nullable = true)
 |-- Organization Type: string (nullable = true)
 |-- Organization Code: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- State: string (nullable = true)
 |-- geometry: string (nullable = true)
 |-- Student_Open_Date: date (nullable = true)
 |-- Open_Year: integer (nullable = true)

+---------+--------------------+--------------------+--------------------+-----------------+--------------------+----------+-------+------------+-----+--------------------+-----------------+---------+
|School ID|       District Name|         School Name|   Organization Type|Organization Code|             Address|      Town|Zipcode|       Phone|State|            geometry|Student_Open_Date|Open_Year|
+---------+--------------------+----------------

In [7]:
# Count the non-null school names within each town.
schools_per_town = (
    df
    .groupBy("Town")
    .agg(count(col("School Name")).alias("Number_of_Schools"))
)

# Display the per-town counts (show() prints the first 20 rows by default).
schools_per_town.show()

+-------------------+-----------------+
|               Town|Number_of_Schools|
+-------------------+-----------------+
|        Middlefield|                2|
|         Plainfield|                4|
|      Old Greenwich|                1|
|          Bethlehem|                1|
|   Stafford Springs|                4|
|             Monroe|                6|
|         Litchfield|                3|
|         Brookfield|                5|
|North Grosvenordale|                3|
|           Sterling|                2|
|             Bolton|                3|
|      Windsor Locks|                5|
|         Woodbridge|                4|
|            Wolcott|                6|
|           Cromwell|                5|
|     North Franklin|                2|
|        New Milford|                6|
|           Guilford|                7|
|         Manchester|               13|
|          Thomaston|                4|
+-------------------+-----------------+
only showing top 20 rows



In [35]:
# Keep only the rows whose organization type is exactly "Public Schools".
is_public_school = col("Organization Type") == "Public Schools"
public_schools_df = df.where(is_public_school)

# Preview the filtered rows.
public_schools_df.show()

+---------+--------------------+--------------------+-----------------+-----------------+--------------------+-------------+-------+------------+-----+--------------------+-----------------+---------+
|School ID|       District Name|         School Name|Organization Type|Organization Code|             Address|         Town|Zipcode|       Phone|State|            geometry|Student_Open_Date|Open_Year|
+---------+--------------------+--------------------+-----------------+-----------------+--------------------+-------------+-------+------------+-----+--------------------+-----------------+---------+
|        0|Vernon School Dis...| Skinner Road School|   Public Schools|          1461011|      90 Skinner Rd.|       Vernon|   6066|860-870-6180|   CT|POINT (-72.491502...|       1984-01-07|     1984|
|        2|Norwalk School Di...|West Rocks Middle...|   Public Schools|          1035411|  81 West Rocks Road|      Norwalk|   6851|203-899-2970|   CT|POINT (-73.416113...|       1984-01-07|     1

In [None]:
# Export the DataFrame as a single CSV file and download it from Colab.
#
# DataFrameWriter.csv() creates a *directory* of part files at the given path,
# so passing that same path to files.download() hands it a directory rather
# than a CSV file. The part file is therefore written to a scratch directory
# and then moved to the requested file name.
import glob
import os
import shutil

from google.colab import files

output_csv = "Health_Updated(1).csv"
parts_dir = output_csv + ".parts"

# An earlier Spark write straight to this path would have left a directory
# under the target name; clear it so the move below produces a plain file.
if os.path.isdir(output_csv):
    shutil.rmtree(output_csv)

# coalesce(1) collapses the data into one partition, hence one part file.
# mode("overwrite") keeps the cell re-runnable; the default save mode raises
# an error when the output path already exists.
df.coalesce(1).write.mode("overwrite").csv(parts_dir, header=True)

# glob.escape guards against glob metacharacters in the directory name.
part_files = glob.glob(os.path.join(glob.escape(parts_dir), "part-*.csv"))
if len(part_files) != 1:
    raise RuntimeError(
        f"Expected exactly one part file in {parts_dir!r}, found {len(part_files)}"
    )
shutil.move(part_files[0], output_csv)
shutil.rmtree(parts_dir)

# Download the CSV file to the system
files.download(output_csv)