In [57]:
import os
%load_ext autoreload
%autoreload 2
import pandas as pd
import pyspark
import json
import pyarrow
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

# Point PySpark at the local Spark/Hadoop distribution and the JDK.
# NOTE(review): these are machine-specific absolute paths.
os.environ.update({
    'SPARK_HOME': 'C:/Users/saul2/Spark_DF/spark-3.5.5-bin-hadoop3',
    'HADOOP_HOME': 'C:/Users/saul2/Spark_DF/spark-3.5.5-bin-hadoop3',
    'JAVA_HOME': 'C:/Program Files/Java/jdk1.8.0_202',
})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [163]:
# Alternative input (disabled): 'Students_Grading_Dataset.csv'
csv_file = 'C:/Users/saul2/OneDrive/Desktop/PastProjects/SQL_Query_Practice/Student_Test_Performance/StudentsPerformance.csv'

# Read the CSV using its first row as column names. No schema is supplied here;
# the score columns are cast to integers in a later cell.
df = spark.read.csv(csv_file, header=True)

# Expose the raw (not yet cleaned) frame to Spark SQL under the name "Students".
df.createOrReplaceTempView("Students")

In [165]:
# Data Cleaning
# Preview the table as loaded, before any cleaning step (first 20 rows).
df.show(n=20)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [167]:
# Change the type of the three score columns to integer.
# Before casting, each score column was checked for fractional values with
#   df.filter((col("<score column>") % 1 != 0)).show()
# That check returned no rows, so the columns are converted to int below.
for score_column in ("math score", "reading score", "writing score"):
    df = df.withColumn(score_column, col(score_column).cast(IntegerType()))
#df.printSchema()

In [169]:
# Look for missing data: for every column, display the rows whose value is
# null or is empty once surrounding whitespace is trimmed.
for column_name in df.columns:
    current = col(column_name)
    is_missing = current.isNull() | (trim(current) == "")
    df.filter(is_missing).show()

# Result -- No rows came back for this file, so nothing has to be changed
#df.show()

+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+

+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+

+------+--------------+-------

In [171]:
# Column-name formatting: before looping over the columns in later cells,
# remove any leading/trailing whitespace from every column name.
# toDF() reassigns all column names positionally in one step.
df = df.toDF(*[name.strip() for name in df.columns])

In [173]:
# Capitalize the words in every string-typed column by applying initcap to it.
string_columns = [name for name, dtype in df.dtypes if dtype == 'string']
for column in string_columns:
    df = df.withColumn(column, initcap(col(column)))
# df.show()

In [183]:
# Some values use '/' as a separator, which looks like a special case.
# Print the distinct values of each string column to find any other
# special cases that the 20-row preview does not reveal.
for column, column_type in df.dtypes:
    if column_type != 'string':
        continue
    distinct_values = df.select(column).distinct()
    distinct_values.show(truncate=False)

# Result
# 1. 'Free/reduced' is the special case
# 2. The parental level of education values contain apostrophes

+------+
|gender|
+------+
|Female|
|Male  |
+------+

+--------------+
|race/ethnicity|
+--------------+
|Group A       |
|Group E       |
|Group D       |
|Group B       |
|Group C       |
+--------------+

+---------------------------+
|parental level of education|
+---------------------------+
|High School                |
|Bachelors Degree           |
|Some High School           |
|Associates Degree          |
|Masters Degree             |
|Some College               |
+---------------------------+

+------------+
|Lunch       |
+------------+
|Standard    |
|Free/Reduced|
+------------+

+-----------------------+
|test preparation course|
+-----------------------+
|None                   |
|Completed              |
+-----------------------+



In [175]:
# Fix the 'Free/reduced' case: split the value on '/', apply initcap to each
# piece, then join the pieces back together with '/'.
# The result is stored under the name "Lunch" (capital L), which is how the
# column appears in the outputs that follow.
capitalized_parts = expr("transform(split(Lunch, '/') , x -> initcap(x))")
df = df.withColumn("Lunch", concat_ws("/", capitalized_parts))

In [179]:
# Remove the apostrophes from the education labels to make them more readable.
education_column = "parental level of education"
df = df.withColumn(education_column, regexp_replace(education_column, "'", ""))

In [181]:
# Preview the first 20 rows to confirm the value clean-up so far.
df.show(n=20)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       Lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|Female|       Group B|           Bachelors Degree|    Standard|                   None|        72|           72|           74|
|Female|       Group C|               Some College|    Standard|              Completed|        69|           90|           88|
|Female|       Group B|             Masters Degree|    Standard|                   None|        90|           95|           93|
|  Male|       Group A|          Associates Degree|Free/Reduced|                   None|        47|           57|           44|
|  Male|       Group C|               Some College|    Standard|                   None|        76|     

In [185]:
# Start checking if inputted scores are valid.
# For every integer column, report the column name and display the offending
# rows when any value is less than 0 or greater than 100.
for column, columnType in df.dtypes:
    if columnType != 'int':
        continue
    out_of_range = df.filter((col(column) < 0) | (col(column) > 100))
    if out_of_range.count() > 0:
        print("Column: " + column)
        print("Out of range values")
        # Show the invalid rows themselves; the header alone gives the reader
        # nothing to act on.
        out_of_range.show()
# Result -- There doesn't seem to be any wrong inputs

In [187]:
# Duplicate check.
# The data has no identifier such as student_id that marks a unique person,
# so group on every column and keep the groups that occur more than once.
group_counts = df.groupBy(df.columns).count()
duplicate_rows = group_counts.filter(col("count") > 1)
duplicate_rows.show()

# Result - No duplicate rows were found, so nothing needs to be removed

+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+-----+
|gender|race/ethnicity|parental level of education|Lunch|test preparation course|math score|reading score|writing score|count|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+-----+
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+-----+



In [189]:
# The values have been checked; now clean up the column names.
# Replace spaces with underscores and capitalize each word
# (for example "math score" becomes "Math_Score").
formatted_names = [name.lower().replace(" ", "_").title() for name in df.columns]
df = df.toDF(*formatted_names)
df.show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|Gender|Race/Ethnicity|Parental_Level_Of_Education|       Lunch|Test_Preparation_Course|Math_Score|Reading_Score|Writing_Score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|Female|       Group B|           Bachelors Degree|    Standard|                   None|        72|           72|           74|
|Female|       Group C|               Some College|    Standard|              Completed|        69|           90|           88|
|Female|       Group B|             Masters Degree|    Standard|                   None|        90|           95|           93|
|  Male|       Group A|          Associates Degree|Free/Reduced|                   None|        47|           57|           44|
|  Male|       Group C|               Some College|    Standard|                   None|        76|     

In [191]:
# The '/' in "Race/Ethnicity" is not covered by the space-to-underscore step,
# so rename that column to "Race_Ethnicity" explicitly.
df = df.withColumnRenamed(existing='Race/Ethnicity', new='Race_Ethnicity')

In [193]:
# Preview the first 20 rows with the final column names.
df.show(n=20)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|Gender|Race_Ethnicity|Parental_Level_Of_Education|       Lunch|Test_Preparation_Course|Math_Score|Reading_Score|Writing_Score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|Female|       Group B|           Bachelors Degree|    Standard|                   None|        72|           72|           74|
|Female|       Group C|               Some College|    Standard|              Completed|        69|           90|           88|
|Female|       Group B|             Masters Degree|    Standard|                   None|        90|           95|           93|
|  Male|       Group A|          Associates Degree|Free/Reduced|                   None|        47|           57|           44|
|  Male|       Group C|               Some College|    Standard|                   None|        76|     

In [197]:
# Register the cleaned frame as a temp view so it can be queried with Spark SQL
# under the name "Student_Performance".
df.createOrReplaceTempView(name="Student_Performance")

In [209]:
# Have the notebook create the output directory itself; otherwise it may not
# have the privilege to write files into it.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race. It also fails immediately if 'results' exists but is not a directory.
output_dir = 'results'
os.makedirs(output_dir, exist_ok=True)

In [205]:
# Export the clean data: convert the Spark frame to pandas once, then write the
# same frame to Parquet, CSV and line-delimited JSON inside `output_dir`.
pandas_df = df.toPandas()
exports = [
    ('Student_Performance.parquet', lambda target: pandas_df.to_parquet(target, engine='pyarrow')),
    ('Student_Performance.csv', lambda target: pandas_df.to_csv(target, index=False)),
    ('Student_Performance.json', lambda target: pandas_df.to_json(target, orient='records', lines=True)),
]
for file_name, write in exports:
    path = os.path.join(output_dir, file_name)
    write(path)