In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start the Spark session used throughout the notebook (an already-running
# session is reused) and limit log output to warnings and above
spark = (
    SparkSession.builder
    .appName("App")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")

## Load CSV File with Header

In [None]:
# Read 1987.csv as comma-delimited text, taking column names from the first row
df = spark.read.options(delimiter=",", header="true").csv("1987.csv")

# Preview the first 15 rows without truncating long cell values
df.show(15, truncate=False)

In [None]:
# Turn the literal string "NA" into a real null in every column.
# One select() builds a single projection; calling withColumn() once per
# column adds a projection to the query plan on every iteration.
df = df.select(
    [
        when(col(name) == "NA", None).otherwise(col(name)).alias(name)
        for name in df.columns
    ]
)

df.show()

In [None]:
from pyspark.sql.functions import col

# Per-column tally of null entries, displayed as a single-row table
df.select(
    [
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

In [None]:
# Show the DataFrame's column names
print(df.columns)

In [None]:
# Print the schema (column names and data types) as a tree
df.printSchema()

# Preprocessing
- eliminate unnecessary variables
- missing and duplicate values
- inspect correlations
- variable transformation
- variable creation

In [None]:
# Names of the columns removed from the DataFrame in this step
columns = [
    "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
    "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay",
    "LateAircraftDelay",
]

# drop() takes one argument per column name, hence the unpacking
df = df.drop(*columns)


In [None]:
# Check which columns remain after the drop in the previous cell
df.printSchema()

In [None]:
# Second batch of column names to remove
columns = ["Year", "TailNum", "TaxiOut", "Cancelled", "CancellationCode"]

# Unpack the list because drop() expects one argument per column name
df = df.drop(*columns)

In [None]:
# Check which columns remain after the second drop
df.printSchema()

## Missing values

In [None]:
# Count the null values in each of the remaining columns
df.select(
    *[
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

Now we check whether NA stands for 0. If the value 0 never appears in the column, that means NA was used in place of 0.

In [None]:
from pyspark.sql import functions as F
# Keep only the rows where ArrDelay equals 0 and count them once:
# every count() call launches a separate Spark job.
filtered_df = df.filter(F.col("ArrDelay") == 0)
zero_rows = filtered_df.count()

# Total number of rows in the DataFrame (later cells reuse total_rows)
total_rows = df.count()

# Report whether the value 0 occurs in ArrDelay at all
if zero_rows > 0:
    print("0 is present in the ArrDelay column " + str(zero_rows) + " times out of " + str(total_rows) + ".")
else:
    print("0 is not present in the ArrDelay column.")

In [None]:
# Share of null entries per column: the null count divided by total_rows
# (a fraction between 0 and 1, not multiplied by 100)
null_percentage = df.select(
    [
        (count(when(col(name).isNull(), name)) / total_rows).alias(name)
        for name in df.columns
    ]
)

# Display the per-column shares
null_percentage.show()

In [None]:
# Remove every row that has a null in any column, then report how many
# rows were lost relative to total_rows
df = df.na.drop()
dropped_rows = total_rows - df.count()
print("Dropped " + str(dropped_rows) + " rows.")

## Duplicates

In [None]:
# Review the current schema before checking for duplicates
df.printSchema()

In [None]:
# Row count before de-duplication
total_rows = df.count()
# Remove rows that are exact duplicates of another row
df = df.dropDuplicates()

# A lower row count afterwards means duplicates existed (and are now gone)
if total_rows > df.count():
    print("There are duplicates in the DataFrame.")
else:
    print("No duplicates found in the DataFrame.")

## Correlation

## Variable transformation

In [None]:
# Columns left as they are (not cast to integer)
exclude_columns = ['UniqueCarrier', 'Origin', 'Dest']

# Cast every other column to integer in one select(); calling withColumn()
# once per column adds a projection to the query plan on each iteration.
df = df.select(
    [
        col(name) if name in exclude_columns
        else col(name).cast("integer").alias(name)
        for name in df.columns
    ]
)

# Display the first 15 rows to verify the change
df.show(15, truncate=False)



In [None]:
# For each non-integer column, list its distinct values and then their count
for column in exclude_columns:
    print(f"Distinct values in column '{column}':")
    # collect() brings the distinct rows back to the driver as a list
    distinct_values = df.select(column).distinct().collect()
    for row in distinct_values:
        print(row[column])
    # Blank separator between the value list and the summary line
    print("\n")
    print(f"Number of distinct values: {len(distinct_values)}")

In [None]:
# Confirm the column types after the integer cast
df.printSchema()

## Variable creation

# Modeling

# Validation

## Close the context

In [ ]:
# Shut down the Spark session; cells that use `spark` or `df` will not work after this
spark.stop()