In [286]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,collect_list, to_timestamp, expr, regexp_replace,split, to_timestamp, from_unixtime,date_format,when,concat,lit,hour,minute,to_date
from pyspark.sql.types import StructType, StringType, StructField


In [287]:
spark = SparkSession.builder.appName('bovo').getOrCreate()

In [288]:
spark

In [289]:
schema = StructType([
    StructField("AreaCode", StringType(), False),
    StructField("Date", StringType(), False),
    StructField("Time", StringType(), False),
    StructField("Temperature(°F)", StringType(), False),
    StructField("DewPoint(°F)", StringType(), False),
    StructField("Humidity(%)", StringType(), False),
    StructField("Wind", StringType(), False),
    StructField("WindSpeed(mph)", StringType(), False),
    StructField("WindGust(mph)", StringType(), False),
    StructField("Pressure(in)", StringType(), False),
    StructField("Precip(in)", StringType(), False),
    StructField("Condition", StringType(), False)
    
    
    
])

In [290]:
df_pyspark=spark.read.option('header','true').csv(r'C:\Users\httyd\Desktop\Data\report_2021\*.csv',inferSchema=True)
df_weather=spark.read.option('header','true').csv(r'C:\Users\httyd\Desktop\capstone\2020\*.csv',schema=schema)

In [291]:
df_pyspark.show(100)

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+---------------------------------------+----------------------------------------------+-------------------------------------------------+--------------------------------------------------+------------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+--------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+-----

In [292]:
# Get the list of column names and their corresponding indices
column_indices = [(index, column_name) for index, column_name in enumerate(df_pyspark.columns)]

# Display the column names with their corresponding indices
for index, column_name in column_indices:
    print(f"Column {index}: {column_name}")


Column 0: Year
Column 1: Quarter
Column 2: Month
Column 3: DayofMonth
Column 4: DayOfWeek
Column 5: FlightDate
Column 6: Marketing_Airline_Network
Column 7: Operated_or_Branded_Code_Share_Partners
Column 8: DOT_ID_Marketing_Airline
Column 9: IATA_Code_Marketing_Airline
Column 10: Flight_Number_Marketing_Airline
Column 11: Originally_Scheduled_Code_Share_Airline
Column 12: DOT_ID_Originally_Scheduled_Code_Share_Airline
Column 13: IATA_Code_Originally_Scheduled_Code_Share_Airline
Column 14: Flight_Num_Originally_Scheduled_Code_Share_Airline
Column 15: Operating_Airline 
Column 16: DOT_ID_Operating_Airline
Column 17: IATA_Code_Operating_Airline
Column 18: Tail_Number
Column 19: Flight_Number_Operating_Airline
Column 20: OriginAirportID
Column 21: OriginAirportSeqID
Column 22: OriginCityMarketID
Column 23: Origin
Column 24: OriginCityName
Column 25: OriginState
Column 26: OriginStateFips
Column 27: OriginStateName
Column 28: OriginWac
Column 29: DestAirportID
Column 30: DestAirportSeqID
Co

In [293]:
# Define the range of column indices to drop
start_index = 59  # Starting index (inclusive)
end_index = 110    # Ending index (exclusive)

# Get the list of column names from the DataFrame
column_names = df_pyspark.columns

# Drop the columns within the specified range
columns_to_drop = column_names[start_index:end_index]
df_pyspark = df_pyspark.drop(*columns_to_drop)


In [294]:
# Get the data types of all columns as a list of tuples
column_types = df_pyspark.dtypes

# Iterate over the list of tuples to find the data type of the 'CRSArrTime' column
for column_name, data_type in column_types:
    if column_name == "CRSArrTime":
        print("Data type of 'CRSArrTime' column:", data_type)


Data type of 'CRSArrTime' column: int


In [295]:
# Convert it to string with leading zeros if necessary
df_pyspark = df_pyspark.withColumn("CRSArrTime", col("CRSArrTime").cast("string"))

# Pad the string representation of time with leading zeros if necessary
df_pyspark = df_pyspark.withColumn("CRSArrTime", \
    expr("LPAD(CRSArrTime, 4, '0')"))

# Convert the 'CRSArrTime' column to timestamp format
df_pyspark = df_pyspark.withColumn("CRSArrTime", \
    to_timestamp(col("CRSArrTime"), "HHmm"))

split_col = split(df_pyspark['CRSArrTime'], ' ')
df_pyspark = df_pyspark.withColumn('CRSArrTime', split_col.getItem(1))  # Extract date component


# Show the DataFrame
df_pyspark.show(100)

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+---------------------------------------+----------------------------------------------+-------------------------------------------------+--------------------------------------------------+------------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+--------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+-----

In [296]:
# Add a new column 'rounded_hour' that represents the closest hour
df_pyspark = df_pyspark.withColumn("Newdate", to_date("CRSArrTime")) \
    .withColumn("hour", hour("CRSArrTime")) \
    .withColumn("minute", minute("CRSArrTime")) \
    .withColumn("rounded_hour",
                when(col("minute") >= 30, expr("hour + 1")).otherwise(col("hour"))
                ) \
    .withColumn("rounded_hour", when(col("rounded_hour") == 24, 0).otherwise(col("rounded_hour"))) \
    .withColumn("rounded_hour", concat(
        when(col("rounded_hour") < 10, concat(lit("0"), col("rounded_hour"))).otherwise(col("rounded_hour")),
        lit(":00:00"))) \
    .drop("hour", "minute")

In [297]:
# Show the resulting DataFrame
df_pyspark.show(100)

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+---------------------------------------+----------------------------------------------+-------------------------------------------------+--------------------------------------------------+------------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+--------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+-----

In [298]:
# Get the data types of all columns as a list of tuples
column_types = df_weather.dtypes

# Iterate over the list of tuples to find the data type of the 'CRSArrTime' column
for column_name, data_type in column_types:
    if column_name == "Time":
        print("Data type of 'Time' column:", data_type)


Data type of 'Time' column: string


In [299]:
# Assuming your DataFrame is named df_weather
df_weather = df_weather.withColumn("Time", date_format(to_timestamp("Time", "h:mm a"), "HH:mm"))



In [300]:
split_col = split(df_weather['Temperature(°F)'], '°')
df_weather = df_weather.withColumn('Temperature(°F)', split_col.getItem(0))  # Extract date component
split_col = split(df_weather['DewPoint(°F)'], '°F')
df_weather = df_weather.withColumn('DewPoint(°F)', split_col.getItem(0))  # Extract date component
split_col = split(df_weather['Humidity(%)'], '°%')
df_weather = df_weather.withColumn('Humidity(%)', split_col.getItem(0))  # Extract date component
split_col = split(df_weather['WindSpeed(mph)'], '°mph')
df_weather = df_weather.withColumn('WindSpeed(mph)', split_col.getItem(0))  # Extract date component
split_col = split(df_weather['WindGust(mph)'], '°mph')
df_weather = df_weather.withColumn('WindGust(mph)', split_col.getItem(0))  # Extract date component
split_col = split(df_weather['Pressure(in)'], '°in')
df_weather = df_weather.withColumn('Pressure(in)', split_col.getItem(0))  # Extract date component
split_col = split(df_weather['Precip(in)'], '°in')  # Use backticks to escape the dot
df_weather = df_weather.withColumn('Precip(in)', split_col.getItem(0))  # Extract date component

In [301]:
# Add a new column 'rounded_hour' that represents the closest hour
df_weather = df_weather.withColumn("Newdate", to_date("Time")) \
    .withColumn("hour", hour("Time")) \
    .withColumn("minute", minute("Time")) \
    .withColumn("rounded_hour",
                when(col("minute") >= 30, expr("hour + 1")).otherwise(col("hour"))
                ) \
    .withColumn("rounded_hour", when(col("rounded_hour") == 24, 0).otherwise(col("rounded_hour"))) \
    .withColumn("rounded_hour", concat(
        when(col("rounded_hour") < 10, concat(lit("0"), col("rounded_hour"))).otherwise(col("rounded_hour")),
        lit(":00:00"))) \
    .drop("hour", "minute")

In [302]:
df_weather.show(100)

+--------+----------+-----+---------------+------------+-----------+----+--------------+-------------+------------+----------+--------------------+-------+------------+
|AreaCode|      Date| Time|Temperature(°F)|DewPoint(°F)|Humidity(%)|Wind|WindSpeed(mph)|WindGust(mph)|Pressure(in)|Precip(in)|           Condition|Newdate|rounded_hour|
+--------+----------+-----+---------------+------------+-----------+----+--------------+-------------+------------+----------+--------------------+-------+------------+
|     ATL|2021-05-01|01:52|            61 |         46 |        58 | WNW|            8 |           0 |      28.96 |      0.0 |       Mostly Cloudy|   NULL|    02:00:00|
|     ATL|2021-05-01|02:52|            60 |         46 |        60 | WNW|           12 |           0 |      28.95 |      0.0 |       Mostly Cloudy|   NULL|    03:00:00|
|     ATL|2021-05-01|03:52|            58 |         46 |        65 |  NW|           10 |           0 |      28.95 |      0.0 |       Partly Cloudy|   NULL|

In [303]:
# unique_names = df_pyspark.select("Origin").distinct().rdd.flatMap(lambda x: x).collect()

# # Print the unique values
# for name in unique_names:
#     print(name)

In [308]:
# Define the list of specific texts you want to search for
specific_texts = ["ATL", "CLT", "DEN", "DFW", "EWR", "IAH", "JFK", "LAS", "LAX", "MCO", "MIA", "ORD", "PHX", "SEA", "SFO"]  # Add your specific texts here

# Filter DataFrame rows based on multiple specific texts in a particular column
filtered_df = df_pyspark.filter(col("Origin").isin(specific_texts))

# Show the filtered DataFrame
filtered_df.show(100)

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+---------------------------------------+----------------------------------------------+-------------------------------------------------+--------------------------------------------------+------------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+-------------------+---------+-------------+--------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------

In [None]:
 # Join df_Arr and df_weather based on Newdate and rounded_date
joined_df_dep = filtered_df.join(df_weather, (filtered_df.rounded_hour == df_weather.rounded_hour)& (filtered_df.FlightDate == df_weather.Date)& (filtered_df.Dest == df_weather.AreaCode), "inner")
joined_df_dep.show(1000)

In [306]:
# # Define the list of specific texts you want to search for
# specific_texts = ["ATL", "CLT", "DEN", "DFW", "EWR", "IAH", "JFK", "LAS", "LAX", "MCO", "MIA", "ORD", "PHX", "SEA", "SFO"]  # Add your specific texts here

# # Filter DataFrame rows based on multiple specific texts in a particular column
# filtered_df = df_pyspark.filter(col("Origin").isin(specific_texts))

# # Show the filtered DataFrame
# filtered_df.show(100)

In [307]:
# # Assuming df is your DataFrame
# row_count = filtered_df.count()

# # Print the number of rows
# print("Number of rows in the DataFrame:", row_count)
