## Start 

In [30]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("Preparation")
    .getOrCreate()
)

# Load the CSV: the first line is the header and Spark infers each column's type.
data = "./data/Data.csv"  # path to the input file
df = spark.read.options(header=True, inferSchema=True).csv(data)





                                                                                

In [31]:
# Preview the loaded frame; these are show()'s defaults (20 rows, long cells truncated)
# written out explicitly.
df.show(n=20, truncate=True)
# Total number of rows; as the last expression it is rendered as the cell's output.
df.count()

+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+
|          Airline|Date_of_Journey|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|
+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+
|           IndiGo|     24/03/2019|Banglore|  New Delhi|           BLR → DEL|2024-11-15 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|
|        Air India|      1/05/2019| Kolkata|   Banglore|CCU → IXR → BBI →...|2024-11-15 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|
|      Jet Airways|      9/06/2019|   Delhi|     Cochin|DEL → LKO → BOM →...|2024-11-15 09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|
|           IndiGo|     12/05/2019| Kolkata|  

10683

## Clean the data 

In [32]:
from pyspark.sql.functions import to_timestamp,col
df.printSchema()
# Schema as loaded (before any conversion):
#  |-- Airline: string (nullable = true)
#  |-- Date_of_Journey: string (nullable = true)
#  |-- Source: string (nullable = true)
#  |-- Destination: string (nullable = true)
#  |-- Route: string (nullable = true)
#  |-- Dep_Time: timestamp (nullable = true)
#  |-- Arrival_Time: string (nullable = true)
#  |-- Duration: string (nullable = true)
#  |-- Total_Stops: string (nullable = true)
#  |-- Additional_Info: string (nullable = true)
#  |-- Price: integer (nullable = true)
# Several columns need to be converted to a proper type.

# `Date_of_Journey` is a string such as "24/03/2019" or "1/05/2019"; parse it into a timestamp.
# - withColumn() takes the column *name* (a str) as first argument, not a Column object.
# - A single `d` is used for the day because the data contains non-padded days ("1/05/2019").
df = df.withColumn('Date_of_Journey', to_timestamp(col('Date_of_Journey'), 'd/MM/yyyy'))

from pyspark.sql.functions import col, to_timestamp, regexp_extract, when, concat, unix_timestamp, expr
from pyspark.sql import functions as F

# 1. Normalise `Arrival_Time` so that every value has the shape "HH:mm dd MMM yyyy".
#    The literal suffixes must be wrapped in lit(): concat() interprets a bare Python
#    string as a column *name*, so " 01 Jan 1970" would be looked up as a column.
#    NOTE(review): the default year 2024 does not match the 2019 journey dates shown in
#    the data preview - confirm which year is intended.
df = df.withColumn(
    "Arrival_Time_Processed",
    when(
        col("Arrival_Time").rlike(r"^\d{2}:\d{2}$"),  # time only (HH:mm)
        concat(col("Arrival_Time"), F.lit(" 01 Jan 1970"))  # append a default date
    )
    .when(
        col("Arrival_Time").rlike(r"^\d{2}:\d{2} \d{2} [A-Za-z]{3}$"),  # time + day + month (HH:mm dd MMM)
        concat(col("Arrival_Time"), F.lit(" 2024"))  # append a default year
    )
    .otherwise(col("Arrival_Time"))  # anything else is kept unchanged
)

# 2. Parse the normalised string into a timestamp, replacing the original `Arrival_Time`.
df = df.withColumn("Arrival_Time", to_timestamp(col("Arrival_Time_Processed"), "HH:mm dd MMM yyyy"))

# 3. Split `Duration` (e.g. "7h 25m", "19h") into integer hour and minute columns.
duration_text = col("Duration")
hours_part = regexp_extract(duration_text, r"(\d+)(?=h)", 1).cast("int")    # digits right before "h"
minutes_part = regexp_extract(duration_text, r"(\d+)(?=m)", 1).cast("int")  # digits right before "m"

df = (
    df
    .withColumn("Duration_Hours", hours_part)
    .withColumn("Duration_Minutes", minutes_part)
    # A part that is absent from the text ends up null; treat it as 0.
    .na.fill({"Duration_Hours": 0, "Duration_Minutes": 0})
)

# 4. Make sure `Dep_Time` is a timestamp.
#    The inferred schema already reports it as `timestamp`, so this is a safety conversion.
df = df.withColumn("Dep_Time", to_timestamp(col("Dep_Time"), "yyyy-MM-dd HH:mm:ss"))

# 5. Compute the arrival time as departure time + flight duration.
#    Spark SQL interval literals (`INTERVAL 2 HOURS`) only accept literal values, not column
#    references, so the duration is added in epoch seconds and cast back to a timestamp.
df = df.withColumn(
    "Arrival_Time_Calculated",
    (
        unix_timestamp(col("Dep_Time"))
        + col("Duration_Hours") * 3600
        + col("Duration_Minutes") * 60
    ).cast("timestamp")
)

# Display the final result
df.select("Airline", "Dep_Time", "Arrival_Time_Calculated", "Duration", "Arrival_Time").show(truncate=False)


root
 |-- Airline: string (nullable = true)
 |-- Date_of_Journey: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Dep_Time: timestamp (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Total_Stops: string (nullable = true)
 |-- Additional_Info: string (nullable = true)
 |-- Price: integer (nullable = true)



In [7]:
from functools import reduce
from operator import or_

# Show every row that has a null in at least one column.
# The condition is built from df.columns instead of a hard-coded list of names, so the
# check works for whichever CSV is currently loaded and covers every column.
null_in_any_column = reduce(or_, (df[name].isNull() for name in df.columns))
df.filter(null_in_any_column).show()

# No null values (the recorded run returned no rows)

24/11/15 16:48:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class, duration, days_left, price
 Schema: _c0, airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class, duration, days_left, price
Expected: _c0 but found: 
CSV file: file:///app/data/Dataset.csv


+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|_c0|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



In [24]:
# Remove duplicate rows, where two rows count as duplicates when they share the same `_c0`.
# NOTE(review): `_c0` is not part of the schema printed earlier in this notebook; this cell
# appears to have been run against a different CSV (see the warning log below) - confirm.
dedup_keys = ['_c0']
df = df.dropDuplicates(subset=dedup_keys)
# Row count after de-duplication
df.count()

24/11/15 16:59:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class, duration, days_left, price
 Schema: _c0, airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class, duration, days_left, price
Expected: _c0 but found: 
CSV file: file:///app/data/Dataset.csv
                                                                                

300153

In [13]:
# Group on every column so that fully identical rows fall into the same group,
# then keep only the groups that occur more than once.
row_counts = df.groupBy(*df.columns).count()
df_duplicates = row_counts.where("count > 1")
df_duplicates.show()


24/11/15 16:56:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class, duration, days_left, price
 Schema: _c0, airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class, duration, days_left, price
Expected: _c0 but found: 
CSV file: file:///app/data/Dataset.csv
                                                                                

+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+-----+
|_c0|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|count|
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+-----+
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+-----+

