# Get started with spark

In this tutorial, we will learn:
- how to create a spark session
- read a csv file
- read a parquet file


## 1. Create a spark session

In [1]:
from pyspark.sql import SparkSession

In [2]:

# create a spark session in local mode
spark = SparkSession.builder \
     .master("local[*]") \
    .appName("Analyze_fr_immo_transactions") \
    .getOrCreate()

In [3]:
data_path = "C:/Users/PLIU/Documents/ubuntu_share/data_set/france_immobilier/transactions_sample.csv"

In [4]:
fr_immo_transactions_df = spark.read.csv(data_path, header=True, inferSchema=True)

In [5]:
fr_immo_transactions_df.show()

+--------------+----------------+--------+-----------+--------+--------------------+-----------+--------------------+-------------+-----+--------+-----------------+--------------------+----------------+-----------------+-------------------+--------------------------+--------------------------+---------------------+-----------------------+
|id_transaction|date_transaction|    prix|departement|id_ville|               ville|code_postal|             adresse|type_batiment| vefa|n_pieces|surface_habitable|id_parcelle_cadastre|        latitude|        longitude|surface_dependances|surface_locaux_industriels|surface_terrains_agricoles|surface_terrains_sols|surface_terrains_nature|
+--------------+----------------+--------+-----------+--------+--------------------+-----------+--------------------+-------------+-----+--------+-----------------+--------------------+----------------+-----------------+-------------------+--------------------------+--------------------------+---------------------+--

In [6]:
fr_immo_transactions_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: date (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: integer (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- vefa: boolean (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- id_parcelle_cadastre: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- surface_dependances: string (nullable = true)
 |-- surface_locaux_industriels: string (nullable = true)
 |-- surface_terrains_agricoles: string (nullable = true)
 |-- surface_terrains_sols: string (nullable = true)
 |-- surface_terrains_nature: string (nullable = true)



In [7]:
required_col = ["id_transaction","date_transaction","prix","departement","ville","code_postal","adresse","type_batiment","vefa","n_pieces","surface_habitable","latitude","longitude"]
clean_fr_immo_df = fr_immo_transactions_df.select(required_col)

In [8]:
clean_fr_immo_df.show()

+--------------+----------------+--------+-----------+--------------------+-----------+--------------------+-------------+-----+--------+-----------------+----------------+-----------------+
|id_transaction|date_transaction|    prix|departement|               ville|code_postal|             adresse|type_batiment| vefa|n_pieces|surface_habitable|        latitude|        longitude|
+--------------+----------------+--------+-----------+--------------------+-----------+--------------------+-------------+-----+--------+-----------------+----------------+-----------------+
|      10160888|      2015-07-22|222500.0|         63|               MUROL|      63790|         5148  COMBE|       Maison|false|       4|              123|45.5729726213494| 2.94997597336221|
|      10319766|      2024-06-18|218640.0|         73|          LES ALLUES|      73550|45 RUE DU GRAND C...|  Appartement|false|       1|               23|45.3987396582855| 6.56760164004644|
|      11545562|      2020-07-23|254950.0|   

In [10]:
from ydata_profiling import ProfileReport

report = ProfileReport(clean_fr_immo_df.toPandas())
report.to_file("fr_immo_transactions_report.html")

Summarize dataset:  56%|█████▌    | 10/18 [00:00<00:00, 68.00it/s, Describe variable: longitude]
100%|██████████| 13/13 [00:00<00:00, 1003.99it/s]
 2 7 4 9 7 4 6 9 0 3 6 8 3 4 1 6 1 4 2 7 2 3 8 7 3 0 6 1 3 9 6 9 4 7 7 3 8
 1 9 2 7 4 4 8 0 2 9 8 3 3 8 1 7 7 2 2 2 0 1 6 5 3 3]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  discretized_df.loc[:, column] = self._discretize_column(
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  discretized_df.loc[:, column] = self._discretize_column(
 3 7 4 9 7 4 7 9 1 3 6 8 3 4 1 7 2 4 3 7 2 3 8 7 3 0 7 1 3 8 6 9 5 8 7 3 8
 2 9 3 7 4 4 8 0 3 9 7 3 3 8 1 7 7 2 3 3 1 2 7 5 3 3]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  discretized_df.loc[:, column] = self._discretize_column(
 4 4 6 4 4 2 6 4 3 6 4 3 