# Get started with Spark

In this tutorial, we will learn how to:
- create a Spark session
- read a CSV file
- read a Parquet file
- run basic data-quality checks on a dataframe (empty values, duplicated rows)


## 1. Create a spark session

In [1]:
from pyspark.sql import SparkSession, DataFrame

In [2]:
# Create (or reuse) a Spark session running in local mode, with 4g of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .appName("Analyze_fr_immo_transactions")
    .getOrCreate()
)

In [3]:
# The configuration of the Spark session can be inspected at any time.
# List every (key, value) pair of the current configuration:
spark.sparkContext.getConf().getAll()

[('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.driver.memory', '4g'),
 ('spark.driver.port', '55625'),
 ('spark.app.name', 'Analyze_fr_immo_t

> In this tutorial, we focus only on Spark in local mode. CASD also offers the `yarn` and `k8s` modes.

## 2. Read a csv file

In [4]:
# NOTE(review): absolute, machine-specific path; adjust it to your own environment.
csv_sample_file_path = "C:/Users/PLIU/Documents/ubuntu_share/data_set/france_immobilier/transactions_sample.csv"

# header=True: the first line of the file holds the column names.
# inferSchema=True: Spark derives each column's data type from the file content.
sample_df = spark.read.csv(
    path=csv_sample_file_path,
    header=True,
    inferSchema=True,
)

In [5]:
# Display the first 5 rows of the CSV sample.
sample_df.show(5)

+--------------+----------------+--------+-----------+--------+--------------------+-----------+--------------------+-------------+-----+--------+-----------------+--------------------+----------------+----------------+-------------------+--------------------------+--------------------------+---------------------+-----------------------+
|id_transaction|date_transaction|    prix|departement|id_ville|               ville|code_postal|             adresse|type_batiment| vefa|n_pieces|surface_habitable|id_parcelle_cadastre|        latitude|       longitude|surface_dependances|surface_locaux_industriels|surface_terrains_agricoles|surface_terrains_sols|surface_terrains_nature|
+--------------+----------------+--------+-----------+--------+--------------------+-----------+--------------------+-------------+-----+--------+-----------------+--------------------+----------------+----------------+-------------------+--------------------------+--------------------------+---------------------+-----

In [9]:
# Print the column names and the data types obtained for the CSV sample.
sample_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: date (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: integer (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- vefa: boolean (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- id_parcelle_cadastre: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- surface_dependances: string (nullable = true)
 |-- surface_locaux_industriels: string (nullable = true)
 |-- surface_terrains_agricoles: string (nullable = true)
 |-- surface_terrains_sols: string (nullable = true)
 |-- surface_terrains_nature: string (nullable = true)



## 3. Read a parquet file



In [10]:
# NOTE(review): absolute, machine-specific path; adjust it to your own environment.
fr_immo_transaction_path = "C:/Users/PLIU/Documents/git/Seminar_PySpark_Sedona_GeoParquet/data/fr_immo_transaction.parquet"
# Load the Parquet dataset into a Spark dataframe (no header/inferSchema options are passed here).
fr_immo_transactions_df = spark.read.parquet(fr_immo_transaction_path)

In [11]:
# Columns kept for the analysis; every other column of the dataset is dropped.
required_col = [
    "id_transaction",
    "date_transaction",
    "prix",
    "departement",
    "ville",
    "code_postal",
    "adresse",
    "type_batiment",
    "n_pieces",
    "surface_habitable",
    "latitude",
    "longitude",
]
clean_fr_immo_df = fr_immo_transactions_df.select(*required_col)

In [16]:
# Cache the dataframe for better performance: it is reused by several later cells.
clean_fr_immo_df.cache()
# Display the first rows of the dataframe.
clean_fr_immo_df.show()

+--------------+----------------+--------+-----------+--------------------+-----------+--------------------+-------------+--------+-----------------+----------------+----------------+
|id_transaction|date_transaction|    prix|departement|               ville|code_postal|             adresse|type_batiment|n_pieces|surface_habitable|        latitude|       longitude|
+--------------+----------------+--------+-----------+--------------------+-----------+--------------------+-------------+--------+-----------------+----------------+----------------+
|        141653|      2014-01-02|197000.0|         01|             TREVOUX|       1600|  6346 MTE DES LILAS|  Appartement|       4|               84|45.9423014034837|4.77069364742062|
|        141970|      2014-01-02|157500.0|         01|              VIRIAT|       1440|1369 RTE DE STRAS...|       Maison|       4|              103|46.2364072868351|5.26293493674271|
|        139240|      2014-01-02|112000.0|         01|SAINT-JEAN-SUR-VEYLE|     

In [8]:
from ydata_profiling import ProfileReport

# WARNING: this step needs a lot of memory and takes about 20 minutes to run, so be careful.
# NOTE(review): the run recorded in this notebook stopped at "Detecting duplicates" (17/20)
# with a ConnectionRefusedError, so the HTML report may not have been written.
report = ProfileReport(clean_fr_immo_df)
report.to_file("fr_immo_transactions_report.html")

  from .autonotebook import tqdm as notebook_tqdm


Summarize dataset:  85%|████████▌ | 17/20 [16:37<02:56, 58.69s/it, Detecting duplicates]                


ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

## Clean dataframe

We want to check some basic information of the dataframe:
- Total row count
- Schema (e.g. column names and data types)
- Empty rows (all-null)
- Rows with any missing values
- Duplicate rows
- Rows containing empty strings
- Nulls count per column

In [6]:
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.functions import col, when, isnan, trim
import pyspark.sql.types as spark_types

def get_empty_row_count_per_column(df:DataFrame):
    """
    Count, for every column of ``df``, the rows that hold an "empty" value.

    Four kinds of empty value are counted separately:
    - null: the value is null (tested on every column);
    - nan: the value is NaN (tested on numeric columns only);
    - blank: the value is an empty string once trimmed (string columns only);
    - null symbol: the value is exactly "?" or "-" (string columns only).

    A count is reported as 0 when the corresponding test does not apply to the
    column type, and when ``df`` has no rows.

    :param df: the dataframe to inspect
    :return: a dataframe with one row per column of ``df`` and the columns
             column_name, null_count, nan_count, blank_count, null_symbol_count,
             total_empty_row_count (sum of the four counts) and total_row_count
    :raises Exception: any error raised by Spark during the aggregation is
             re-raised after a diagnostic message has been printed
    """
    totalRowCount = df.count()

    nullSymbols = ["?", "-"]
    aggExpression = []

    def flagColumnNames(colName):
        # temporary flag column names: (null, nan, blank, null symbol)
        return (f"{colName}__null", f"{colName}__nan",
                f"{colName}__blank", f"{colName}__symbol")

    # step1: build the 0/1 flag expressions detecting the various null cases
    for colName in df.columns:
        nullCountCol, nanCountCol, blankCountCol, nullSymbolCountCol = flagColumnNames(colName)
        c = col(colName)
        colType = df.schema[colName].dataType
        # always test null
        aggExpression.append(when(c.isNull(), 1).otherwise(0).alias(nullCountCol))
        # test isnan for only numeric columns
        if isinstance(colType, spark_types.NumericType):
            aggExpression.append(when(isnan(c), 1).otherwise(0).alias(nanCountCol))
        # string null value only for string columns
        if isinstance(colType, spark_types.StringType):
            aggExpression.append(when(trim(c) == "", 1).otherwise(0).alias(blankCountCol))
            aggExpression.append(when(c.isin(nullSymbols), 1).otherwise(0).alias(nullSymbolCountCol))

    # step2: sum all per-column flags in one single pass
    summed = {}
    # a dataframe without columns has nothing to aggregate (agg() needs at least one expression)
    if aggExpression:
        flaggedDf = df.select(*aggExpression)
        try:
            summed = flaggedDf.agg(*[spark_sum(c).alias(c) for c in flaggedDf.columns]).collect()[0].asDict()
        except Exception as e:
            print(f"Aggregation failed on flaggedDf columns: {flaggedDf.columns}: {e}")
            # re-raise: continuing without the sums would only fail later with a misleading error
            raise

    # step3: build the list of tuples holding all info for the final result dataframe
    result = []
    for colName in df.columns:
        nullCountCol, nanCountCol, blankCountCol, nullSymbolCountCol = flagColumnNames(colName)
        # `or 0` covers both a missing key (test not applicable to the column type)
        # and a None sum (the sum over zero rows is null)
        nullCount = summed.get(nullCountCol) or 0
        nanCount = summed.get(nanCountCol) or 0
        blankCount = summed.get(blankCountCol) or 0
        symbolCount = summed.get(nullSymbolCountCol) or 0
        totalEmpty = nullCount + nanCount + blankCount + symbolCount

        result.append((
            colName, nullCount, nanCount, blankCount,
            symbolCount, totalEmpty, totalRowCount
        ))

    # explicit schema so that an empty result (dataframe without columns) can still be created
    resultSchema = ("column_name string, null_count long, nan_count long, blank_count long, "
                    "null_symbol_count long, total_empty_row_count long, total_row_count long")
    # use the session that owns `df` instead of relying on a global `spark` variable
    resDf = df.sparkSession.createDataFrame(result, schema=resultSchema)

    return resDf


In [17]:
def get_duplicated_row_count(df:DataFrame):
    """
    Print the number of duplicated rows of a dataframe.

    The number is the difference between the total row count and the row
    count that remains after dropping duplicated rows.

    :param df: the dataframe to inspect
    :return: None, the result is only printed
    """
    total_rows = df.count()
    distinct_rows = df.dropDuplicates().count()
    print(f"Duplicate row count: {total_rows - distinct_rows}")


In [13]:
# Compute the per-column empty-value statistics of the selected columns.
null_col_stats = get_empty_row_count_per_column(clean_fr_immo_df)

In [15]:
# Display up to 20 rows of the statistics (one row per inspected column).
null_col_stats.show(20)

+-----------------+----------+---------+-----------+-----------------+---------------------+---------------+
|      column_name|null_count|nan_count|blank_count|null_symbol_count|total_empty_row_count|total_row_count|
+-----------------+----------+---------+-----------+-----------------+---------------------+---------------+
|   id_transaction|         0|        0|          0|                0|                    0|        9141573|
| date_transaction|         0|        0|          0|                0|                    0|        9141573|
|             prix|         0|        0|          0|                0|                    0|        9141573|
|      departement|         0|        0|          0|                0|                    0|        9141573|
|            ville|         0|        0|          0|                0|                    0|        9141573|
|      code_postal|         0|        0|          0|                0|                    0|        9141573|
|          adresse|

In [18]:
# Print how many rows of the dataframe are duplicates of another row.
get_duplicated_row_count(clean_fr_immo_df)

Duplicate row count: 0


In [None]:
# TODO: create a geometry column (not implemented yet)

