In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Get the active SparkSession, or create one with default settings if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Lecture des données

In [2]:
path = "./data/covid-hospit-incid-reg-2021-01-20-19h20.csv"
# Read the semicolon-delimited CSV with a header row.
# The option must be spelled `inferSchema`: a misspelled key is not applied, which
# leaves every column typed as string and breaks numeric plots downstream.
df = (
    spark.read.format('csv')
    .options(header=True, inferSchema=True, delimiter=';')
    .load(path)
)
# Show the inferred (column, type) pairs as a quick sanity check.
df.dtypes

AnalysisException: Path does not exist: file:/home/jovyan/work/data/covid-hospit-incid-reg-2021-01-20-19h20.csv;

## Checking dataframe size

In [None]:
# Count the rows once (this triggers a Spark job), then report the figure.
row_count = df.count()
print('File row number :%d' % row_count)

## Checking unique values of a column

In [None]:
# All regions: list every distinct value of the "nomReg" column.
# The row limit is taken from the distinct result itself rather than from a count
# of the whole dataset, and truncate=False keeps long region names from being
# cut at 20 characters.
regions = df.select('nomReg').distinct()
regions.show(regions.count(), truncate=False)

## Nombre d'admissions en réanimation depuis le début du COVID-19, par région

In [None]:
# Sum the "incid_rea" column per region. Spark names the aggregate column
# "sum(incid_rea)", which the following cells rely on.
total_incid_rea = F.sum("incid_rea")
df_with_total = df.groupBy("nomReg").agg(total_incid_rea)
df_with_total.collect()

In [None]:
# Regions hit hardest by COVID-19: order the per-region totals from largest to smallest.
by_total_desc = F.desc("sum(incid_rea)")
df_with_total.orderBy(by_total_desc).collect()

In [None]:
# Give the aggregate column a plot-friendly name. withColumnRenamed does nothing
# when the source column is absent, so re-running this cell is harmless.
renamed_totals = df_with_total.withColumnRenamed(
    existing="sum(incid_rea)",
    new="Total",
)
df_with_total = renamed_totals
df_with_total


In [None]:
# Bring the small per-region summary to the driver as a pandas frame for seaborn.
test_df = df_with_total.toPandas()
# Horizontal bars: one bar per region, bar length = total.
sns.barplot(
    data=test_df,
    y="nomReg",
    x="Total",
)

## Progression des admissions en réanimation pour chaque région (COVID-19)

In [None]:
# Bring only the columns needed for plotting to the driver, then normalise the
# dtypes in pandas so the x axis is a real time axis and the y axis is numeric,
# whatever types Spark assigned when reading the CSV.
hospit_pd = df.select('nomReg', 'jour', 'incid_rea').toPandas()
hospit_pd['jour'] = pd.to_datetime(hospit_pd['jour'])
hospit_pd['incid_rea'] = pd.to_numeric(hospit_pd['incid_rea'])
# Spark gives no row-order guarantee; sort so each line is drawn chronologically.
hospit_pd = hospit_pd.sort_values('jour')

# One small panel per region, four panels per row.
g = sns.FacetGrid(hospit_pd, col='nomReg', hue='nomReg', col_wrap=4)
g.map(plt.plot, 'jour', 'incid_rea')
g.map(plt.fill_between, 'jour', 'incid_rea', alpha=0.2)
# Each panel is titled with its region name only.
g.set_titles("{col_name}")

# Leave room at the top and add a title for the whole plot.
# `g` is kept bound to the FacetGrid (not to the Text returned by suptitle).
g.fig.subplots_adjust(top=0.92)
g.fig.suptitle("Evolution d'admission en Réa")

# `plt` is the pyplot module imported at the top; `plot` was an undefined name.
plt.show()

In [None]:
# Mean of "incid_rea" per (month, region); the month key is "jour" formatted as yyyy_MM.
month_key = F.date_format('jour', 'yyyy_MM').alias('month')
mean_incid_rea = F.avg('incid_rea').alias('mean')
result = df.groupBy(month_key, 'nomReg').agg(mean_incid_rea)
result.show()

In [None]:
# Convert the monthly means to pandas once; each toPandas() call runs a Spark
# job, and the earlier discarded conversion and bare `result` had no effect.
monthly_pd = result.toPandas()

# One line per region: monthly mean of incid_rea over time.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=monthly_pd, x="month", y="mean", hue="nomReg", ax=ax)