Petite astuce pour éviter les bugs :

In [2]:
spark.conf.set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation","true")

#### 2. c)	Dans un notebook, chargez le fichier CSV en PySpark dans un dataframe

In [4]:
############# Chargement du fichier CSV #############
# infos du fichier
file_location = "/FileStore/tables/transactions.csv"
file_type = "csv"

# options csv
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# les options sont spécifiques aux fichiers CSV, elles seront ignorées si le fichier est d'un autre type
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

# ou encore : 
# spark.read.csv(file_location, header=first_row_is_header, inferSchema = infer_schema, sep = delimiter)

#### 2. d)	Affichez le dataframe et affichez uniquement les 10 premiers éléments de la colonne Amount

In [6]:
############# Affichage du Dataframe #############
display(df)

account_sender_name,country_sender,account_receiver_name,country_receiver,datetime_timestamp,amount
Nissan,US,Elon Musk,SPA,0,8192372
McKesson,JAP,Elon Musk,SWI,1000,4428350
McKesson,UK,Tesla,SPA,2000,863811
ConocoPhillips,CHI,John Coulberg,ITA,3000,7330833
McKesson,JAP,Altaran,FR,4000,5167665
Altaran,SPA,McKesson,FR,5000,830825
McKesson,SP,Nissan,CHI,6000,5123100
Exxon Mobil,CHI,Tesla,SWI,7000,3198840
Altaran,US,Laurent Alexandre,SPA,8000,5408199
Altaran,JAP,Nissan,ITA,9000,7215787


In [7]:
############# Affichage des 10 premières valeurs de Amount #############
df.select("amount").show(10)

In [8]:
# printschema() affiche l'ensemble des variables du dataframe et leurs types
df.printSchema()

#### 2. e)	Utilisez la méthode describe() et affichez les résultats

In [10]:
############# Affichage du describe #############
df.describe().show()
df.describe("datetime_timestamp", "amount").show()

### 2. f) Calculez en utilisant les éléments du module pyspark.sql.functions :  
◦	la moyenne, l'écart-type, la somme, le max et le min de la colonne Amount  
◦	affichez les 5 plus importantes transactions   
◦	la moyenne et la somme des transactions par pays émetteur  
◦	la somme des transactions envoyés et reçues par "Laurent Alexandre" et le solde net

In [12]:
print(df.filter(df.account_sender_name=="Laurent Alexandre").select(sum("amount")).collect()[0][0])

In [13]:
############# Manipulation de données avec pyspark #############
from pyspark.sql.functions import *

#la moyenne, l'écart-type, la somme le max et le min de la colonne Amount
df[avg("amount"), sum("amount"), stddev("amount"), min("amount"), max("amount")].show()

#affichez les 5 plus importantes transactions
df.orderBy(desc("amount")).show(5)

#la moyenne et la somme des transactions par pays émetteur
df.groupBy("country_sender").agg(avg("amount"), sum("amount")).show()

#la somme des transactions envoyés et reçues par "Laurent Alexandre" et calculer le solde net
LA_sent = df.filter(df.account_sender_name=="Laurent Alexandre").select(sum("amount")).collect()[0]["sum(amount)"]
LA_received = df.filter(df.account_receiver_name=="Laurent Alexandre").select(sum("amount")).collect()[0]["sum(amount)"]
diff = LA_sent-LA_received
print(LA_sent, LA_received, diff)

### 3. a) À l'aide de la méthode df.createOrRempTempView, mettez le dataframe dans une table

In [15]:
############# Création d'une table #############
df.createOrReplaceTempView("transactions_csv")

### 3. b) Refaites les calculs de la question 2.f) en SQL

In [17]:
%sql
SELECT * FROM (
(SELECT account_sender_name, sum(amount) AS sent FROM transactions_csv WHERE account_sender_name = "Laurent Alexandre" GROUP BY account_sender_name) AS A
LEFT JOIN
(SELECT account_receiver_name, sum(amount) AS received FROM transactions_csv WHERE account_receiver_name = "Laurent Alexandre" GROUP BY account_receiver_name) AS B
ON A.account_sender_name=B.account_receiver_name
);

account_sender_name,sent,account_receiver_name,received
Laurent Alexandre,1052820209,Laurent Alexandre,948521753


In [18]:
%sql
/************ Manipulation des données en SQL : questions 2.f) ************/
SELECT account_sender_name AS name, sent, received, sent-received AS solde FROM (
(SELECT account_sender_name, sum(amount) AS sent FROM transactions_csv WHERE account_sender_name = "Laurent Alexandre" GROUP BY account_sender_name) AS A
LEFT JOIN
(SELECT account_receiver_name, sum(amount) AS received FROM transactions_csv WHERE account_receiver_name = "Laurent Alexandre" GROUP BY account_receiver_name) AS B
ON A.account_sender_name=B.account_receiver_name
);

name,sent,received,solde
Laurent Alexandre,1052820209,948521753,104298456


#### 3. c)	Créez une table avec les transactions émises depuis la France et enregistrez-la pour pouvoir y accéder ailleurs que dans ce notebook

In [20]:
%sql
DROP TABLE IF EXISTS transa_fr; 
/************ Création de la table avec les lignes françaises seulement ************/
CREATE TABLE transa_fr AS SELECT * FROM transactions_csv WHERE country_sender = "FR";

SELECT * FROM transa_fr

account_sender_name,country_sender,account_receiver_name,country_receiver,datetime_timestamp,amount
ConocoPhillips,FR,John Coulberg,SWI,11000,4151347
Nissan,FR,Laurent Alexandre,US,19000,744248
McKesson,FR,Renault,UK,52000,4636359
Tesla,FR,Chevron,SWI,58000,6840434
Laurent Alexandre,FR,Activision Blizard,SWI,87000,8019988
Nissan,FR,McKesson,US,92000,6005378
Walmart,FR,Laurent Alexandre,SWI,95000,660870
Nissan,FR,Exxon Mobil,SPA,117000,801626
Chevron,FR,Elon Musk,SWI,127000,2228049
Activision Blizard,FR,McKesson,UK,142000,592827


In [21]:
# Pour enregistrer un dataframe comme une table SQL : 
df.write.mode("overwrite").saveAsTable("transa_fr_fromdf")

In [22]:
%sql
SELECT * FROM transa_fr_fromdf

account_sender_name,country_sender,account_receiver_name,country_receiver,datetime_timestamp,amount
Nissan,US,Elon Musk,SPA,0,8192372
McKesson,JAP,Elon Musk,SWI,1000,4428350
McKesson,UK,Tesla,SPA,2000,863811
ConocoPhillips,CHI,John Coulberg,ITA,3000,7330833
McKesson,JAP,Altaran,FR,4000,5167665
Altaran,SPA,McKesson,FR,5000,830825
McKesson,SP,Nissan,CHI,6000,5123100
Exxon Mobil,CHI,Tesla,SWI,7000,3198840
Altaran,US,Laurent Alexandre,SPA,8000,5408199
Altaran,JAP,Nissan,ITA,9000,7215787


In [23]:
# pour enregistrer au format parquet
df.write.format("parquet").mode("overwrite").save('/FileStore/parquet/transa_fr')

# ... et lire un fichier au format parquet
display(spark.read.load("/FileStore/parquet/transa_fr"))

account_sender_name,country_sender,account_receiver_name,country_receiver,datetime_timestamp,amount
Nissan,US,Elon Musk,SPA,0,8192372
McKesson,JAP,Elon Musk,SWI,1000,4428350
McKesson,UK,Tesla,SPA,2000,863811
ConocoPhillips,CHI,John Coulberg,ITA,3000,7330833
McKesson,JAP,Altaran,FR,4000,5167665
Altaran,SPA,McKesson,FR,5000,830825
McKesson,SP,Nissan,CHI,6000,5123100
Exxon Mobil,CHI,Tesla,SWI,7000,3198840
Altaran,US,Laurent Alexandre,SPA,8000,5408199
Altaran,JAP,Nissan,ITA,9000,7215787
