In [30]:
from pyspark.sql import SparkSession 
import pyspark.sql.functions as F

In [31]:
#Crée une session Spark
spark = SparkSession.builder.master("local[*]").appName("MigrationExample").getOrCreate()

In [32]:
# Données
matrix = [

[1, 34, 'Cardiology', 10],
[2, 45, 'Neurology', 12],
[3, 23, 'Cardiology', 5],
[4, 64, 'Orthopedics', 8],
[5, 52, 'Cardiology', 9],

]

columns = ['patient_id', 'age', 'department', 'visit_count']

In [33]:
df = spark.createDataFrame(matrix, columns)

df.show()

+----------+---+-----------+-----------+
|patient_id|age| department|visit_count|
+----------+---+-----------+-----------+
|         1| 34| Cardiology|         10|
|         2| 45|  Neurology|         12|
|         3| 23| Cardiology|          5|
|         4| 64|Orthopedics|          8|
|         5| 52| Cardiology|          9|
+----------+---+-----------+-----------+



### Réaliser des agrégations par département médical

In [34]:
agg_df = df.groupBy(
    "department").agg(
    F.sum("visit_count").alias("total_visit_count"),  # Somme de la colonne 'visit_count'
    F.mean("age").alias("mean_age"),                 # Moyenne de la colonne 'age'
    F.max("age").alias("max_age")                    # Valeur maximale de la colonne 'age'
)

In [35]:
agg_df.show()

+-----------+-----------------+------------------+-------+
| department|total_visit_count|          mean_age|max_age|
+-----------+-----------------+------------------+-------+
| Cardiology|               24|36.333333333333336|     52|
|  Neurology|               12|              45.0|     45|
|Orthopedics|                8|              64.0|     64|
+-----------+-----------------+------------------+-------+



In [51]:
data = [
    ('John Doe', 'Diabetes'),
    ('Jane Smith', 'Heart Disease'),
    ('Alice Brown', 'Hypertension')
]

# Création du DataFrame PySpark
columns = ['patient_name', 'diagnosis']
df2 = spark.createDataFrame(
    data, schema=columns)

# Conversion en minuscules et ajout d'un champ
df2 = df2.withColumn(
    "diagnosis_lower", F.lower("diagnosis"))
df2 = df2.withColumn(
    "full_info", F.concat_ws(' - ', 
                      df2.patient_name, 
                      df2.diagnosis_lower))

df2.show(
    truncate = False)

+------------+-------------+---------------+--------------------------+
|patient_name|diagnosis    |diagnosis_lower|full_info                 |
+------------+-------------+---------------+--------------------------+
|John Doe    |Diabetes     |diabetes       |John Doe - diabetes       |
|Jane Smith  |Heart Disease|heart disease  |Jane Smith - heart disease|
|Alice Brown |Hypertension |hypertension   |Alice Brown - hypertension|
+------------+-------------+---------------+--------------------------+



In [52]:
import pandas as pd

# Données
df = pd.DataFrame(
    {
        'patient_id': [1, 2, 3, 4, 5],
        'age': [34, 70, 50, 20, 15],
        'department': [
            'Cardiology',
            'Neurology',
            'Orthopedics',
            'Cardiology',
            'Neurology',
        ],
    }
)

# Ajout d'une colonne conditionnelle (catégorie d'âge)
df['age_category'] = df['age'].apply(
    lambda x: 'senior' if x > 60 else 'adult' if x > 18 else 'minor'
)

df

Unnamed: 0,patient_id,age,department,age_category
0,1,34,Cardiology,adult
1,2,70,Neurology,senior
2,3,50,Orthopedics,adult
3,4,20,Cardiology,adult
4,5,15,Neurology,minor


In [60]:
# Données sous forme de liste de tuples
data = [
    (1, 34, 'Cardiology'),
    (2, 70, 'Neurology'),
    (3, 50, 'Orthopedics'),
    (4, 20, 'Cardiology'),
    (5, 15, 'Neurology')
]

# Création du DataFrame PySpark
columns = ['patient_id', 'age', 'department']
df3 = spark.createDataFrame(
    data, schema=columns)

df3.show()

# Ajout d'une colonne conditionnelle (catégorie d'âge)
df3 = df3.withColumn(
    'age_category', 
    F.when(df3.age > 60, 'senior')
    .when(df3.age > 18, 'adult')
    .otherwise('minor'))

df3.show()

+----------+---+-----------+
|patient_id|age| department|
+----------+---+-----------+
|         1| 34| Cardiology|
|         2| 70|  Neurology|
|         3| 50|Orthopedics|
|         4| 20| Cardiology|
|         5| 15|  Neurology|
+----------+---+-----------+

+----------+---+-----------+------------+
|patient_id|age| department|age_category|
+----------+---+-----------+------------+
|         1| 34| Cardiology|       adult|
|         2| 70|  Neurology|      senior|
|         3| 50|Orthopedics|       adult|
|         4| 20| Cardiology|       adult|
|         5| 15|  Neurology|       minor|
+----------+---+-----------+------------+



In [66]:
import numpy as np
import pandas as pd

# Données avec des valeurs manquantes
df = pd.DataFrame(
    {
        'patient_id': [1, 2, 3, 4, 5],
        'age': [34, np.nan, 50, np.nan, 15],
        'department': ['Cardiology', 'Neurology', 'Orthopedics', np.nan, 'Neurology'],
    }
)
print(df)

# Remplacement des valeurs manquantes
df['age'].fillna(df['age'].mean(), inplace=True)
df['department'].fillna('Unknown', inplace=True)

df

   patient_id   age   department
0           1  34.0   Cardiology
1           2   NaN    Neurology
2           3  50.0  Orthopedics
3           4   NaN          NaN
4           5  15.0    Neurology


Unnamed: 0,patient_id,age,department
0,1,34.0,Cardiology
1,2,33.0,Neurology
2,3,50.0,Orthopedics
3,4,33.0,Unknown
4,5,15.0,Neurology


In [78]:
# Données sous forme de liste de tuples avec des valeurs manquantes (None pour null)
data = [
    (1, 34.0, 'Cardiology'),
    (2, float('nan'), 'Neurology'),
    (3, 50.0, 'Orthopedics'),
    (4, float('nan'), float('nan')),
    (5, 15.0, 'Neurology')
]

# Noms des colonnes
columns = ['patient_id', 'age', 'department']

# Création du DataFrame PySpark
df = spark.createDataFrame(data, columns)

# Calcul de la moyenne de la colonne 'age', en ignorant les NaN
mean_age = df.select(F.avg(F.col('age'))).first()[0]

# Affichage de la moyenne des âges
print(f"La moyenne des âges en ignorant les NaN est : {mean_age}")




La moyenne des âges en ignorant les NaN est : nan
