In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=312fe6025943775ca52cdd0a801002ee77b3cc6bd2a68bb4a37b2da5e505524e
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col




In [None]:
# Create the Spark session used by every cell below, or reuse the one that is
# already active in this kernel. The application is registered as "clean".
spark = (
    SparkSession.builder
    .appName("clean")
    .getOrCreate()
)

In [None]:
# Load the world-cities file as CSV, taking column names from its first line.
# No schema is supplied or inferred here, so the columns are loaded as text.
df = spark.read.csv("worldcitiespop.txt", header=True)



In [None]:
# Replace the Population column with a 64-bit integer version of itself so
# that later aggregations and sorts operate on numbers rather than on text.
df = df.withColumn(
    "Population",
    col("Population").cast("bigint"),
)


In [None]:
# Keep only the rows whose Population is still non-null once cast to bigint.
# The cast is repeated inside the predicate, so this cell does not depend on
# the column having been converted beforehand.
has_population = col("Population").cast("bigint").isNotNull()
cleaned_df = df.where(has_population)


In [None]:
# Smallest Population value among the cleaned rows. The aggregate yields a
# single row with a single column, hence the [0] on the first row.
min_population = cleaned_df.agg({"Population": "min"}).head()[0]
print("Population minimale: {}".format(min_population))

Population minimale: 10


In [None]:
# Largest Population value among the cleaned rows. The aggregate yields a
# single row with a single column, hence the [0] on the first row.
max_population = cleaned_df.agg({"Population": "max"}).head()[0]
print("Population max: {}".format(max_population))

Population max: 9998


In [None]:
# Sum of the Population column over all cleaned rows. The aggregate yields a
# single row with a single column, hence the [0] on the first row.
sum_population = cleaned_df.agg({"Population": "sum"}).head()[0]
print("Population total: {}".format(sum_population))

Population total: 2289584999.0


In [None]:
# Mean of the Population column over all cleaned rows. The aggregate yields a
# single row with a single column, hence the [0] on the first row.
average_population = cleaned_df.selectExpr("avg(Population)").first()[0]
# The label previously read "Population total" (copied from the sum cell),
# which mislabelled the average as a total.
print("Population moyenne: {}".format(average_population))

Population total: 47719.57063359733


In [None]:
from pyspark.sql.functions import log10, floor

In [None]:
# Assign each city to a class equal to floor(log10(Population)): class k
# groups the populations that lie between 10^k (inclusive) and 10^(k+1).
histogram_df = cleaned_df.withColumn(
    "log_population",
    floor(log10(col("Population"))),
)

# Count the cities in every class and bring the (small) result back to the
# driver, ordered by class.
histogram_result = (
    histogram_df
    .groupBy("log_population")
    .count()
    .sort("log_population")
    .collect()
)

print("Histogramme de fréquences des populations des villes (échelle logarithmique) :")
for bucket in histogram_result:
    print("Classe {}: {} villes".format(bucket["log_population"], bucket["count"]))


Histogramme de fréquences des populations des villes (échelle logarithmique) :
Classe 0: 5 villes
Classe 1: 174 villes
Classe 2: 2187 villes
Classe 3: 20537 villes
Classe 4: 21550 villes
Classe 5: 3248 villes
Classe 6: 269 villes
Classe 7: 10 villes


In [None]:
# Names of the ten cities with the largest Population: sort in descending
# order, keep the first ten rows, and collect only the City column.
top_10_cities = (
    df
    .orderBy(col("Population").desc())
    .limit(10)
    .select("City")
    .collect()
)


In [None]:
# Print a heading, then one city name per line in the order they were collected.
print("\nTop 10 villes ayant la population la plus importante :")
for ranked_city in top_10_cities:
    print(ranked_city["City"])


Top 10 villes ayant la population la plus importante :
tokyo
shanghai
bombay
karachi
delhi
new delhi
manila
moscow
seoul
sao paulo


In [None]:
# Show the first row of the cleaned data as a quick sanity check.
cleaned_df.first()

Row(Country='ad', City='andorra la vella', AccentCity='Andorra la Vella', Region='07', Population='20430', Latitude='42.5', Longitude='1.5166667')

In [None]:
# Look up every French row named "villepinte" (at most ten are returned).
# This queries the uncleaned `df`, so rows whose Population is null are
# included in the result.
# The predicate is built with `col(...)` so it resolves against `df` itself;
# the previous version indexed `df` with columns taken from `cleaned_df`,
# which only worked because both frames share the same lineage.
is_villepinte_fr = (col("Country") == "fr") & (col("City") == "villepinte")
df.filter(is_villepinte_fr).head(10)

[Row(Country='fr', City='villepinte', AccentCity='Villepinte', Region='A8', Population=36557, Latitude='48.962034', Longitude='2.532534'),
 Row(Country='fr', City='villepinte', AccentCity='Villepinte', Region='A9', Population=None, Latitude='43.282129', Longitude='2.087603')]