In [None]:
from pyspark.sql import SparkSession


# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("TitanicAnalysis").getOrCreate()

# Load the Titanic passenger CSV from HDFS. The first row supplies the column
# names, and column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://namenode:9000/titanic_lab/titanic.csv")
)

# Preview the first five rows.
df.show(5)

# Print the inferred column names and types.
df.printSchema()


+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|gender| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [7]:
# Count, per gender, the passengers whose Cabin value is missing.
(
    df.where(df["Cabin"].isNull())
    .groupBy("gender")
    .count()
    .show()
)


+------+-----+
|gender|count|
+------+-----+
|female|  217|
|  male|  470|
+------+-----+



In [9]:
from pyspark.sql.functions import avg

# Mean of the age column over the whole DataFrame; avg() skips null values.
df.agg(avg("age")).show()

+-----------------+
|         avg(age)|
+-----------------+
|29.69911764705882|
+-----------------+



In [10]:
# Pull the mean age back to the driver as a plain Python value
# (single-row, single-column result).
avg_age = df.agg(avg("age")).first()[0]

# Replace missing ages with that mean; all other columns are left untouched.
df = df.fillna({"age": avg_age})

In [13]:
# Persist the current DataFrame to HDFS as CSV, replacing any previous output
# at the target path.
df.write.csv(
    "hdfs://namenode:9000/depi_folder/titanic_processed",
    mode="overwrite",
)

In [15]:
# Number of passengers for each value of the survived column.
(
    df.groupBy("survived")
    .count()
    .show()
)

+--------+-----+
|survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [16]:
# Passenger count per Embarked value, largest group first, showing at most
# five rows.
(
    df.groupBy("Embarked")
    .count()
    .sort("count", ascending=False)
    .show(5)
)

+--------+-----+
|Embarked|count|
+--------+-----+
|       S|  644|
|       C|  168|
|       Q|   77|
|    NULL|    2|
+--------+-----+



In [18]:
from pyspark.sql.functions import col

# Survival rate per passenger class, as a percentage: the mean of the
# Survived column within each class, scaled by 100.
(
    df.groupBy("pclass")
    .agg((avg(col("Survived")) * 100).alias("survivalRate"))
    .show(5)
)

+------+------------------+
|pclass|      survivalRate|
+------+------------------+
|     1| 62.96296296296296|
|     3|24.236252545824847|
|     2| 47.28260869565217|
+------+------------------+



In [19]:
from pyspark.sql import functions as F

# Fare summary over all passengers: highest, lowest and mean ticket price.
#
# The aggregates are reached through the `F` namespace instead of being
# imported by bare name: `from pyspark.sql.functions import max, min` rebinds
# Python's built-in max()/min() for every later cell in the kernel, and using
# F.avg also removes this cell's reliance on `avg` having been imported by an
# earlier cell.
df.select(F.max("Fare"), F.min("Fare"), F.avg("Fare")).show(5)

+---------+---------+----------------+
|max(Fare)|min(Fare)|       avg(Fare)|
+---------+---------+----------------+
| 512.3292|      0.0|32.2042079685746|
+---------+---------+----------------+



In [20]:
from pyspark.sql.functions import when

# Derive a categorical AgeGroup column from Age.
#
# The "60+" band is written as an explicit condition rather than a catch-all
# otherwise(): a when() chain without otherwise() yields null for rows that
# match no condition, so a row whose Age is null gets a null AgeGroup instead
# of being silently counted as "60+". For non-null ages the labels are the
# same as before.
df = df.withColumn(
    "AgeGroup",
    when(df.Age <= 18, "0-18")
    .when((df.Age > 18) & (df.Age <= 35), "19-35")
    .when((df.Age > 35) & (df.Age <= 60), "36-60")
    .when(df.Age > 60, "60+"),
)

# Number of passengers in each age band (row order is not guaranteed).
df.groupBy("AgeGroup").count().show()

+--------+-----+
|AgeGroup|count|
+--------+-----+
|   19-35|  535|
|     60+|   22|
|   36-60|  195|
|    0-18|  139|
+--------+-----+

