In [5]:
import findspark;

In [8]:
from pyspark.sql import SparkSession

In [9]:
# Start (or reuse, via getOrCreate) the Spark session for this notebook,
# using the "local" master and the application name shown in the Spark UI.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regres Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# The SparkContext that backs the session above.
sc = spark.sparkContext

In [12]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# Legacy entry point, kept only so that any cell still referencing
# `sqlContext` keeps working; new code should use the `spark` session directly.
sqlContext = SQLContext(sc)

# Load the passenger data with Spark's built-in CSV reader instead of the
# external 'com.databricks.spark.csv' format name from the Spark 1.x era.
# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to infer column types from the data rather
# than reading every column as a string.
df = spark.read.csv('titanic.csv', header=True, inferSchema=True)

In [15]:
# Preview the loaded data: the first 20 rows with long strings truncated
# (these are show()'s default settings, spelled out explicitly).
df.show(n=20, truncate=True)

+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+
|Survived|Pclass|                Name|   Sex| Age|Siblings/Spouses Aboard|Parents/Children Aboard|   Fare|
+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+
|       0|     3|Mr. Owen Harris B...|  male|22.0|                      1|                      0|   7.25|
|       1|     1|Mrs. John Bradley...|female|38.0|                      1|                      0|71.2833|
|       1|     3|Miss. Laina Heikk...|female|26.0|                      0|                      0|  7.925|
|       1|     1|Mrs. Jacques Heat...|female|35.0|                      1|                      0|   53.1|
|       0|     3|Mr. William Henry...|  male|35.0|                      0|                      0|   8.05|
|       0|     3|     Mr. James Moran|  male|27.0|                      0|                      0| 8.4583|
|       0|     1|Mr. Timothy J McC...

In [30]:
from pyspark.sql import functions as F
# Share of each (Survived, Pclass) combination among passengers of the same
# sex: count(Survived, Sex, Pclass) / count(Sex), rounded to two decimals.
# NOTE(review): this is P(Survived, Pclass | Sex), so the rows for one sex sum
# to ~1 -- confirm that this, and not P(Survived | Sex, Pclass), is intended.
probability_df = (
    df.groupby(["Survived", "Sex", "Pclass"])
    # Rows per (Survived, Sex, Pclass) group: the numerator.
    .agg(F.count(F.lit(1)).alias("group_count"))
    # Rows per sex: the denominator, attached to every matching group row.
    .join(df.groupby("Sex").agg(F.count(F.lit(1)).alias("sex_count")), on="Sex")
    # The column is created with exactly the name selected below, so the
    # query does not depend on Spark's case-insensitive column resolution.
    .withColumn("Conditional_Probability", F.round(F.col("group_count") / F.col("sex_count"), 2))
    .select(["Survived", "Pclass", "Sex", "Conditional_Probability"])
    .sort(["Survived", "Sex", "Pclass"])
)
probability_df.show()

+--------+------+------+-----------------------+
|Survived|Pclass|   Sex|Conditional_Probability|
+--------+------+------+-----------------------+
|       0|     1|female|                   0.01|
|       0|     2|female|                   0.02|
|       0|     3|female|                   0.23|
|       0|     1|  male|                   0.13|
|       0|     2|  male|                   0.16|
|       0|     3|  male|                   0.52|
|       1|     1|female|                   0.29|
|       1|     2|female|                   0.22|
|       1|     3|female|                   0.23|
|       1|     1|  male|                   0.08|
|       1|     2|  male|                   0.03|
|       1|     3|  male|                   0.08|
+--------+------+------+-----------------------+



In [31]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

In [79]:
from pyspark.sql import functions as F
# Per-class averages; mean() with no arguments averages every numeric column,
# and only the fare average is used below.
fare_average = df.groupby("Pclass").mean()

# Keep just the average fare for each passenger class, ordered by class.
result = fare_average.select("Pclass", "avg(Fare)").sort("Pclass")
result.show()

+------+------------------+
|Pclass|         avg(Fare)|
+------+------------------+
|     1| 84.15468749999992|
|     2| 20.66218315217391|
|     3|13.707707392197129|
+------+------------------+

