In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession first and take the SparkContext from it.
# Constructing SparkContext() directly fails with "Cannot run multiple
# SparkContexts at once" when this cell is re-run in a live kernel, whereas
# getOrCreate() is safe to call repeatedly. Creating the session first also
# lets the application name below apply to the underlying context.
spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    # NOTE(review): this looks like the placeholder option from the Spark
    # quick-start example; confirm whether it is needed before removing it.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Kept under the original name so cells that use `sc` keep working.
sc = spark.sparkContext

In [5]:
# Load the Titanic passenger CSV: the first row supplies the column names and
# column types are inferred from the data instead of defaulting to strings.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("titanic.csv")
)

In [6]:
# Preview the data: show() prints the first 20 rows and shortens long strings
# (these are its defaults, spelled out here for the reader).
df.show(n=20, truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [7]:
# List the column names of the loaded DataFrame (displayed as the cell result).
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [12]:
# Split passengers by whether their fare exceeds 50 and count each group.
(
    df.groupBy(df["Fare"] > 50)
    .count()
    .show()
)

+-----------+-----+
|(Fare > 50)|count|
+-----------+-----+
|       true|  160|
|      false|  731|
+-----------+-----+



In [14]:
# Keep only passengers older than 50; rows whose Age is null do not satisfy
# the comparison and are therefore excluded as well.
df.where(df["Age"] > 50).show()

+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----+--------+
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      17463| 51.8625|  E46|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|     113783|   26.55| C103|       S|
|         16|       1|     2|Hewlett, Mrs. (Ma...|female|55.0|    0|    0|     248706|    16.0| null|       S|
|         34|       0|     2|Wheadon, Mr. Edwa...|  male|66.0|    0|    0| C.A. 24579|    10.5| null|       S|
|         55|       0|     1|Ostby, Mr. Engelh...|  male|65.0|    0|    1|     113509| 61.9792|  B30|       C|
|         95|       0|     3|   Coxon, Mr. Daniel|  male|59.0|    0|    0|     364500|    7.25| null|       S|
|

In [15]:
# describe() only builds a summary DataFrame. As the last expression of the
# cell it displays that DataFrame's repr (column names and types), not the
# statistics themselves. Chain .show() onto it to print the summary rows.
df.describe()

DataFrame[summary: string, PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

In [16]:
# NOTE: importing min/max by name rebinds Python's built-in min() and max()
# to the Spark column functions for every cell that runs after this one.
from pyspark.sql.functions import mean, min, max

# Average, lowest and highest passenger age in a single aggregation.
df.select(mean("Age"), min("Age"), max("Age")).show()

+-----------------+--------+--------+
|         avg(Age)|min(Age)|max(Age)|
+-----------------+--------+--------+
|29.69911764705882|    0.42|    80.0|
+-----------------+--------+--------+



In [17]:
# Average, lowest and highest ticket fare in a single aggregation.
df.select(mean("Fare"), min("Fare"), max("Fare")).show()

+----------------+---------+---------+
|       avg(Fare)|min(Fare)|max(Fare)|
+----------------+---------+---------+
|32.2042079685746|      0.0| 512.3292|
+----------------+---------+---------+



## Grab All Values of the Name Column

In [49]:
import pyspark.sql.functions as f

# collect_list aggregates the whole Name column into one array, so the result
# is a single row whose only field is the Python list of names.
names_list = df.agg(f.collect_list("Name")).head()[0]

# Display the first collected name as the cell result.
first_name = names_list[0]
first_name

'Braund, Mr. Owen Harris'

## Using Reduce

In [50]:
from functools import reduce

# first_name is a string, so reduce walks it character by character and
# concatenates the characters back into the same string.
print(reduce(lambda joined, ch: joined + ch, first_name))

Braund, Mr. Owen Harris


In [52]:
# Pull the Age column into a Python list via a single-row aggregation.
age_list = df.agg(f.collect_list("Age")).head()[0]

# Display the first two collected ages as the cell result.
top_two = age_list[:2]
top_two

[22.0, 38.0]

In [53]:
# Fold the two ages together with +, i.e. print their sum.
print(reduce(lambda total, age: total + age, top_two))

60.0
