In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/home/yacine/moocs/UW_ML_Case_Study/w1/people-example.csv')
df.head()

Unnamed: 0,First Name,Last Name,Country,age
0,Bob,Smith,United States,24
1,Alice,Williams,Canada,23
2,Malcolm,Jone,England,22
3,Felix,Brown,USA,23
4,Alex,Cooper,Poland,23


In [3]:
df['First Name']

0        Bob
1      Alice
2    Malcolm
3      Felix
4       Alex
5        Tod
6      Derek
Name: First Name, dtype: object

In [97]:
spark_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(path='/home/yacine/moocs/UW_ML_Case_Study/w1/people-example.csv', inferSchema=False)
spark_df.head()

Row(First Name=u'Bob', Last Name=u'Smith', Country=u'United States', age=u'24')

# show() is much better than head()

In [98]:
spark_df.show()

spark_df.show(3)

+----------+---------+-------------+---+
|First Name|Last Name|      Country|age|
+----------+---------+-------------+---+
|       Bob|    Smith|United States| 24|
|     Alice| Williams|       Canada| 23|
|   Malcolm|     Jone|      England| 22|
|     Felix|    Brown|          USA| 23|
|      Alex|   Cooper|       Poland| 23|
|       Tod| Campbell|United States| 22|
|     Derek|     Ward|  Switzerland| 25|
+----------+---------+-------------+---+

+----------+---------+-------------+---+
|First Name|Last Name|      Country|age|
+----------+---------+-------------+---+
|       Bob|    Smith|United States| 24|
|     Alice| Williams|       Canada| 23|
|   Malcolm|     Jone|      England| 22|
+----------+---------+-------------+---+
only showing top 3 rows



In [99]:
spark_df.head()

Row(First Name=u'Bob', Last Name=u'Smith', Country=u'United States', age=u'24')

# Spark DF has no 'tail()'

In [100]:
## spark_df.tail()

# By defalut column types are String
# This can be changed by using
## inferSchema=True in "load"
## cast the column using "astype"

In [101]:
spark_df.columns
spark_df.printSchema()

root
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- age: string (nullable = true)



In [102]:
spark_df.schema
spark_df = spark_df.withColumn('age', spark_df['age'].astype('int'))
spark_df.show()
spark_df.schema

+----------+---------+-------------+---+
|First Name|Last Name|      Country|age|
+----------+---------+-------------+---+
|       Bob|    Smith|United States| 24|
|     Alice| Williams|       Canada| 23|
|   Malcolm|     Jone|      England| 22|
|     Felix|    Brown|          USA| 23|
|      Alex|   Cooper|       Poland| 23|
|       Tod| Campbell|United States| 22|
|     Derek|     Ward|  Switzerland| 25|
+----------+---------+-------------+---+



StructType(List(StructField(First Name,StringType,true),StructField(Last Name,StringType,true),StructField(Country,StringType,true),StructField(age,IntegerType,true)))

In [103]:
df.describe()

Unnamed: 0,age
count,7.0
mean,23.142857
std,1.069045
min,22.0
25%,22.5
50%,23.0
75%,23.5
max,25.0


In [107]:
spark_df.describe().show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 7|
|   mean|23.142857142857142|
| stddev|0.9897433186107893|
|    min|                22|
|    max|                25|
+-------+------------------+



In [129]:
from pyspark.sql.functions import mean, min, max
spark_df.select([min('age'), mean('age'),max('age')]).show()

+--------+------------------+--------+
|min(age)|          avg(age)|max(age)|
+--------+------------------+--------+
|      22|23.142857142857142|      25|
+--------+------------------+--------+



In [140]:
a = spark_df.rdd.map(lambda e: e.age).mean()
print a

23.1428571429


# The equivalent of "apply"

In [157]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

transform_country = udf(lambda country: 'United States' if country == 'USA' else country, StringType())
spark_df.withColumn('Country', transform_country(spark_df['Country'])).show()

+----------+---------+-------------+---+
|First Name|Last Name|      Country|age|
+----------+---------+-------------+---+
|       Bob|    Smith|United States| 24|
|     Alice| Williams|       Canada| 23|
|   Malcolm|     Jone|      England| 22|
|     Felix|    Brown|United States| 23|
|      Alex|   Cooper|       Poland| 23|
|       Tod| Campbell|United States| 22|
|     Derek|     Ward|  Switzerland| 25|
+----------+---------+-------------+---+

