In [21]:
import findspark

findspark.init("/usr/local/spark")

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *

In [22]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Linear Regression Model") \
    .config("spark.executor.memory", "1gb") \
    .getOrCreate()

sc = spark.sparkContext

In [23]:
# EXERCISE 3

firstNames = sc.parallelize([(1, 'Aaron'),(2, 'Abdi'),(3, 'Bart'),(4, 'Calvin'),(5, 'Debbie')])
lastNames = sc.parallelize([(1, 'Armin'), (2, 'Hulu'), (2, 'Gerd'), (3, 'Polo'), (4, 'Klein'), (5, 'Bender')])

hobbies = sc.parallelize([
    ('Sport', 'Tennis'),
    ('Sport', 'Football'),
    ('Entertainment', 'Gaming'),
    ('Music', 'Guitar'),
    ('Music', 'Piano')
])

numbers1 = sc.parallelize([12, 5, 900, 1, 3, 231, 134, 2])
numbers2 = sc.parallelize([12, 1, 89, 234, 21, 12, 2])

In [24]:
firstNames.join(lastNames).collect()

[(2, ('Abdi', 'Hulu')),
 (2, ('Abdi', 'Gerd')),
 (4, ('Calvin', 'Klein')),
 (1, ('Aaron', 'Armin')),
 (3, ('Bart', 'Polo')),
 (5, ('Debbie', 'Bender'))]

In [25]:
hobbies.groupByKey().mapValues(list).collect()

[('Sport', ['Tennis', 'Football']),
 ('Entertainment', ['Gaming']),
 ('Music', ['Guitar', 'Piano'])]

In [26]:
numbers1.intersection(numbers2).collect()

[12, 2, 1]

In [27]:
hobbies.first()

('Sport', 'Tennis')

In [28]:
lastNames.count()

6

In [29]:
numbers2.reduce(lambda x,y: x + y)

371

In [30]:
# EXERCISE 4

# Load in the data
rdd = sc.textFile('cal_housing.data')

# Load in the header
header = sc.textFile('cal_housing.domain')
df = rdd \
    .map(lambda line: line.split(',')) \
    .map(lambda line: Row(
        longitude=line[0], 
        latitude=line[1], 
        housingMedianAge=line[2],
        totalRooms=line[3],
        totalBedRooms=line[4],
        population=line[5], 
        households=line[6],
        medianIncome=line[7],
        medianHouseValue=line[8])).toDF()

def convertColumns(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df

columns = [
    'households',
    'housingMedianAge',
    'latitude',
    'longitude',
    'medianHouseValue',
    'medianIncome',
    'population',
    'totalBedRooms',
    'totalRooms'
]

df = convertColumns(df, columns, FloatType())

In [31]:
# Compute latitude of the northernmost household from California
df.agg({"latitude": "max"}).first()

Row(max(latitude)=41.95000076293945)

In [32]:
# Most common household size (all entries are equally common)
df.groupBy('households').count().sort("households", ascending=False).show(10)

+----------+-----+
|households|count|
+----------+-----+
|    6082.0|    1|
|    5358.0|    1|
|    5189.0|    1|
|    5050.0|    1|
|    4930.0|    1|
|    4855.0|    1|
|    4769.0|    1|
|    4616.0|    1|
|    4490.0|    1|
|    4372.0|    1|
+----------+-----+
only showing top 10 rows



In [33]:
# Highest ratio of bedroom per population in the dataset
df.select((df.totalBedRooms / df.population).alias("ratio")).agg({"ratio": "max"}).first()

Row(max(ratio)=14.194444444444445)