In [14]:
import findspark

findspark.init("/usr/local/spark")

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *

In [15]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Linear Regression Model") \
    .config("spark.executor.memory", "1gb") \
    .getOrCreate()

sc = spark.sparkContext

In [68]:
# Load in the data
rdd = sc.textFile('cal_housing.data')

# Load in the header
header = sc.textFile('cal_housing.domain')
df = rdd \
    .map(lambda line: line.split(',')) \
    .map(lambda line: Row(
        longitude=line[0], 
        latitude=line[1], 
        housingMedianAge=line[2],
        totalRooms=line[3],
        totalBedRooms=line[4],
        population=line[5], 
        households=line[6],
        medianIncome=line[7],
        medianHouseValue=line[8])).toDF()

def convertColumns(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df

columns = [
    'households',
    'housingMedianAge',
    'latitude',
    'longitude',
    'medianHouseValue',
    'medianIncome',
    'population',
    'totalBedRooms',
    'totalRooms'
]

df = convertColumns(df, columns, FloatType())

# Compute latitude of the northernmost household from California
df.agg({"latitude": "max"}).first()

# Most common household size (all entries are equally common)
df.groupBy('households').count().sort("households", ascending=False).show(10)

# Highest ratio of bedroom per population in the dataset
df.select((df.totalBedRooms / df.population).alias("ratio")).agg({"ratio": "max"}).first()

+----------+-----+
|households|count|
+----------+-----+
|    6082.0|    1|
|    5358.0|    1|
|    5189.0|    1|
|    5050.0|    1|
|    4930.0|    1|
|    4855.0|    1|
|    4769.0|    1|
|    4616.0|    1|
|    4490.0|    1|
|    4372.0|    1|
+----------+-----+
only showing top 10 rows



Row(max(ratio)=14.194444444444445)