# Load Data from File

In [None]:
from pyspark.sql.types import *


# Read the CSV file as plain text lines and split every line at the commas
raw_data = sc.textFile('/user/cloudera/data/bike-sharing/hour_nohead.csv')
column_data = raw_data.map(lambda line: line.split(','))

# Column names in file order, paired with the type each column is cast to
# once the data is loaded. All columns are first read as nullable strings.
column_types = [
    ('row_id', 'int'),
    ('date', 'string'),
    ('season', 'int'),
    ('year', 'int'),
    ('month', 'int'),
    ('hour', 'int'),
    ('holiday', 'int'),
    ('weekday', 'int'),
    ('workingday', 'int'),
    ('weather', 'int'),
    ('temperature', 'double'),
    ('apparent_temperature', 'double'),
    ('humidity', 'double'),
    ('wind_speed', 'double'),
    ('casual', 'int'),
    ('registered', 'int'),
    ('counter', 'int'),
]

schema = StructType(
    [StructField(col_name, StringType(), True) for col_name, _ in column_types]
)
structured_data = sqlContext.createDataFrame(column_data, schema)

# Turn the all-string DataFrame into one with properly typed columns
data = structured_data.select(
    *[structured_data[col_name].cast(col_type)
      for col_name, col_type in column_types]
)

# Prepare Data

In [None]:
from pyspark.sql.functions import *


# Columns that were loaded as int and are converted to double here
int_cols = ['season', 'year', 'month', 'hour',
            'holiday', 'weekday', 'workingday', 'weather']
# Columns that are already double and are passed through unchanged
double_cols = ['temperature', 'apparent_temperature', 'humidity', 'wind_speed']
# Rent counters, also converted to double
count_cols = ['casual', 'registered', 'counter']

# Keep the date itself and add 'ts', the date parsed with pattern yyyy-MM-dd
# into a Unix timestamp. The column order below must stay as it is.
selected = [data['date'], unix_timestamp(data['date'], "yyyy-MM-dd").alias('ts')]
selected += [data[c].cast("double") for c in int_cols]
selected += [data[c] for c in double_cols]
selected += [data[c].cast("double") for c in count_cols]

ddata = data.select(*selected)

# Make some Pictures

First we need to import matplotlib.pyplot and also make all plots appear inline in the notebook

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

## Make a Plot of Rents per Day
The original data contains rents per hour, but we want to plot the rents per day, so the data has to be aggregated first.

In [None]:
# Generate Pandas DataFrame with summed data per day
# TODO (exercise): replace the placeholder below. The plot call at the end of
# this cell reads the columns 'ts' and 'sum(counter)' from `pdf`.
pdf = ...

# Open a wide figure and plot the summed counter against the timestamp
plt.figure(figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(pdf['ts'],pdf['sum(counter)'])

In [None]:
# Now only look at casual renters
# TODO (exercise): replace the placeholder below. The plot call at the end of
# this cell reads the columns 'ts' and 'sum(casual)' from `pdf`.
pdf = ...

# Open a wide figure and plot the summed casual rents against the timestamp
plt.figure(figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(pdf['ts'],pdf['sum(casual)'])

In [None]:
# Now only look at registered renters
# TODO (exercise): replace the placeholder below. The plot call at the end of
# this cell reads the columns 'ts' and 'sum(registered)' from `pdf`.
pdf = ...

# Open a wide figure and plot the summed registered rents against the timestamp
plt.figure(figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(pdf['ts'],pdf['sum(registered)'])    

# Initial Statistics

Of course we are interested in some initial statistics on all columns.

In [None]:
# Take the schema of ddata so that the type of every column can be inspected.
# Note that this rebinds the name `schema` used when loading the data.
schema = ddata.schema

for field in schema.fields:
    # Print statistics for field if field is Double Type
    # TODO (exercise): the loop body still has to be written; as long as it
    # contains only comments, this cell does not parse.

# Extract Vectors for Regression

Spark ML needs a special data type (Vector) for most operations. So we need to transform columns of interest into that special data type.

A Vector can be created from a double Array via

    from pyspark.mllib.linalg import Vectors
    Vectors.dense([1.0,2.0,3.0])

In [None]:
def extract_vector(row, cols):
    pass

print extract_vector(Row('name','age')('Bob',23), [1])

## Transform DataFrame

Now that we have extract_vector, we can use it in order to extract the relevant features from our DataFrame

In [None]:
# Use the following columns (positions inside each record of ddata)
cols = [1,2,3,4,5,6,7,8,9,10,11,12,13]

# Transform all records of ddata into pairs [features, counter]
# counter can be found in column row[16]
# TODO (exercise): replace the placeholder below with the transformed RDD
rdd = ...

# Now create new DataFrame with the two columns 'features' and 'counter'
features_labels = sqlContext.createDataFrame(rdd, ['features','counter'])

# Peek inside, convert first 10 rows to Pandas

# Split Data into Training and Test Set

In [None]:
# TODO (exercise): replace the placeholder with a split of the data into
# two parts, a training set and a test set
train_data, test_data = ...
# Print the number of records in each of the two sets
print train_data.count()
print test_data.count()

# Perform Linear Regression

In [None]:
from pyspark.ml.regression import *

### Peek into the Model

Let us have a look at the coefficients and at the intercept

# Perform Prediction

Predict new Data by applying the model to the test data

# Evaluate Model

# Use VectorAssembler

Manual feature extraction (i.e. creation of the Vector) is a little bit tedious and not very comfortable. But luckily, there is a valuable helper called VectorAssembler.

We use it to automatically extract the columns

    season, year, month, hour, holiday, weekday, workingday, weather, 
    temperature, apparent_temperature, humidity, wind_speed
    
into the new output column 'features'

## Split Train and Test Data

Since we found an easier way to generate features, we first split the incoming data and then apply the VectorAssembler.

In [None]:
train_data, test_data = ddata.randomSplit([0.8,0.2], seed=0)
print train_data.count()
print test_data.count()

## Perform Regression

1. Apply VectorAssembler
2. Perform Fitting

In [None]:
# TODO (exercise): create the VectorAssembler that collects the feature
# columns listed above into the output column 'features'
asm = ...
# TODO (exercise): create the regression
regression = ...
# TODO (exercise): perform the fitting (presumably on train_data) and keep
# the resulting model
model = ...

## Predict

Make predictions from test data and print some results

In [None]:
# TODO (exercise): this cell is incomplete, only a stray `p` is left here.
# The plotting cell further down reads a DataFrame named `prediction` with
# the columns 'ts', 'counter' and 'prediction'.
p

## Evaluation

Finally, let's evaluate the prediction.

# Make New Pictures of Regression

In [None]:
# Sum the actual and the predicted rents per timestamp, ordered by timestamp,
# and fetch the aggregated result as a Pandas DataFrame
per_ts = prediction.groupBy('ts').agg({'counter': 'sum', 'prediction': 'sum'})
tmp = per_ts.orderBy('ts')
pdf = tmp.toPandas()

# Smallest and largest timestamp among the predictions, used as x-axis limits.
# NOTE: min and max are called with column names here, i.e. they are the
# Spark SQL functions from the wildcard import, not the Python builtins.
ts_bounds = prediction.agg(min('ts'), max('ts')).collect()[0]
min_ts, max_ts = ts_bounds

# Draw both curves into the same figure: actual counter first, then prediction
plt.figure(figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k', tight_layout=True)
for series in ('sum(counter)', 'sum(prediction)'):
    plt.plot(pdf['ts'], pdf[series])

axes = plt.gca()
axes.set_xlim([min_ts, max_ts])