# Linear Regression

Here our goal is to create a model to predict the number of crew members needed to staff a given cruise ship. We'll explore the data, perform some pre-processing, create a train-test split, and then evaluate a number of models including ridge and LASSO.

In [1]:
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook runs on machines
# other than the author's; the original absolute path is kept as the fallback
# so behavior is unchanged when SPARK_HOME is not set.
findspark.init(
    os.environ.get(
        "SPARK_HOME",
        "/home/bryan/Documents/Code/spark-2.4.5-bin-hadoop2.7",
    )
)

In [2]:
from pyspark.sql import SparkSession
# Obtain the session for this notebook (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName('lr_crew_size')
    .getOrCreate()
)

# EDA

In [3]:
# First CSV row holds the column names; column types are inferred from the values.
data = spark.read.csv(
    "cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Show the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
# Dropping every row that contains a missing value must leave the row count unchanged.
rows_total = data.count()
rows_complete = data.na.drop().count()
assert rows_total == rows_complete, "Check for missing data."

> ### There are no missing (null/NaN) values in the given data: dropping incomplete rows leaves the row count unchanged.

In [6]:
# Peek at the first three rows.
data.show(3)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 3 rows



In [7]:
# Ratio of distinct ship names to total rows, rounded to two decimals.
distinct_ship_names = data.select('Ship_name').distinct().count()
round(distinct_ship_names / data.count(), 2)

0.87

> ### Ship_name is essentially a unique value: the number of distinct ship names is 87% of the number of records.

In [8]:
# Ratio of distinct cruise lines to total rows, rounded to two decimals.
distinct_cruise_lines = data.select("Cruise_line").distinct().count()
round(distinct_cruise_lines / data.count(), 2)

0.13

> ### Cruise_line is much less distinct: the number of distinct cruise lines is only 13% of the number of records.

In [9]:
# Slice off the first column (Ship_name) and the last column (crew, the target);
# every column in between is kept as a candidate feature.
these_features = data.columns[1:-1] # grab features
these_features

['Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density']

### Here we'll leave out Ship_name since it's essentially a unique value and we won't use it in our initial models.

In [10]:
# The last column of the frame is the value to predict.
this_target = data.columns[-1] # the target
this_target

'crew'

# Train/test split & pre-processing

In [11]:
# Randomly partition the rows into ~80% train / ~20% test; the fixed seed keeps
# the partition repeatable.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=7)

In [12]:
# randomSplit assigns rows probabilistically, so the realised train share only
# approximates the requested 0.8. Accept a symmetric band around 0.8 instead of
# capping at exactly 0.8, which would reject a perfectly valid split that
# happened to land at or slightly above the nominal fraction.
total_rows = data.count()
train_rows = train_data.count()
assert 0.7 * total_rows < train_rows < 0.9 * total_rows, "Check output of randomSplit"

In [13]:
from pyspark.ml.feature import StringIndexer

> StringIndexer ~ LabelEncoder in sklearn

In [14]:
# Learn the Cruise_line -> index mapping from the training split only, then
# apply that same mapping to both splits. handleInvalid='keep' means a label
# not seen during fitting does not raise an error at transform time.
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Cruise_line_index',
    handleInvalid='keep',
)
fitted_indexer = indexer.fit(train_data)

train_data = fitted_indexer.transform(train_data)
test_data = fitted_indexer.transform(test_data)

In [15]:
# One row per cruise line with the index it was assigned, ordered by that index.
(
    train_data
    .groupBy("Cruise_line")
    .avg('Cruise_line_index')
    .sort("avg(Cruise_line_index)")
    .show()
)

+-----------------+----------------------+
|      Cruise_line|avg(Cruise_line_index)|
+-----------------+----------------------+
|         Carnival|                   0.0|
|         Princess|                   1.0|
|  Royal_Caribbean|                   2.0|
| Holland_American|                   3.0|
|        Norwegian|                   4.0|
|            Costa|                   5.0|
|        Celebrity|                   6.0|
|              MSC|                   7.0|
|             Star|                   8.0|
|              P&O|                   9.0|
|Regent_Seven_Seas|                  10.0|
|          Oceania|                  11.0|
|         Windstar|                  12.0|
|          Azamara|                  13.0|
|           Disney|                  14.0|
|        Silversea|                  15.0|
|         Seabourn|                  16.0|
|          Crystal|                  17.0|
+-----------------+----------------------+



> The above is a crude way of visualizing the mapping created by StringIndexer.

In [16]:
# Remove the raw string column from both splits now that it has been indexed.
train_data, test_data = (
    split.drop('Cruise_line') for split in (train_data, test_data)
)

In [17]:
# Test each split explicitly. The previous form,
# `'Cruise_line' not in train_data.columns and test_data.columns`, parses as
# `('Cruise_line' not in train_data.columns) and test_data.columns`, so it only
# checked that test_data had some columns, not that the column was gone from it.
assert (
    'Cruise_line' not in train_data.columns
    and 'Cruise_line' not in test_data.columns
), "Check 'Cruise_line' was dropped."

In [18]:
from pyspark.ml.feature import VectorAssembler

In [19]:
# these_features[0] is 'Cruise_line', which is no longer a column, so only the
# remaining columns are packed into the "features" vector.
# NOTE(review): 'Cruise_line_index' is not part of these inputs, so the indexed
# cruise line never reaches the models below — confirm this is intended.
model_input_columns = these_features[1:]
assembler = VectorAssembler(inputCols=model_input_columns, outputCol="features")

train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

# Baseline linear regression

In [20]:
# Inspect the first three training rows, including the assembled "features" column.
train_data.show(3)

+---------+---+-------+----------+------+------+-----------------+----+-----------------+--------------------+
|Ship_name|Age|Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_index|            features|
+---------+---+-------+----------+------+------+-----------------+----+-----------------+--------------------+
|  Allegra| 21|  28.43|      8.08|  6.16|   4.1|            35.19| 4.0|              5.0|[21.0,28.43,8.08,...|
|Amsterdam| 13|   61.0|      13.8|   7.8|  6.88|             44.2| 6.0|              3.0|[13.0,61.0,13.8,7...|
|  Arcadia|  9|   85.0|     19.68|  9.35|  9.84|            43.19|8.69|              9.0|[9.0,85.0,19.68,9...|
+---------+---+-------+----------+------+------+-----------------+----+-----------------+--------------------+
only showing top 3 rows



In [21]:
from pyspark.ml.regression import LinearRegression

In [22]:
# Baseline: linear regression with no regularization parameters set.
# Fitting an unregularized linear model is NOT recommended, especially on a
# small data set; it is here only as a reference point for the models below.
lr = LinearRegression(
    labelCol='crew',
    featuresCol='features',
    predictionCol='prediction',
)
lr_model = lr.fit(train_data)

In [23]:
def print_results(this_model):
    """
    Print the headline fit metrics stored on a fitted model's summary.

    The metrics are read from `this_model.summary`; in this notebook every
    model passed in was fitted on the training split, so the numbers describe
    the fit on the training data.

    Args:
        this_model - fitted model exposing a `summary` attribute with
            `meanAbsoluteError`, `r2` and `r2adj`.

    Returns:
        None - one line per metric is printed, each rounded to 5 decimals.
    """
    fit_summary = this_model.summary

    # (label printed, attribute read from the summary), in output order.
    metric_table = (
        ("MAE", "meanAbsoluteError"),
        ("r2", "r2"),
        ("adjusted r2", "r2adj"),
    )
    for label, attribute in metric_table:
        print("{}: {}".format(label, round(getattr(fit_summary, attribute), 5)))

In [24]:
# Report the baseline model's summary metrics.
print_results(lr_model)

MAE: 0.58346
r2: 0.94315
adjusted r2: 0.94019


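> This could be overfitting since there's no regularization. Note that these metrics come from the model's summary of its fit on the training data, so they cannot reveal overfitting on their own; scoring the model on `test_data` would be needed to confirm.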

# Ridge regression

In [25]:
# Ridge variant: same columns as the baseline, with a penalty of strength
# regParam=0.1 and the elastic-net mixing parameter set to 0.0.
ridge = LinearRegression(
    labelCol='crew',
    featuresCol='features',
    predictionCol='prediction',
    elasticNetParam=0.0,
    regParam=0.1,
)
ridge_model = ridge.fit(train_data)

print_results(ridge_model)

MAE: 0.61027
r2: 0.93661
adjusted r2: 0.9333


# LASSO regression

In [26]:
# LASSO variant: same columns and penalty strength (regParam=0.1) as the ridge
# cell, with the elastic-net mixing parameter set to 1.0.
lasso = LinearRegression(
    labelCol='crew',
    featuresCol='features',
    predictionCol='prediction',
    elasticNetParam=1.0,
    regParam=0.1,
)
lasso_model = lasso.fit(train_data)

print_results(lasso_model)

MAE: 0.6089
r2: 0.93752
adjusted r2: 0.93426


# END