## Mini project
Predict the number of crew members required on a cruiseliner.

In [2]:
import findspark
findspark.init('/home/ubuntu/spark-3.2.0-bin-hadoop3.2')
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [3]:
DATA_PATH = '../data/Spark_for_Machine_Learning/Linear_Regression/'

In [4]:
spark = SparkSession.builder.appName('lr').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/27 15:14:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
data = spark.read.csv(DATA_PATH + 'cruise_ship_info.csv', inferSchema=True, header=True)

                                                                                

In [7]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [55]:
# Check for nans. None!
assert data.count()==data.na.drop().count()

In [70]:
# All the features are pretty close in scale. Probabily OK to avoid scaling for now.
data.select(['Age',
         'Tonnage',
         'passengers',
         'length',
         'cabins',
         'passenger_density']).describe().show()

+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+
|summary|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|
+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+
|  count|               158|               158|              158|              158|               158|              158|
|   mean|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|
| stddev| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|
|    min|                 4|             2.329|             0.66|             2.79|              0.33|             17.7|
|    max|                48|             220.0|             54.0|            11.82|              27.0|            71.43|
+-------+------------------+----

## Feature building with VectorAssembler

In [19]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

In [38]:
string_indexer = StringIndexer(inputCol='Cruise_line', outputCol='output')
sr_fit = string_indexer.fit(data.select('Cruise_line'))

ohe = OneHotEncoder(inputCol='output',outputCol='ohe_cruise_line')
ohe_fit = ohe.fit(sr_fit.transform(data.select('Cruise_line')))

In [41]:
processed_data = ohe_fit.transform(sr_fit.transform(data))

In [45]:
processed_data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'output',
 'ohe_cruise_line']

In [48]:
feats = ['Age',
         'Tonnage',
         'passengers',
         'length',
         'cabins',
         'passenger_density',
         'ohe_cruise_line']

assembler = VectorAssembler(inputCols=feats, outputCol='features')

In [49]:
output = assembler.transform(processed_data)

In [92]:
train_data, test_data = output.select(['features','crew']).randomSplit([0.7,0.3])

In [93]:
lr = LinearRegression(featuresCol='features', labelCol='crew', regParam=0.3)

In [94]:
lr_model = lr.fit(train_data)

In [95]:
test_results = lr_model.evaluate(test_data)

In [96]:
test_results.rootMeanSquaredError

0.8518323449892565

In [97]:
test_results.r2

0.9316308562213783

In [98]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| 0.41023786206403656|
| -0.6194615412331412|
| -0.8752598173479829|
| -0.3374104999439549|
| -0.3225801734693601|
|-0.29746460887420945|
|-0.26709436770893014|
|-0.34190924712629034|
| -1.4957817662743818|
|-0.08850795116183718|
|  1.4368866185429097|
| -0.8990229387502584|
| -1.0660028882238173|
| 0.06555187511516358|
|   -1.26889021575305|
| 0.14971298153195178|
| 0.01848263137760764|
|  1.0128209982168492|
|   0.315993709939443|
| -1.2086433942138672|
+--------------------+
only showing top 20 rows



In [99]:
lr_model.coefficients

DenseVector([-0.0152, 0.0267, 0.0587, 0.4963, 0.2492, 0.0064, -1.0885, 0.6574, -0.0535, -0.3481, 0.4759, -0.4422, 0.5535, -0.0919, -0.1282, 1.4871, -0.346, -0.2757, -0.5233, 0.267, 0.1452, -0.1731, -0.3013, 0.0, 0.0])

In [100]:
lr_model.intercept

-1.3554636384284837