In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# by path from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=0df33e8989bbeda9c0212a78957211b7c3ee945450d3b16fd7281dbeb4bcf415
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [32]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [33]:
# Location of the cruise-ship CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/cruise_ship_info.csv'

# Obtain a Spark session for this notebook: getOrCreate() returns the active
# session if one exists, otherwise it starts one under this application name.
spark = (
    SparkSession.builder
    .appName('CruiseShipCrewPrediction')
    .getOrCreate()
)

In [34]:
# Read the CSV into a Spark DataFrame: the first row supplies the column names
# and Spark infers each column's type from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

# Preview the first 10 rows as a sanity check on the parsed schema.
data.show(n=10)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [35]:
# Convert the string column 'Cruise_line' into a numeric index column
# ('Cruise_line_index') so it can be placed in the feature vector later.
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Cruise_line_index',
)

# Learn the label -> index mapping from the full dataset, then apply it.
indexer_model = indexer.fit(data)
data_indexed = indexer_model.transform(data)

# Show the original name next to its assigned index for the first 10 rows.
data_indexed.select('Cruise_line', 'Cruise_line_index').show(n=10)

+-----------+-----------------+
|Cruise_line|Cruise_line_index|
+-----------+-----------------+
|    Azamara|             16.0|
|    Azamara|             16.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
|   Carnival|              1.0|
+-----------+-----------------+
only showing top 10 rows



In [36]:
# Columns combined into the single 'features' vector used by the regressor.
# NOTE(review): 'Cruise_line_index' is a category index passed in as a plain
# number, so a linear model will treat cruise lines as if they were ordered.
# Consider one-hot encoding it instead — confirm whether this is intended.
feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'Cruise_line_index',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Append the assembled 'features' column to the indexed data.
data_vector = assembler.transform(data_indexed)

In [37]:
# Randomly partition the rows into a training set (weight 0.8) and a test set
# (weight 0.2); the seed is fixed so the split can be repeated.
train_data, test_data = data_vector.randomSplit(
    weights=[0.8, 0.2],
    seed=43,
)

In [38]:
# Configure a linear regression that predicts 'crew' from the assembled
# 'features' vector (all other settings left at their defaults).
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

# Fit on the training split only; the test split is kept for evaluation.
lr_model = lr.fit(train_data)

In [39]:
# Score the fitted model on the held-out test split and report its error
# (RMSE) and goodness of fit (R-squared).
test_results = lr_model.evaluate(test_data)
print(
    f'Root Mean Squared Error (RMSE): {test_results.rootMeanSquaredError}',
    f'R-squared: {test_results.r2}',
    sep='\n',
)

# Report the learned coefficients (one per assembled feature) and the intercept.
print(
    f'Coefficients: {lr_model.coefficients}',
    f'Intercept: {lr_model.intercept}',
    sep='\n',
)

Root Mean Squared Error (RMSE): 0.6490677370675212
R-squared: 0.9581926929153465
Coefficients: [-0.014011543948730132,0.01244702646449247,-0.15209294436078563,0.39745655465491214,0.8521980123383476,-0.008894210125693029,0.05238342789726147]
Intercept: -0.7323773303044169


In [40]:
# Apply the fitted model to the test split; this adds a 'prediction' column.
predictions = lr_model.transform(test_data)

# Display the feature vector, the actual crew value and the predicted value
# side by side (show() prints the first 20 rows by default).
comparison = predictions.select('features', 'crew', 'prediction')
comparison.show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[12.0,138.0,31.14...|11.85|13.009588023833379|
|[12.0,58.6,15.66,...|  7.0| 7.428719483783312|
|[29.0,45.0,11.78,...|  5.2| 5.222529685382414|
|[19.0,16.8,2.96,5...|  2.1|2.1358803532878015|
|[25.0,34.25,10.52...|  4.7| 4.590484538634631|
|[16.0,77.499,19.5...|  9.0| 9.143875838250462|
|[13.0,138.0,31.14...|11.76| 12.99557647988465|
|[23.0,70.367,20.5...|  9.2|  8.54983797773618|
|[10.0,105.0,27.2,...|10.68| 11.30928593473789|
|[15.0,108.806,26....| 11.1|11.048274275780775|
|[18.0,70.367,20.5...|  9.2| 8.608401802399953|
|[9.0,90.09,25.01,...| 8.69| 9.285235711914204|
|[6.0,30.276999999...| 3.55| 4.349967597452733|
|[21.0,10.0,2.08,4...|  1.6|1.7223263281785761|
|[6.0,158.0,43.7,1...| 13.6|13.993057742639639|
|[21.0,73.941,27.4...| 8.22| 8.991540800374715|
|[10.0,77.0,20.16,...|  9.0| 8.810290776283576|
|[24.0,40.05300000...|  7.5| 6.581443231

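## According to the prediction results, the model performs well on the held-out test set (RMSE ≈ 0.65, R² ≈ 0.96), and the predicted crew values closely track the actual ones :)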