In [1]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook (or reuse one that already exists);
# the application is registered under the name 'cruise'.
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

In [None]:
from google.colab import files
# Open Colab's file-upload prompt and keep its return value in `uploaded`.
# NOTE(review): the recorded output shows the file being saved as
# 'cruise_ship_info (2).csv', i.e. a copy with the original name already existed.
uploaded = files.upload()

Saving cruise_ship_info.csv to cruise_ship_info (2).csv


In [None]:
# Load the cruise-ship CSV: the first row is the header and column types are inferred.
# NOTE(review): the upload log above reports the new file was saved as
# 'cruise_ship_info (2).csv', so this path may point at an earlier upload — confirm
# it is the intended copy.
df = spark.read.csv(
    'cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Show the inferred column names and types to check that the numeric columns were parsed as numbers.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [None]:
# Print the first five rows one at a time, each followed by two blank lines.
for ship in df.head(5):
    print(f"{ship}\n\n")

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7)


Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1)


Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0)




In [None]:
# Count how many ships belong to each cruise line.
(
    df.groupby('Cruise_line')
    .count()
    .show()
)

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



# **Assess correlations between the features and the target**

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names; used to pick the candidate predictors for `crew` below.
df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [None]:
from scipy.stats import shapiro

# Candidate numeric predictors to test for normality.
feature_cols = ['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']

# Shapiro-Wilk outcome per column: {column name: (W statistic, p-value)}.
results = {}

for col in feature_cols:

    # Read from `df`, the frame loaded above; the previously referenced
    # `df_encoded` is never defined in this notebook and raised NameError on a
    # fresh kernel.
    # NOTE(review): only a ~10% sample of an already small dataset is tested;
    # consider running the test on the full column instead.
    sampled_data = df.select(col).dropna().sample(fraction=0.1, seed=42).toPandas()[col]

    # Skip columns whose sample is too small for the Shapiro-Wilk test.
    if len(sampled_data) < 3:
        print(f"{col}: data size is too small")
        continue

    stat, p_value = shapiro(sampled_data)
    results[col] = (stat, p_value)

    print(f"{col} - Shapiro-Wilk: {stat:.4f}, p-value: {p_value:.4f}")
    # p > 0.05: normality is not rejected at the 5% level.
    if p_value > 0.05:
        print(f"  => {col} follows a normal distribution.\n")
    else:
        print(f"  => {col} does not follow a normal distribution.\n")

Age - Shapiro-Wilk: 0.8101, p-value: 0.0016
  => Age dows not follow a normal distribution.

Tonnage - Shapiro-Wilk: 0.9726, p-value: 0.8277
  => Tonnage follows a normal distribution.

passengers - Shapiro-Wilk: 0.9550, p-value: 0.4784
  => passengers follows a normal distribution.

length - Shapiro-Wilk: 0.8481, p-value: 0.0062
  => length dows not follow a normal distribution.

cabins - Shapiro-Wilk: 0.9607, p-value: 0.5853
  => cabins follows a normal distribution.

passenger_density - Shapiro-Wilk: 0.8824, p-value: 0.0236
  => passenger_density dows not follow a normal distribution.



In [None]:
from scipy.stats import pearsonr, spearmanr

# Candidate predictors whose correlation with the target `crew` is measured.
features = ['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']

# Collect every needed column in a single Spark job instead of launching one
# select/collect per feature inside the loop.
features_pdf = df.select(*features, 'crew').toPandas()

for col in features:
    # Pairwise deletion: drop only the rows missing this feature or `crew`.
    pdf = features_pdf[[col, 'crew']].dropna()

    # Pearson measures linear association; Spearman measures monotonic (rank) association.
    pearson_corr, pearson_p = pearsonr(pdf[col], pdf['crew'])
    spearman_corr, spearman_p = spearmanr(pdf[col], pdf['crew'])

    print(f"{col}: Pearson={pearson_corr:.4f} (p={pearson_p:.4g}), Spearman={spearman_corr:.4f} (p={spearman_p:.4g})")

Age: Pearson=-0.5307 (p=7.411e-13), Spearman=-0.5904 (p=3.248e-16)
Tonnage: Pearson=0.9276 (p=1.386e-68), Spearman=0.9303 (p=7.683e-70)
passengers: Pearson=0.9152 (p=1.808e-63), Spearman=0.9126 (p=1.723e-62)
length: Pearson=0.8959 (p=7.868e-57), Spearman=0.8764 (p=2.237e-51)
cabins: Pearson=0.9508 (p=2.636e-81), Spearman=0.9414 (p=1.598e-75)
passenger_density: Pearson=-0.1555 (p=0.05105), Spearman=-0.0589 (p=0.4623)


# **Select predictors**

In [None]:
# Pack the selected predictor columns into one vector column named 'features'.
# passenger_density is left out of the inputs (presumably because of its weak
# correlation with `crew` reported above — confirm).
assembler = VectorAssembler(
    inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins'],
    outputCol='features',
)

In [None]:
# Apply the assembler to `df`. Despite its name, `assembler_fit` is the
# transformed DataFrame (original columns plus 'features'), not a fitted object.
assembler_fit = assembler.transform(df)

In [None]:
# Preview the assembled feature vectors next to the target column.
assembler_fit.select('features','crew').show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [None]:
# Keep only what the regression needs: the feature vector and the target.
final_df = assembler_fit.select('features', 'crew')

# **Develop a linear model**

In [None]:
# Random 70/30 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = final_df.randomSplit([0.7,0.3], seed=42)

In [None]:
# Summary statistics for the training split.
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 7.538818181818191|
| stddev|3.7889277929052527|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [None]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                48|
|   mean| 8.379375000000001|
| stddev|2.6843584805550207|
|    min|              3.55|
|    max|              13.6|
+-------+------------------+



In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Linear-regression estimator (not yet fitted): predicts 'crew' from the 'features' vector.
lr_model = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [None]:
# Fit the regression on the training split; `lr_model_fit` is the fitted model.
lr_model_fit = lr_model.fit(train_data)

In [None]:
# Evaluate the fitted model on the held-out test split and keep the summary for the metric cells below.
lr_model_test = lr_model_fit.evaluate(test_data)

In [None]:
# Test-set RMSE, read from the evaluation summary computed above rather than
# re-running evaluate() on the test split.
lr_model_test.rootMeanSquaredError

0.731867371193892

In [None]:
# Test-set R^2 from the same evaluation summary.
lr_model_test.r2

0.9240850824669404

# **Assess overfitting**

In [None]:
# Compare train and test metrics to gauge overfitting.
# This cell splits the raw `df` and assembles features after the split; the
# recorded test RMSE differs from the earlier run, so the partitions here are
# evidently not the same as those of the earlier `final_df` split.
# NOTE(review): this cell rebinds train_data, test_data, assembler and lr_model;
# lr_model now holds a fitted model rather than the estimator created earlier.
predictor_cols = ['Age', 'Tonnage', 'passengers', 'length', 'cabins']

train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

assembler = VectorAssembler(inputCols=predictor_cols, outputCol='features')
train_assembled, test_assembled = (
    assembler.transform(part) for part in (train_data, test_data)
)

lr = LinearRegression(labelCol='crew', featuresCol='features')
lr_model = lr.fit(train_assembled)

# Score the same fitted model on the data it was trained on and on held-out data.
train_summary = lr_model.evaluate(train_assembled)
test_summary = lr_model.evaluate(test_assembled)

for label, summary in (('Train', train_summary), ('Test', test_summary)):
    print(f"{label} RMSE: {summary.rootMeanSquaredError:.4f}, R2: {summary.r2:.4f}")

Train RMSE: 0.7210, R2: 0.9538
Test RMSE: 1.3682, R2: 0.8698


The model appears to overfit somewhat (train R² 0.9538 vs. test R² 0.8698, and the test RMSE is nearly double the train RMSE), but its performance is still acceptable because the R-squared score on the test data remains high.