# Linear Regression with PySpark

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session that the rest of the notebook works through.
spark = (
    SparkSession.builder
    .appName("crew_prediction")
    .getOrCreate()
)

In [None]:
# Load the cruise CSV: the first line is treated as the header and column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../input/cruise-used-for-pyspark/cruise_dataset.csv")
)

In [None]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

In [None]:
# Display a preview of the loaded rows.
df.show()

In [None]:
# Compute summary statistics for the loaded data and display them.
summary_stats = df.describe()
summary_stats.show()

In [None]:
# Count how many rows belong to each distinct Cruise_line value.
rows_per_line = df.groupBy("Cruise_line").count()
rows_per_line.show()

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the string column Cruise_line as a numeric index column, cruise_cat,
# so that it can be included in the feature vector.
indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="cruise_cat",
)
fitted_indexer = indexer.fit(df)
indexed = fitted_indexer.transform(df)

In [None]:
# Display the data with the cruise_cat index column added by the indexer.
indexed.show()

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the available column names before choosing the model's input features.
indexed.columns

In [None]:
# Input columns for the model, in the order they are packed into the
# "features" vector.
feature_columns = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "cruise_cat",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Add the assembled "features" vector column to the indexed data.
output = assembler.transform(indexed)

In [None]:
# Preview the two columns the model will use: the feature vector and the
# crew label.
model_input_preview = output.select("features", "crew")
model_input_preview.show()

In [None]:
# Keep only the feature vector and the crew label for modelling.
final_data = output.select("features","crew")

In [None]:
# Split the rows roughly 70/30 into training and test sets, with a fixed seed
# so the split can be repeated.
splits = final_data.randomSplit(weights=[0.7, 0.3], seed=42)
train_data, test_data = splits

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Linear regression estimator that predicts the crew column from the
# assembled feature vector.
lr = LinearRegression(
    featuresCol="features",  # the library default, spelled out for clarity
    labelCol="crew",
)

In [None]:
# Fit the linear regression on the training split.
model = lr.fit(train_data)

In [None]:
import pandas as pd

In [None]:
# Fitted coefficients, one per entry of the assembled feature vector.
coeff = model.coefficients
# Take the feature names from the assembler that built the vector instead of
# keeping a second hand-written copy, so the labels always line up with the
# coefficients even if the assembler's input list is edited.
col_names = assembler.getInputCols()

In [None]:
# Tabulate each feature name next to its fitted coefficient.
pd.DataFrame({"Variable Names":col_names, "Coefficients":coeff})

In [None]:
# Evaluate the fitted model on the held-out test split.
res = model.evaluate(test_data)

In [None]:
# Build the test-set metrics report first, then print it in one go.
report = f"""
Linear Regression Results Report
==================================
RMSE:\t {res.rootMeanSquaredError}
MSE:\t {res.meanSquaredError}
R2:\t {res.r2}
Adj R2:\t{res.r2adj}
==================================
"""
print(report)

In [None]:
from pyspark.sql.functions import corr

In [None]:
# Correlation between the crew and passengers columns.
crew_vs_passengers = corr("crew", "passengers")
df.select(crew_vs_passengers).show()

In [None]:
# Correlation between the crew and cabins columns.
crew_vs_cabins = corr("crew", "cabins")
df.select(crew_vs_cabins).show()

In [None]:
# Correlation between the crew and Age columns.
crew_vs_age = corr("crew", "Age")
df.select(crew_vs_age).show()