In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session named 'sol', or pick up the one that is already running.
spark = (
    SparkSession.builder
    .appName('sol')
    .getOrCreate()
)

In [3]:
%fs ls /FileStore/tables

path,name,size
dbfs:/FileStore/tables/ContainsNull.csv,ContainsNull.csv,61
dbfs:/FileStore/tables/Ecommerce_Customers.csv,Ecommerce_Customers.csv,86871
dbfs:/FileStore/tables/Spark_Essentials-5d27c.dbc,Spark_Essentials-5d27c.dbc,1414841
dbfs:/FileStore/tables/appl_stock.csv,appl_stock.csv,143130
dbfs:/FileStore/tables/cogsley_clients.csv,cogsley_clients.csv,384219
dbfs:/FileStore/tables/cogsley_sales.csv,cogsley_sales.csv,2176442
dbfs:/FileStore/tables/cruise_ship_info.csv,cruise_ship_info.csv,8734
dbfs:/FileStore/tables/people.json,people.json,73
dbfs:/FileStore/tables/sales_info.csv,sales_info.csv,196
dbfs:/FileStore/tables/sample_linear_regression_data.txt,sample_linear_regression_data.txt,119069


In [4]:
# Cruise-ship dataset as listed under /FileStore/tables.
path = '/FileStore/tables/cruise_ship_info.csv'

# First row supplies the column names; column types are inferred by Spark.
df = spark.read.csv(path, header=True, inferSchema=True)

In [5]:
# Peek at the first five rows of the raw data.
df.show(n=5)

In [6]:
# Number of rows per cruise line.
(
    df
    .groupBy('Cruise_line')
    .count()
    .show()
)

In [7]:
from pyspark.ml.feature import StringIndexer

In [8]:
# Encode the categorical Cruise_line column as a numeric index column.
# (Dummy variables could also be produced by one-hot encoding this index afterwards.)
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Cruise_line_index',
)

# Fit the indexer on df, then append the index column to the same frame.
indexed = (
    indexer
    .fit(df)
    .transform(df)
)

In [9]:
# Display the indexed frame (20 rows, which is show()'s default).
indexed.show(n=20)

In [10]:
# Inspect the Python type of the object returned by the indexer's transform().
type(indexed)

In [11]:
# Number of rows per ship name.
(
    df
    .groupBy('Ship_name')
    .count()
    .show()
)

In [12]:
# Keep the numeric columns plus the indexed cruise line; the string columns
# Ship_name and Cruise_line are left behind.
df2 = indexed.select(
    'Age', 'Tonnage', 'passengers', 'length',
    'cabins', 'passenger_density', 'crew',
    'Cruise_line_index',
)


In [13]:
# First five rows of the numeric-only frame.
df2.show(n=5)

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [15]:
# Pack the predictor columns into a single vector column named 'features'.
# 'crew' is not listed here: it is used as the regression label further down.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_line_index',
    ],
    outputCol='features',
)

In [16]:
# Append the assembled 'features' column to df2.
output = assembler.transform(dataset=df2)

In [17]:
# First five rows, now including the 'features' vector.
output.show(n=5)

In [18]:
# Modelling frame: the feature vector and the 'crew' label only.
final_data = output.select('features', 'crew')

In [19]:
# First five (features, crew) rows.
final_data.show(n=5)

In [20]:
# Randomly split the rows into roughly 70% training / 30% test data.
# The seed is fixed so that re-running the notebook yields the same split,
# and therefore the same fitted model and the same reported metrics.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
# First five rows of the training split.
train_data.show(n=5)

In [22]:
from pyspark.ml.regression import LinearRegression

In [23]:
# Linear regression estimator: predicts 'crew' from the 'features' vector and
# writes its estimates to a 'prediction' column.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
    predictionCol='prediction',
)

In [24]:
# Fit the estimator on the training split.
lr_model = lr.fit(dataset=train_data)

In [25]:
# Summary object attached to the fitted model; pValues, r2 and
# rootMeanSquaredError are read from it in the next cell.
model_summary = lr_model.summary

In [26]:
# Diagnostics taken from the fitted model's summary object.
print('p_values: {}'.format(model_summary.pValues))
print('\n')
print('r2 {}'.format(model_summary.r2))
print('\n')
# Keep a space between the label and the value so the line reads
# "rmse <value>" rather than fusing the label with the number.
print('rmse {}'.format(model_summary.rootMeanSquaredError))

In [27]:
# Summary statistics of the raw data, useful for putting the RMSE in context.
(
    df
    .describe()
    .show()
)

In [28]:
# Score the fitted model against the held-out test split.
test_result = lr_model.evaluate(dataset=test_data)

In [29]:
# Test-set r2 followed by test-set RMSE, one value per line.
print(
    test_result.r2,
    test_result.rootMeanSquaredError,
    sep='\n',
)

In [30]:
# Test rows with only the feature vector kept (the 'crew' label is dropped).
unlabeled_data = test_data.select(['features'])

In [31]:
# Run the fitted model over the unlabeled rows to obtain predictions.
pred = lr_model.transform(dataset=unlabeled_data)

In [32]:
# First five predictions.
pred.show(n=5)

In [33]:
from pyspark.sql.functions import corr

In [34]:
# Correlation between crew size and passenger count in the raw data.
crew_passenger_corr = corr('crew', 'passengers')
df.select(crew_passenger_corr).show()