## Total Recall

In [7]:
# Widen the notebook layout by injecting CSS: no top padding on #notebook,
# full-width .container, and no minimum height on .end_space.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML(
    '<style>'
        '#notebook { padding-top:0px !important; } '
        '.container { width:100% !important; } '
        '.end_space { min-height:0px !important; } '
    '</style>'
))

In [1]:
# Locate the local Spark installation and make `pyspark` importable for the
# cells below.
import findspark
findspark.init()

In [2]:
# Show which Spark installation directory findspark resolved.
findspark.find()

'D:\\Softwares\\Spark\\spark-2.4.7-bin-hadoop2.7'

In [4]:
from pyspark import SparkConf, SparkContext

# Local-mode Spark configuration for the recap section.
conf = SparkConf().setAppName("confRecall").setMaster('local')

# getOrCreate() reuses a SparkContext that is already running, so re-executing
# this cell does not fail the way a second SparkContext(conf=conf) call would.
sc = SparkContext.getOrCreate(conf=conf)


# Basics Recap

In [5]:
from pyspark.sql import SparkSession

# `builder` is a class-level attribute, so it is accessed on the class instead
# of on a throwaway SparkSession(sc) instance. getOrCreate() attaches the
# session to the SparkContext that is already running.
spark = SparkSession.builder.appName('appRecall').getOrCreate()

In [6]:
# Load appl_stock.csv: the first line supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    path="./Resources/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/appl_stock.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Print the first rows of the stock DataFrame as a text table.
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [9]:
# Shut down this section's SparkContext; the next section creates its own.
sc.stop()

# Linear Regression

In [10]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Local-mode Spark configuration for the linear-regression section.
conf = SparkConf().setAppName("lr_Conf").setMaster('local')

# getOrCreate() reuses a SparkContext that is already running, so re-executing
# this cell does not fail the way a second SparkContext(conf=conf) call would.
sc = SparkContext.getOrCreate(conf=conf)

# `builder` is a class-level attribute, so it is accessed on the class instead
# of on a throwaway SparkSession(sc) instance. getOrCreate() attaches the
# session to the SparkContext created above.
spark = SparkSession.builder.appName('lr_app').getOrCreate()

In [11]:
# Load Ecommerce_Customers.csv: the first line supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    path="./Resources/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

In [15]:
# Print each field of the first row on its own line.
# NOTE: this prints the first customer's raw email and postal address; clear
# the cell output before sharing the notebook.
first_row = df.head(1)[0]
for field_value in first_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [18]:
# List the column names; the numeric ones are used as model features below.
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [16]:
from pyspark.ml.regression import LinearRegression

In [17]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [19]:
# Numeric customer columns that are packed into one 'features' vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [20]:
# Append the assembled 'features' vector column to the original columns.
outputData = assembler.transform(df)

In [21]:
# Preview the data with the new 'features' column appended.
outputData.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [22]:
# Keep only what the regression needs: the feature vector and the label.
final_data = outputData.select('features', 'Yearly Amount Spent')

In [23]:
# Preview the (features, label) pairs that feed the model.
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [24]:
# 70/30 train/test split. The fixed seed keeps the split, and so the metrics
# computed from it, repeatable when the notebook is re-run on the same data.
train, test = final_data.randomSplit([0.7,0.3], seed=42)

In [25]:
# Summary statistics of the label in the training split.
train.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                336|
|   mean| 500.44649315022264|
| stddev|  80.48835997313314|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [26]:
# Summary statistics of the label in the test split, for comparison with the
# training split.
test.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                164|
|   mean| 496.99388677329614|
| stddev|  77.04499233584619|
|    min| 256.67058229005585|
|    max|  684.1634310159512|
+-------+-------------------+



In [29]:
# Linear regression estimator: reads inputs from 'features' (the estimator's
# default feature column, spelled out here) and targets 'Yearly Amount Spent'.
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent')

In [30]:
# Fit the model on the training split only.
lr_model = lr.fit(train)

In [31]:
# Evaluate on the held-out test split; the returned summary exposes the
# residuals, r2 and rootMeanSquaredError inspected in the next cells.
test_results = lr_model.evaluate(test)

In [32]:
# Per-row residuals (label minus prediction) on the test split.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -6.883839242259057|
|   9.47455917626337|
| -5.183055513968554|
|-14.353131483532195|
|-7.9859388178972495|
| 1.6942129182133385|
|-0.8273884711668416|
| -9.479566836267281|
| -5.835711458894593|
| -9.786942885964379|
|-14.996483299822785|
| -2.747512629889684|
| -4.976316792464161|
| -2.254373002565103|
|-2.8336824955967472|
|-5.3158910524334715|
|-17.615852166504567|
|  6.046612121593114|
|-3.2068551436468624|
|-1.8625266308996515|
+-------------------+
only showing top 20 rows



In [33]:
# R-squared of the fitted model on the test split.
test_results.r2

0.9844629980457957

In [35]:
# Root mean squared error on the test split, in the same units as the label.
test_results.rootMeanSquaredError

9.574142885654114

In [36]:
# Summary statistics of the label over the full dataset; its mean and stddev
# give a scale against which to read the RMSE.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [37]:
# Shut down this section's SparkContext; the next section creates its own.
sc.stop()

# Linear Regression Exercise

In [38]:
# Same findspark setup as at the top of the notebook (presumably repeated so
# this section can be run from a fresh kernel; confirm).
import findspark
findspark.init()

In [40]:
from pyspark import SparkConf, SparkContext

# Local-mode Spark configuration for the exercise section.
conf = SparkConf().setAppName('linearConf').setMaster('local')
# getOrCreate() reuses a SparkContext that is already running, so re-executing
# this cell does not fail the way a second SparkContext(conf=conf) call would.
sc = SparkContext.getOrCreate(conf=conf)

In [41]:
from pyspark.sql import SparkSession

# `builder` is a class-level attribute, so it is accessed on the class instead
# of on a throwaway SparkSession(sc) instance. getOrCreate() attaches the
# session to the SparkContext that is already running.
spark = SparkSession.builder.appName('linearApp').getOrCreate()

In [42]:
# Load cruise_ship_info.csv: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    path="./Resources/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [44]:
# Print the first rows of the cruise-ship DataFrame as a text table.
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [45]:
# Summary statistics for every column. The string columns (Ship_name,
# Cruise_line) are included too, but their mean/stddev entries are not
# meaningful.
df.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|      NaN|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

In [46]:
# Show the inferred column types to see which columns are numeric.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [47]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [48]:
# List the column names to choose feature columns from.
df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [49]:
# Numeric ship attributes packed into one 'features' vector column. The string
# columns and the 'crew' target are left out.
feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [50]:
# Append the assembled 'features' vector column to the original columns.
opData = assembler.transform(df)

In [51]:
# Preview the data with the new 'features' column appended.
opData.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|[17.0,101.353,26....|
|    Ecstasy|   Carnival| 22|            70.367|     20.

In [52]:
# Keep only what the regression needs: the feature vector and the 'crew' label.
final_data = opData.select('features', 'crew')

In [53]:
# Preview the (features, crew) pairs that feed the model.
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [54]:
# 70/30 train/test split. The fixed seed keeps the split, and so the metrics
# computed from it, repeatable when the notebook is re-run on the same data.
train, test = final_data.randomSplit([0.7,0.3], seed=42)

In [55]:
# Linear regression estimator: reads inputs from 'features' (the estimator's
# default feature column, spelled out here) and targets 'crew'.
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [56]:
# Fit the model on the training split only.
lr_model = lr.fit(train)

In [57]:
# Evaluate on the held-out test split; the returned summary exposes the
# residuals, r2 and rootMeanSquaredError inspected in the next cells.
test_results = lr_model.evaluate(test)

In [59]:
# Per-row residuals (label minus prediction) on the test split.
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|  0.6992113243063862|
|  0.2824993549773893|
| -0.5642994507618813|
| -0.5642994507618813|
| -0.7992768987365739|
|  0.5183481892613635|
| 0.43078058534461583|
| -0.2906715074208215|
| -0.4836103778167242|
|  0.5029647669954986|
|   -0.85349845344256|
|   0.986421497339963|
| 0.34274728999788273|
| -0.5859306677255152|
| -1.2164580629945423|
| -0.9936794523228212|
| -0.4417868197242125|
|  0.8731756981902503|
|-0.30218796433060835|
|-0.18508915732175923|
+--------------------+
only showing top 20 rows



In [60]:
# R-squared of the fitted model on the test split.
test_results.r2

0.9441442152689816

In [61]:
# Root mean squared error on the test split, in the same units as 'crew'.
test_results.rootMeanSquaredError

0.7821659625991745

In [62]:
# Summary statistics of 'crew' over the full dataset; its mean and stddev give
# a scale against which to read the RMSE.
final_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



### Predicting unlabelled data

In [63]:
# Keep only the feature vectors from the test split, dropping the 'crew' label,
# to stand in for unlabeled data.
unlabeled_data = test.select('features')

In [65]:
# transform() appends a 'prediction' column holding the model's estimate for
# each feature vector.
predictions = lr_model.transform(unlabeled_data)

In [66]:
# Preview the feature vectors alongside their predicted crew values.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,115.0,35.74,...|11.500788675693613|
|[5.0,133.5,39.59,...|12.847500645022611|
|[6.0,30.276999999...| 4.114299450761881|
|[6.0,30.276999999...| 4.114299450761881|
|[6.0,90.0,20.0,9....| 9.799276898736574|
|[6.0,93.0,23.94,9...|10.571651810738636|
|[6.0,113.0,37.82,...|11.569219414655384|
|[7.0,158.0,43.7,1...|13.890671507420821|
|[8.0,110.0,29.74,...|12.083610377816724|
|[9.0,88.5,21.24,9...| 9.797035233004502|
|[9.0,90.09,25.01,...|  9.54349845344256|
|[9.0,113.0,26.74,...|11.393578502660038|
|[10.0,77.0,20.16,...| 8.657252710002117|
|[10.0,81.76899999...| 9.005930667725515|
|[10.0,138.0,31.14...|13.066458062994542|
|[11.0,85.0,18.48,...| 8.993679452322821|
|[11.0,91.62700000...| 9.441786819724213|
|[11.0,108.977,26....| 11.12682430180975|
|[12.0,2.329,0.94,...|0.9021879643306083|
|[12.0,25.0,3.88,5...|3.0550891573217593|
+--------------------+------------