In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 16.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=84753df53d04b0fc67717c4ac0315933131863d65c0d5667acaf4495c499c3f6
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) a Spark session running locally on 4 worker threads.
# NOTE(review): with a local master the executor memory setting may have no
# practical effect -- confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .master("local[4]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [4]:
# Load the advertising data set from CSV: the first row holds the column
# names, fields are comma separated, and column types are inferred.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load("/content/Advertising.csv")
)

In [5]:
# Print the first 20 rows of the raw data as a quick sanity check of the load.
df.show()

+---+-----+-----+---------+-----+
| ID|   TV|Radio|Newspaper|Sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
|  6|  8.7| 48.9|     75.0|  7.2|
|  7| 57.5| 32.8|     23.5| 11.8|
|  8|120.2| 19.6|     11.6| 13.2|
|  9|  8.6|  2.1|      1.0|  4.8|
| 10|199.8|  2.6|     21.2| 10.6|
| 11| 66.1|  5.8|     24.2|  8.6|
| 12|214.7| 24.0|      4.0| 17.4|
| 13| 23.8| 35.1|     65.9|  9.2|
| 14| 97.5|  7.6|      7.2|  9.7|
| 15|204.1| 32.9|     46.0| 19.0|
| 16|195.4| 47.7|     52.9| 22.4|
| 17| 67.8| 36.6|    114.0| 12.5|
| 18|281.4| 39.6|     55.8| 24.4|
| 19| 69.2| 20.5|     18.3| 11.3|
| 20|147.3| 23.9|     19.1| 14.6|
+---+-----+-----+---------+-----+
only showing top 20 rows



In [8]:
# Build the modelling frame:
#   * "Advertisement" is the total budget (TV + Radio + Newspaper),
#   * "Sales" is renamed to "label",
#   * the three per-channel columns are dropped once they have been summed.
df2 = (
    df.withColumn("Advertisement", df.TV + df.Radio + df.Newspaper)
    .withColumnRenamed("Sales", "label")
    .drop("TV", "Radio", "Newspaper")
)

# Preview five rows. Limiting on the Spark side means only those five rows are
# collected to the driver, instead of converting the whole frame to pandas and
# then discarding everything but the head.
df2.limit(5).toPandas()


Unnamed: 0,ID,label,Advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


In [9]:
# Summary statistics (count, mean, stddev, min, max) for the target and the
# single feature; the result is small, so converting it to pandas is cheap.
df2.describe("label","Advertisement").toPandas().head()

Unnamed: 0,summary,label,Advertisement
0,count,200.0,200.0
1,mean,14.022500000000004,200.86049999999992
2,stddev,5.217456565710477,92.9851805869837
3,min,1.6,11.7
4,max,27.0,433.6


In [10]:
from pyspark.ml.feature import VectorAssembler
# Assemble the single "Advertisement" column into the vector column
# "features" that the regression stage reads.
vector_assembler = VectorAssembler(
    inputCols=["Advertisement"],
    outputCol="features",
)

In [13]:
# 80/20 train/test split with a fixed seed.
train_df, test_df = df2.randomSplit(weights=[0.8, 0.2], seed=142)

In [11]:
from pyspark.ml.regression import LinearRegression
# Linear regression estimator that reads the "features" vector column and
# fits against the "label" column.
linear_reg_obj = LinearRegression(
    labelCol="label",
    featuresCol="features",
)

In [12]:
from pyspark.ml import Pipeline
# Chain feature assembly and regression into one pipeline so both stages are
# fitted and applied together.
pipeline_obj = Pipeline(stages=[vector_assembler, linear_reg_obj])

In [14]:
# Fit the whole pipeline (assembler + linear regression) on the training split.
pipeline_model = pipeline_obj.fit(train_df)

In [15]:
# Apply the fitted pipeline to the held-out split; this adds the "features"
# and "prediction" columns next to the original ones.
result_df = pipeline_model.transform(test_df)

In [16]:
# Show a few predictions next to their true labels. Limit in Spark first so
# only five rows are collected to the driver rather than the whole result.
result_df.limit(5).toPandas()

Unnamed: 0,ID,label,Advertisement,features,prediction
0,4,18.5,251.3,[251.3],16.55387
1,9,4.8,11.7,[11.7],4.929785
2,11,8.6,96.1,[96.1],9.024412
3,15,19.0,283.0,[283.0],18.091781
4,25,9.7,93.2,[93.19999999999999],8.88372


In [17]:
# List the fitted stages: the assembler first, then the fitted regression model.
pipeline_model.stages

[VectorAssembler_bb154e683ed1,
 LinearRegressionModel: uid=LinearRegression_6683b9895d57, numFeatures=1]

In [19]:
# Stage index 1 is the fitted LinearRegressionModel (index 0 is the assembler).
lr_model = pipeline_model.stages[1]

In [20]:
# Fitted slope(s): one coefficient per input feature (here only "Advertisement").
lr_model.coefficients

DenseVector([0.0485])

In [21]:
# Fitted intercept of the regression line.
lr_model.intercept

4.362164413237513

In [22]:
# R^2 from the model summary.
# NOTE(review): `summary` presumably describes the fit on the training split,
# not on test_df -- confirm, and evaluate result_df for a held-out score.
lr_model.summary.r2

0.754183281633372

In [23]:
# p-values from the model summary.
# NOTE(review): two values are returned for a single feature; presumably one
# of them belongs to the intercept -- confirm which position it occupies.
lr_model.summary.pValues

[0.0, 6.661338147750939e-16]

In [24]:
# Root mean squared error from the model summary.
# NOTE(review): presumably computed on the training split -- confirm before
# quoting it as a generalisation error.
lr_model.summary.rootMeanSquaredError

2.5407401281208677

In [25]:
# y = 4.362164413237513 + 0.0485 * Advertisement

In [27]:
# Let's predict how many sales an advertising budget of 100 thousand liras
# would produce: build a one-row frame holding that budget in the
# "Advertisement" column.
df_predict_rdd = spark.sparkContext.parallelize([100.0])
df_predict = spark.createDataFrame(
    df_predict_rdd.map(lambda budget: (budget,)),
    ["Advertisement"],
)

In [28]:
# Display the single-row input frame to confirm its column name and value.
df_predict.show()

+-------------+
|Advertisement|
+-------------+
|        100.0|
+-------------+



In [29]:
# Reuse the same assembler to add the "features" vector column that the
# regression model reads.
df_pred_vec = vector_assembler.transform(df_predict)

In [30]:
# Score the one-row frame with the fitted regression model and show the
# resulting "prediction" column.
lr_model.transform(df_pred_vec).toPandas().head()

Unnamed: 0,Advertisement,features,prediction
0,100.0,[100.0],9.213619
