# Linear Regression using Pyspark

In [4]:
pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/27/67/5158f846202d7f012d1c9ca21c3549a58fd3c6707ae8ee823adcaca6473c/pyspark-3.0.2.tar.gz (204.8MB)
[K     |████████████████████████████████| 204.8MB 68kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 40.9MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186687 sha256=da809bb941812c7792e4525b63ea95a2371d3a85c89a0c3cccb8c9e661f84713
  Stored in directory: /root/.cache/pip/wheels/8b/09/da/c1f2859bcc86375dc972c5b6af4881b3603269bcc4c9be5d16
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2


In [5]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('lin_reg').getOrCreate()

In [6]:
from pyspark.ml.regression import LinearRegression

In [7]:
df = spark.read.csv('/content/drive/MyDrive/datacamp/Linear_regression_dataset.csv', inferSchema=True, header = True)

In [8]:
print(df.count(), len(df.columns))

1232 6


In [10]:
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [11]:
df.describe().show(5,False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
|max    |1009             |1103             |116               |0.373               |0.294               |0.491

In [12]:
df.head(3)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417)]

In [13]:
df.show(5)

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|  734|  688|   81|0.328|0.259| 0.418|
|  700|  600|   94| 0.32|0.247| 0.389|
|  712|  705|   93|0.311|0.247| 0.417|
|  734|  806|   69|0.315| 0.26| 0.415|
|  613|  759|   61|0.302| 0.24| 0.378|
+-----+-----+-----+-----+-----+------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import corr

In [15]:
df.select(corr('var_1', 'output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



In [16]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [17]:
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [18]:
vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],outputCol='features')

In [20]:
features_df = vec_assembler.transform(df)

In [21]:
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [22]:
features_df.select('features').show(5,False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [23]:
model_df=features_df.select('features','output')

In [24]:
model_df.show(5,False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [25]:
print((model_df.count(), len(model_df.columns)))

(1232, 2)


In [26]:
train_df,test_df=model_df.randomSplit([0.7,0.3])

In [27]:
print((train_df.count(), len(train_df.columns)))
print((test_df.count(), len(test_df.columns)))

(859, 2)
(373, 2)


In [28]:
train_df.describe().show()

+-------+--------------------+
|summary|              output|
+-------+--------------------+
|  count|                 859|
|   mean| 0.39752153667054696|
| stddev|0.032958976259143784|
|    min|               0.301|
|    max|               0.485|
+-------+--------------------+



In [29]:
lin_Reg=LinearRegression(labelCol='output')
lr_model=lin_Reg.fit(train_df)
lr_model.intercept

0.19717606882401417

In [30]:
print(lr_model.coefficients)

[0.00033817232155032535,5.563974309090355e-05,0.00021882662412950267,-0.6530682119276883,0.44040435777381276]


In [31]:
training_predictions=lr_model.evaluate(train_df)

In [32]:
training_predictions.meanSquaredError

0.00014659879898667425

In [33]:
training_predictions.r2

0.8648895746657159

In [34]:
test_results=lr_model.evaluate(test_df)

In [35]:
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
|-0.00277017830188...|
|-0.00678325436630...|
|-0.00823471818075...|
|-0.01279260783857...|
|-0.01391739612072...|
|-0.01258838580054...|
|-0.00701742197706573|
|-0.01169436123971...|
|-0.00926506591514...|
|-0.00239189853437...|
+--------------------+
only showing top 10 rows



In [36]:
test_results.r2

0.8774023906395844

In [39]:
test_results.rootMeanSquaredError

0.01189108014277637

In [38]:
test_results.meanSquaredError

0.00014139778696193052

# Logistic_Regression_Pyspark

In [40]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('log_reg').getOrCreate()

In [41]:
df=spark.read.csv('/content/drive/MyDrive/datacamp/Log_Reg_dataset.csv',inferSchema=True,header=True)

In [42]:
from pyspark.sql.functions import *

In [43]:
print((df.count(),len(df.columns)))

(20000, 6)


In [44]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [45]:
df.columns

['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']

In [46]:
df.show(5)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 5 rows



In [47]:
df.describe().show()

+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



In [48]:
df.groupBy('Country').count().show()

+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+



In [50]:
df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1|10000|
|     0|10000|
+------+-----+



In [51]:
df.groupBy('Country').mean().show()

+---------+------------------+-------------------+---------------------+--------------------+
|  Country|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|         avg(Status)|
+---------+------------------+-------------------+---------------------+--------------------+
| Malaysia|27.792282430213465| 0.5730706075533661|   11.192118226600986|  0.6568144499178982|
|    India|27.976854156296664| 0.5433051269288203|   10.727227476356397|  0.6212045793927327|
|Indonesia| 28.43159796354081| 0.5207751683363442|    9.985711939563148|  0.5422893742814913|
|   Brazil|30.274168600154677|  0.322892498066512|    4.921113689095128|0.038669760247486466|
+---------+------------------+-------------------+---------------------+--------------------+



In [52]:
df.groupBy('Platform').mean().show()

+--------+------------------+-------------------+---------------------+------------------+
|Platform|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|       avg(Status)|
+--------+------------------+-------------------+---------------------+------------------+
|   Yahoo|28.569226087838523| 0.5094837204584644|    9.599655137437875|0.5071508266558474|
|    Bing| 28.68394495412844| 0.4720183486238532|    9.114908256880733|0.4559633027522936|
|  Google|28.380038055699707| 0.5149628092025601|    9.804878048780488|0.5210171250648676|
+--------+------------------+-------------------+---------------------+------------------+



In [53]:
df.groupBy('Status').mean().show()

+------+--------+-------------------+---------------------+-----------+
|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|
+------+--------+-------------------+---------------------+-----------+
|     1| 26.5435|             0.7019|              14.5617|        1.0|
|     0| 30.5356|             0.3039|               4.5449|        0.0|
+------+--------+-------------------+---------------------+-----------+



StringIndexer는 레이블의 문자열 컬럼을 레이블 인덱스의 컬럼으로 인코딩한다. 


In [54]:
# 범주화 데이터 수치형으로 변경
from pyspark.ml.feature import StringIndexer
search_engine_indexer = StringIndexer(inputCol="Platform", outputCol="Search_Engine_Num").fit(df)
df = search_engine_indexer.transform(df)

In [55]:
df.show(3,False)

+-------+---+--------------+--------+----------------+------+-----------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Search_Engine_Num|
+-------+---+--------------+--------+----------------+------+-----------------+
|India  |41 |1             |Yahoo   |21              |1     |0.0              |
|Brazil |28 |1             |Yahoo   |5               |0     |0.0              |
|Brazil |40 |0             |Google  |3               |0     |1.0              |
+-------+---+--------------+--------+----------------+------+-----------------+
only showing top 3 rows



In [56]:
from pyspark.ml.feature import OneHotEncoder

In [65]:
search_engine_encoder = OneHotEncoder(inputCol="Search_Engine_Num", outputCol="Search_Engine_Vector")
search_engine_encoder.setDropLast(False)
search_engine_ohe =search_engine_encoder.fit(df)
df = search_engine_ohe.transform(df)

In [66]:
df.show(5)

+---------+---+--------------+--------+----------------+------+-----------------+-----------+--------------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Search_Engine_Num|Country_Num|Search_Engine_Vector|
+---------+---+--------------+--------+----------------+------+-----------------+-----------+--------------------+
|    India| 41|             1|   Yahoo|              21|     1|              0.0|        1.0|       (3,[0],[1.0])|
|   Brazil| 28|             1|   Yahoo|               5|     0|              0.0|        2.0|       (3,[0],[1.0])|
|   Brazil| 40|             0|  Google|               3|     0|              1.0|        2.0|       (3,[1],[1.0])|
|Indonesia| 31|             1|    Bing|              15|     1|              2.0|        0.0|       (3,[2],[1.0])|
| Malaysia| 32|             0|  Google|              15|     1|              1.0|        3.0|       (3,[1],[1.0])|
+---------+---+--------------+--------+----------------+------+-----------------

In [58]:
df.groupBy('Search_Engine_Num').count().orderBy('count',ascending=False).show(5,False)

+-----------------+-----+
|Search_Engine_Num|count|
+-----------------+-----+
|0.0              |9859 |
|1.0              |5781 |
|2.0              |4360 |
+-----------------+-----+



In [60]:
country_indexer = StringIndexer(inputCol="Country", outputCol="Country_Num").fit(df)
df = country_indexer.transform(df)

In [61]:
df.select(['Country','Country_Num']).show(3,False)

+-------+-----------+
|Country|Country_Num|
+-------+-----------+
|India  |1.0        |
|Brazil |2.0        |
|Brazil |2.0        |
+-------+-----------+
only showing top 3 rows



In [67]:
country_encoder = OneHotEncoder(inputCol="Country_Num", outputCol="Country_Vector")
country_encoder.setDropLast(False)
country_encoder_ohe =country_encoder.fit(df)
df = country_encoder_ohe.transform(df)

In [68]:
df.select(['Country','country_Num','Country_Vector']).show(3,False)

+-------+-----------+--------------+
|Country|country_Num|Country_Vector|
+-------+-----------+--------------+
|India  |1.0        |(4,[1],[1.0]) |
|Brazil |2.0        |(4,[2],[1.0]) |
|Brazil |2.0        |(4,[2],[1.0]) |
+-------+-----------+--------------+
only showing top 3 rows



In [69]:
df.groupBy('Country').count().orderBy('count',ascending=False).show(5,False)

+---------+-----+
|Country  |count|
+---------+-----+
|Indonesia|12178|
|India    |4018 |
|Brazil   |2586 |
|Malaysia |1218 |
+---------+-----+



In [70]:
df.groupBy('Country_Num').count().orderBy('count',ascending=False).show(5,False)

+-----------+-----+
|Country_Num|count|
+-----------+-----+
|0.0        |12178|
|1.0        |4018 |
|2.0        |2586 |
|3.0        |1218 |
+-----------+-----+



In [71]:
df.groupBy('Country_Vector').count().orderBy('count',ascending=False).show(5,False)

+--------------+-----+
|Country_Vector|count|
+--------------+-----+
|(4,[0],[1.0]) |12178|
|(4,[1],[1.0]) |4018 |
|(4,[2],[1.0]) |2586 |
|(4,[3],[1.0]) |1218 |
+--------------+-----+



In [73]:
df.show(5)

+---------+---+--------------+--------+----------------+------+-----------------+-----------+--------------------+--------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Search_Engine_Num|Country_Num|Search_Engine_Vector|Country_Vector|
+---------+---+--------------+--------+----------------+------+-----------------+-----------+--------------------+--------------+
|    India| 41|             1|   Yahoo|              21|     1|              0.0|        1.0|       (3,[0],[1.0])| (4,[1],[1.0])|
|   Brazil| 28|             1|   Yahoo|               5|     0|              0.0|        2.0|       (3,[0],[1.0])| (4,[2],[1.0])|
|   Brazil| 40|             0|  Google|               3|     0|              1.0|        2.0|       (3,[1],[1.0])| (4,[2],[1.0])|
|Indonesia| 31|             1|    Bing|              15|     1|              2.0|        0.0|       (3,[2],[1.0])| (4,[0],[1.0])|
| Malaysia| 32|             0|  Google|              15|     1|              1.0|        3

In [72]:
from pyspark.ml.feature import VectorAssembler

In [76]:
df_assembler = VectorAssembler(inputCols=
                               ['Country_Vector','Age', 
                                'Repeat_Visitor','Web_pages_viewed'], outputCol="features")
df = df_assembler.transform(df)

In [77]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- Search_Engine_Num: double (nullable = false)
 |-- Country_Num: double (nullable = false)
 |-- Search_Engine_Vector: vector (nullable = true)
 |-- Country_Vector: vector (nullable = true)
 |-- features: vector (nullable = true)



In [78]:
df.select(['features','Status']).show(10,False)

+-------------------------------+------+
|features                       |Status|
+-------------------------------+------+
|[0.0,1.0,0.0,0.0,41.0,1.0,21.0]|1     |
|[0.0,0.0,1.0,0.0,28.0,1.0,5.0] |0     |
|(7,[2,4,6],[1.0,40.0,3.0])     |0     |
|[1.0,0.0,0.0,0.0,31.0,1.0,15.0]|1     |
|(7,[3,4,6],[1.0,32.0,15.0])    |1     |
|(7,[2,4,6],[1.0,32.0,3.0])     |0     |
|(7,[2,4,6],[1.0,32.0,6.0])     |0     |
|(7,[0,4,6],[1.0,27.0,9.0])     |0     |
|(7,[0,4,6],[1.0,32.0,2.0])     |0     |
|[1.0,0.0,0.0,0.0,31.0,1.0,16.0]|1     |
+-------------------------------+------+
only showing top 10 rows



In [79]:
model_df=df.select(['features','Status'])

In [80]:
from pyspark.ml.classification import LogisticRegression

In [81]:
training_df,test_df=model_df.randomSplit([0.75,0.25])

In [82]:
training_df.count()

14891

In [83]:
training_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 7457|
|     0| 7434|
+------+-----+



In [84]:
test_df.count()

5109

In [85]:
test_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 2543|
|     0| 2566|
+------+-----+



In [86]:
log_reg=LogisticRegression(labelCol='Status').fit(training_df)

In [87]:
train_results=log_reg.evaluate(training_df).predictions

In [88]:
train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).select(['Status','prediction','probability']).show(10,False)

+------+----------+---------------------------------------+
|Status|prediction|probability                            |
+------+----------+---------------------------------------+
|1     |1.0       |[0.3089885897718463,0.6910114102281537]|
|1     |1.0       |[0.3089885897718463,0.6910114102281537]|
|1     |1.0       |[0.3089885897718463,0.6910114102281537]|
|1     |1.0       |[0.3089885897718463,0.6910114102281537]|
|1     |1.0       |[0.3089885897718463,0.6910114102281537]|
|1     |1.0       |[0.3089885897718463,0.6910114102281537]|
|1     |1.0       |[0.17554760322230198,0.824452396777698]|
|1     |1.0       |[0.17554760322230198,0.824452396777698]|
|1     |1.0       |[0.17554760322230198,0.824452396777698]|
|1     |1.0       |[0.17554760322230198,0.824452396777698]|
+------+----------+---------------------------------------+
only showing top 10 rows



In [89]:
correct_preds=train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).count()

In [90]:
training_df.filter(training_df['Status']==1).count()

7457

In [91]:
float(correct_preds)/(training_df.filter(training_df['Status']==1).count())

0.9371060748290198

In [92]:
results=log_reg.evaluate(test_df).predictions

In [93]:
results.select(['Status','prediction']).show(10,False)

+------+----------+
|Status|prediction|
+------+----------+
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
+------+----------+
only showing top 10 rows



In [94]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [95]:
true_postives = results[(results.Status == 1) & (results.prediction == 1)].count()
true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()
false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()
false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()

In [96]:
print (true_postives)
print (true_negatives)
print (false_positives)
print (false_negatives)
print(true_postives+true_negatives+false_positives+false_negatives)
print (results.count())

2397
2403
163
146
5109
5109


In [97]:
recall = float(true_postives)/(true_postives + false_negatives)
print(recall)

0.9425874950845458


In [98]:
recall = float(true_postives)/(true_postives + false_negatives)
print(recall)

0.9425874950845458


In [99]:
accuracy=float((true_postives+true_negatives) /(results.count()))
print(accuracy)

0.9395184967704052
