# Regressão Linear para um Comércio Eletrônico
 
Basicamente, vamos analisar um conjunto de dados de clientes de um comércio eletrônico. Em seguida, vamos construir um modelo que possa prever os gastos anuais de cada cliente com os produtos da empresa.


### Instalação dos componentes relacionados ao Spark

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
import findspark

# Point the process at the JDK and at the Spark distribution unpacked above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# Hand findspark the same absolute path stored in SPARK_HOME. The previous
# relative argument overwrote SPARK_HOME with a path that only resolves when
# the working directory is /content.
findspark.init(os.environ["SPARK_HOME"])

## Importação das Bibliotecas

In [3]:
import pyspark
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession

from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

## Criação da SparkSession e leitura do arquivo

In [4]:
# Reuse the active session if one exists; otherwise start one named 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [5]:
# Location of the customers CSV.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/Linear_Regression/base/Ecommerce_Customers.csv'

# The first line of the file holds the column names; column types are inferred.
data = spark.read.csv(PATH, header=True, inferSchema=True)
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
# Show the first 5 records of the DataFrame.

data.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [7]:
# Fetch the first record as a single Row object.
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [8]:
# Print every field of the first record, one value per line.

print(*data.head(), sep='\n')

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [9]:
# List the column names.

data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

## Configurando o Dataframe para o Machine Learning

### Formatando para duas colunas "rótulo", "recursos"

In [10]:
# Numeric predictor columns that get packed into the single 'features' vector.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [11]:
# Apply the assembler to the loaded data; the result keeps the original
# columns and gains the 'features' vector column.
output = assembler.transform(data)

In [12]:
# Preview only the assembled feature vectors (first 10 rows).
output.select('features').show(10)

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
+--------------------+
only showing top 10 rows



In [13]:
# Preview the full transformed DataFrame (first 10 rows).
output.show(10)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

### Separando a base para treino e teste

In [14]:
# Keep only the feature vector and the target column used for regression.
final_data = output.select('features', 'Yearly Amount Spent')

In [15]:
# 70/30 train/test split. The fixed seed keeps the split, and every metric
# computed from it, reproducible across kernel restarts.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Summary statistics (count, mean, stddev, min, max) of the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                360|
|   mean|  497.9170062533849|
| stddev|  78.78680052844595|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [18]:
# Summary statistics (count, mean, stddev, min, max) of the test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                140|
|   mean|  502.9064062719779|
| stddev|  80.83088783220472|
|    min|   266.086340948469|
|    max|  712.3963268096637|
+-------+-------------------+



## Criando o Modelo de Regressão Linear

In [19]:
# Linear regression estimator: 'features' (the library default, spelled out
# here) is the input vector and 'Yearly Amount Spent' is the target.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

# Fit on the training split only.
lr_model = lr.fit(train_data)

In [20]:
# Print the fitted coefficients and the intercept.

print(
    "Coefficients: {} \nIntercept: {}".format(
        lr_model.coefficients,
        lr_model.intercept,
    )
)

Coefficients: [25.706500920765187,38.73525338001183,1.1565402914414142,61.60840128375682] 
 Intercept: -1078.2291414472083


In [21]:
# Evaluate the fitted model on the held-out test split.
test_results = lr_model.evaluate(test_data)

In [22]:
# Preview the first 10 residuals from the test-set evaluation.
test_results.residuals.show(10)

+-------------------+
|          residuals|
+-------------------+
| 10.533760084842925|
|-10.717164151685779|
| -3.937679463434563|
| 0.4951060695759111|
|  -16.4530077923622|
|  3.750530711606757|
| -5.134247277314728|
| 23.237179271225614|
|  4.710163117579782|
|-1.6498333068126385|
+-------------------+
only showing top 10 rows



### Exibindo os resultados

In [23]:
# Keep only the feature vectors, dropping the label column.
unlabeled_data = test_data.select('features')

# Score the unlabeled rows; the result carries a 'prediction' column.
predictions = lr_model.transform(unlabeled_data)

In [24]:
# Preview the first 10 predictions alongside their feature vectors.
predictions.show(10)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|398.10659098778456|
|[30.3931845423455...| 330.6460339548794|
|[30.4925366965402...| 286.4089251833491|
|[30.5743636841713...|441.56930768848974|
|[30.8162006488763...| 282.5393487408312|
|[31.0472221394875...|388.74686847741464|
|[31.0613251567161...|492.68970533521633|
|[31.2834474760581...| 568.5439101544418|
|[31.3584771924370...| 490.4657873318956|
|[31.5761319713222...|  542.876417296141|
+--------------------+------------------+
only showing top 10 rows



In [25]:
# Print the test-set error metrics: RMSE, MSE and MAE.

for metric_name, metric_value in (
    ('RMSE', test_results.rootMeanSquaredError),
    ('MSE', test_results.meanSquaredError),
    ('MAE', test_results.meanAbsoluteError),
):
    print('{}: {}'.format(metric_name, metric_value))

RMSE: 10.18451753748092
MSE: 103.72439747125642
MAE: 7.783552580159247
