In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse, via getOrCreate) the SparkSession used by every cell below.
# The chain is wrapped in parentheses so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

In [4]:
# Keep a handle to the SparkContext that belongs to the session `spark`.
sc = spark.sparkContext

In [27]:
# Load the prepared CSV into a DataFrame. The first row is used as the header
# and column types are inferred from the file contents.
data = spark.read.csv(
    'Indaiatuba_BigData_Prep.csv',
    header=True,
    inferSchema=True,
)

In [28]:
# Print the inferred column names and types to check that the CSV was parsed as expected.
data.printSchema()

root
 |-- CURSO: string (nullable = true)
 |-- TURNO: string (nullable = true)
 |-- NOME: string (nullable = true)
 |-- STATUS_ALUNO: integer (nullable = true)
 |-- ESCOLA_PUBLICA: integer (nullable = true)
 |-- RAÇA: double (nullable = true)
 |-- NOTA_VESTIBULAR: double (nullable = true)
 |-- MediaNotas: double (nullable = true)
 |-- Frequencia: double (nullable = true)



In [34]:
#data.show()

In [33]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [42]:
# Build an assembler that combines the listed input columns into a single
# vector column (named by outputCol).
# NOTE: the output column name 'Idependent_Features' (sic) is referenced by
# later cells, so its spelling must not be changed here alone.
featureAssembler = VectorAssembler(
    inputCols=[
        'ESCOLA_PUBLICA',
        'RAÇA',
        'NOTA_VESTIBULAR',
        'MediaNotas',
        'Frequencia',
    ],
    outputCol='Idependent_Features',
)

In [43]:
# Run `data` through the assembler; the result is `data` plus the assembled
# feature column configured on `featureAssembler`.
output=featureAssembler.transform(data)

In [44]:
# Keep only the columns the model needs: the assembled features (inputs)
# and STATUS_ALUNO (the output/label).
finalized_data = output.select('Idependent_Features','STATUS_ALUNO')

In [57]:
from pyspark.ml.regression import LinearRegression

# Create the (still unfitted) linear-regression estimator, telling it which
# column holds the feature vector and which column holds the label.
# NOTE(review): if STATUS_ALUNO is a categorical/binary flag, a classifier may
# be a better fit than linear regression -- confirm with the data owner.
regressor = LinearRegression(
    labelCol='STATUS_ALUNO',
    featuresCol='Idependent_Features',
)

In [59]:
# Split the rows roughly 75% / 25% into training and test sets.
# A fixed seed makes the split (and therefore the fitted coefficients and the
# reported error metrics) reproducible across "Restart & Run All".
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([.75, .25], seed=SPLIT_SEED)

In [60]:
# Fit the estimator on the training split (`train_data`).
# The name `regressor` is rebound to the fitted model because later cells read
# the coefficients and run the evaluation through it. The unfitted estimator is
# first stashed in `regressor_estimator`, so re-running this cell refits from
# the estimator instead of failing on a fitted model that has no .fit().
if isinstance(regressor, LinearRegression):
    regressor_estimator = regressor
regressor = regressor_estimator.fit(train_data)

In [64]:
# Show the fitted coefficients (one per entry of the assembled feature vector).
regressor.coefficients

DenseVector([-0.0029, 0.0118, 0.0012, 0.1169, -0.0008])

In [65]:
# Show the fitted intercept term.
regressor.intercept

-0.136832701854131

In [67]:
# Evaluate the fitted model on the held-out split; the returned summary object
# carries the per-row predictions and the error metrics read in later cells.
pred_results=regressor.evaluate(test_data)

In [69]:
# Preview the per-row results: feature vector, true STATUS_ALUNO and model prediction.
pred_results.predictions.show()

+--------------------+------------+--------------------+
| Idependent_Features|STATUS_ALUNO|          prediction|
+--------------------+------------+--------------------+
|(5,[0,1],[1.0,-0....|           0|-0.14863505139309016|
|(5,[0,1],[1.0,-0....|           0|-0.14863505139309016|
| (5,[0,1],[1.0,0.5])|           0| -0.1338735784063694|
|(5,[0,2],[1.0,63....|           0|-0.06361700273470748|
|(5,[0,2],[1.0,67.1])|           0|-0.05933087657339774|
|(5,[0,2],[1.0,99....|           0|-0.02042603910920...|
|(5,[0,4],[1.0,26.0])|           0|-0.16128617337538065|
|     (5,[1],[-0.75])|           0|-0.14568958564616344|
|     (5,[1],[-0.75])|           0|-0.14568958564616344|
|(5,[1,2],[-0.75,2...|           0|-0.11520713595349198|
|(5,[1,2],[-0.75,5...|           0|-0.07546305700316518|
|(5,[1,2],[-0.75,6...|           0| -0.0725556847119271|
|(5,[1,2],[-0.75,6...|           0|-0.07015785189441115|
|(5,[1,2],[-0.75,6...|           0|-0.06925866458784267|
|(5,[1,2],[-0.75,6...|         

In [71]:
# Inspect the class of the evaluation result to know which attributes it exposes.
type(pred_results)

pyspark.ml.regression.LinearRegressionSummary

In [74]:
# Report the mean absolute error and mean squared error from the evaluation
# summary on one line, separated by " , ".
print(pred_results.meanAbsoluteError, pred_results.meanSquaredError, sep=' , ')

0.27883818372085145 , 0.11737232758054582


In [76]:
# Show the root mean squared error from the evaluation summary.
pred_results.rootMeanSquaredError

0.3425964500407817

Py4JJavaError: An error occurred while calling o299.coefficientStandardErrors.
: java.lang.UnsupportedOperationException: No Std. Error of coefficients available for this LinearRegressionModel
	at org.apache.spark.ml.regression.LinearRegressionSummary.coefficientStandardErrors$lzycompute(LinearRegression.scala:992)
	at org.apache.spark.ml.regression.LinearRegressionSummary.coefficientStandardErrors(LinearRegression.scala:990)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
