# Regressor

In [0]:
%sql
-- Create the database `db` unless it already exists, so this cell can be re-run safely.
CREATE DATABASE IF NOT EXISTS db

In [0]:
%sql
-- Make `db` the current database; the tables below are created and read by unqualified name.
USE db

In [0]:
%sql
-- Register the motor-insurance CSV as a table in the current database.
-- IF NOT EXISTS keeps the cell idempotent: re-running the notebook no longer
-- fails with "table already exists". Drop the table first if the CSV path or
-- options change, because an existing definition is left untouched.
CREATE TABLE IF NOT EXISTS motor_insurance
USING CSV
OPTIONS (
  path "/FileStore/motor_insurance___2021.csv",
  header "true",
  inferSchema "true",
  delimiter ","
);

In [0]:
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.feature import VectorAssembler 
# Pull the CSV-backed table into a DataFrame, then peek at the first two rows.
motor_table = "motor_insurance"
dfs = spark.table(motor_table)
dfs.show(n=2)

+------+-------------+-------+---------+---------+
|GENDER|INSURED_VALUE|PREMIUM|PROD_YEAR|SEATS_NUM|
+------+-------------+-------+---------+---------+
|     0|     200000.0|3452.65|     1982|       10|
|     0|     200000.0|3077.54|     1982|       10|
+------+-------------+-------+---------+---------+
only showing top 2 rows



In [0]:
# Pack the four predictor columns into one vector column (FEAT_VECT) for Spark ML.
regression_inputs = ['GENDER', 'INSURED_VALUE', 'PROD_YEAR', 'SEATS_NUM']
regression_assembler = VectorAssembler(
    inputCols=regression_inputs,
    outputCol='FEAT_VECT',
)
dfv = regression_assembler.transform(dfs)

# Preview the first five assembled vectors.
dfv.select('FEAT_VECT').show(5)

+--------------------+
|           FEAT_VECT|
+--------------------+
|[0.0,200000.0,198...|
|[0.0,200000.0,198...|
|[0.0,200000.0,198...|
|[1.0,152038.0,200...|
|[0.0,400000.0,200...|
+--------------------+
only showing top 5 rows



In [0]:
# Fit a linear regression of PREMIUM on the assembled FEAT_VECT column.
premium_estimator = LinearRegression(featuresCol="FEAT_VECT", labelCol="PREMIUM")
linreg = premium_estimator.fit(dfv)

# Print the fitted coefficient vector followed by the intercept.
print(linreg.coefficients, linreg.intercept)

[-4.585032639083987,0.009848561462838364,-22.47668831150313,57.64491657921904] 45605.9206051514


In [0]:
# Two hypothetical policies that differ only in GENDER (0 vs 1).
predictor_cols = ['GENDER', 'INSURED_VALUE', 'PROD_YEAR', 'SEATS_NUM']
new_policies = [
    (0, 33000, 2024, 5),
    (1, 33000, 2024, 5),
]
fut = spark.createDataFrame(new_policies, predictor_cols)

# Assemble the same columns, in the same order, into FEAT_VECT as the model expects.
scoring_assembler = VectorAssembler(inputCols=predictor_cols, outputCol='FEAT_VECT')
futv = scoring_assembler.transform(fut)

# Score the rows with the fitted regression and show only the prediction column.
preds = linreg.transform(futv)
preds.select('prediction').show()

+-----------------+
|       prediction|
+-----------------+
|726.3305738388299|
|721.7455411997435|
+-----------------+



# Classifier

In [0]:
%sql
-- Register the loan-default CSV as a table in the current database.
-- IF NOT EXISTS keeps the cell idempotent: re-running the notebook no longer
-- fails with "table already exists". Drop the table first if the CSV path or
-- options change, because an existing definition is left untouched.
CREATE TABLE IF NOT EXISTS Loans_default
USING CSV
OPTIONS (
  path "/FileStore/Loans___default.csv",
  header "true",
  inferSchema "true",
  delimiter ","
);

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
# Load the loan table; note that `dfs` and `dfv` are rebound here and no longer
# refer to the motor-insurance data from the regression section.
dfs = spark.table("Loans_default")

# Predictor columns fed to the classifier, packed into a single `features` vector.
loan_inputs = [
    'Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
    'NumCreditLines', 'InterestRate', 'LoanTerm', 'HasMortgage', 'HasDependents',
]
vector_assembler = VectorAssembler(inputCols=loan_inputs, outputCol="features")
dfv = vector_assembler.transform(dfs)

# Preview the first five assembled vectors.
dfv.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[46.0,84208.0,129...|
|[32.0,31713.0,447...|
|[60.0,20437.0,913...|
|[36.0,42053.0,923...|
|[28.0,140466.0,16...|
+--------------------+
only showing top 5 rows



In [0]:
# Train a random forest on the assembled `features` column with `Default` as the label.
# Random forests are stochastic; a fixed seed makes the fitted model, and therefore
# the prediction below, reproducible across Restart & Run All.
classif = RandomForestClassifier(featuresCol="features", labelCol='Default', seed=42).fit(dfv)

# One hypothetical applicant, with columns in the same order as the assembler's inputs.
# NOTE(review): InterestRate is given as 0.065 -- confirm whether the training column
# stores a fraction or a percentage (e.g. 6.5) before trusting this prediction.
fut = spark.createDataFrame([(33, 88000, 33000, 700, 60, 1, 0.065, 36, 1, 1)],
                            ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines',
                             'InterestRate', 'LoanTerm', 'HasMortgage', 'HasDependents'])

# Reuse the training assembler so the feature vector layout matches the model's input.
futv = vector_assembler.transform(fut)
predictions = classif.transform(futv)
predictions.select("prediction").show()

+----------+
|prediction|
+----------+
|       0.0|
+----------+

