In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
# Configure the Spark-related environment variables for this kernel.
# os.environ only accepts str values, so assigning os.getenv('SPARK_HOME')
# directly raises TypeError when SPARK_HOME is defined neither in the shell
# nor in the .env file. Only write the value back when it is actually present.
spark_home = os.getenv('SPARK_HOME')
if spark_home is not None:
    os.environ['SPARK_HOME'] = spark_home
# NOTE(review): with SPARK_HOME unset, a pip-installed PySpark is expected to
# fall back to its bundled Spark distribution — confirm for this environment.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = 'lab'
os.environ['PYSPARK_PYTHON'] = "python"

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get (or create) the Spark session for this notebook, pointed at the
# 'local[*]' master and registered under the notebook's application name.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Mediacal_cost_prediction')
    .getOrCreate()
)

In [4]:
# Load the CSV with its first line as the header and with column types
# inferred from the data, then print the first rows as a sanity check.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./data/medical.csv')
)
df.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [5]:
# Treat every string-typed column as categorical.
# Selecting by dtype (instead of removing 'age', 'bmi', 'children' and
# 'charges' from df.columns one by one) keeps this cell re-runnable after later
# cells have added numeric/vector columns to df, and avoids the ValueError that
# list.remove() raises when one of the hard-coded names is absent.
# For the schema shown above this yields the same list: sex, smoker, region.
categorical_cols = [name for name, dtype in df.dtypes if dtype == 'string']

In [6]:
from pyspark.ml.feature import StringIndexer

# Index every categorical column into a matching '<column>_index' column.
# The output names are derived from categorical_cols so that inputCols and
# outputCols always have the same length and order; a hard-coded output list
# would silently mislabel columns if the input list ever changed.
string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=[name + '_index' for name in categorical_cols],
)

# After fitting, `string_indexer` refers to the fitted model (same rebinding
# as before), which is then used to add the index columns to df.
string_indexer = string_indexer.fit(df)

df = string_indexer.transform(df)


In [7]:
from pyspark.sql.functions import col

# Replace the sex and smoker index columns with their integer-cast versions.
for index_name in ('sex_index', 'smoker_index'):
    df = df.withColumn(index_name, col(index_name).cast('int'))

In [8]:
from pyspark.ml.feature import OneHotEncoder

# Fit a one-hot encoder on region_index; `encoder` ends up bound to the fitted
# model, which then adds the region_one_hot column to df.
encoder = OneHotEncoder(
    outputCol='region_one_hot',
    inputCol='region_index',
).fit(df)
df = encoder.transform(df)

In [9]:
from pyspark.ml.feature import VectorAssembler

# Pack the three numerical columns into a single vector column so they can be
# processed together downstream.
assembler = VectorAssembler(
    outputCol='numerical_cols_vector',
    inputCols=['age', 'bmi', 'children'],
)
df = assembler.transform(df)

In [10]:
from pyspark.ml.feature import StandardScaler

# Fit a scaler (both withMean and withStd enabled) on the numerical vector;
# `scaler` ends up bound to the fitted model, which then adds the
# scaled_numerical_cols_vector column to df.
# NOTE(review): the scaler is fitted on the whole of df — if a train/test
# split happens later in the notebook, confirm this is intended.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol='numerical_cols_vector',
    outputCol='scaled_numerical_cols_vector',
).fit(df)
df = scaler.transform(df)

In [11]:
# Combine the two index columns, the one-hot region vector and the scaled
# numerical vector (in that order) into the final feature vector column.
assembler = VectorAssembler(
    outputCol='final_features_vector',
    inputCols=[
        'sex_index',
        'smoker_index',
        'region_one_hot',
        'scaled_numerical_cols_vector',
    ],
)
df = assembler.transform(df)

In [12]:
# Display the first ten assembled feature vectors as a list of Row objects.
df.select('final_features_vector').head(10)

[Row(final_features_vector=DenseVector([1.0, 1.0, 0.0, 0.0, 1.0, -1.4382, -0.4532, -0.9083])),
 Row(final_features_vector=SparseVector(8, {2: 1.0, 5: -1.5094, 6: 0.5094, 7: -0.0787})),
 Row(final_features_vector=SparseVector(8, {2: 1.0, 5: -0.7977, 6: 0.3832, 7: 1.5803})),
 Row(final_features_vector=SparseVector(8, {3: 1.0, 5: -0.4418, 6: -1.305, 7: -0.9083})),
 Row(final_features_vector=SparseVector(8, {3: 1.0, 5: -0.513, 6: -0.2924, 7: -0.9083})),
 Row(final_features_vector=DenseVector([1.0, 0.0, 1.0, 0.0, 0.0, -0.5841, -0.8074, -0.9083])),
 Row(final_features_vector=DenseVector([1.0, 0.0, 1.0, 0.0, 0.0, 0.4835, 0.4553, -0.0787])),
 Row(final_features_vector=DenseVector([1.0, 0.0, 0.0, 1.0, 0.0, -0.1571, -0.4794, 1.5803])),
 Row(final_features_vector=SparseVector(8, {5: -0.1571, 6: -0.1367, 7: 0.7508})),
 Row(final_features_vector=DenseVector([1.0, 0.0, 0.0, 1.0, 0.0, 1.4799, -0.791, -0.9083]))]