# Linear Regression Model Iteration

In [2]:
from pyspark.sql import SparkSession

#Initialize SparkSession
spark = SparkSession.builder \
     .appName("LinearRegressionModelImprovement") \
     .getOrCreate()
# Download Data
data_path = "Fin.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

                                                                                

In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

#Define feature columns
feature_columns = ['Income Level']

# Combine feature vectors
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_income = assembler.transform(df)

# Select features and target columns
df_income = df_income.select("features", "Incidence of malnutrition")

# Split the dataset
train_data, test_data = df_income.randomSplit([0.8, 0.2], seed=42)

#Initialize linear regression model
lr = LinearRegression(featuresCol='features', labelCol='Incidence of malnutrition')

# Train the model on the training set
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Calculate model evaluation indicators
evaluator = RegressionEvaluator(labelCol="Incidence of malnutrition", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
r2 = lr_model.summary.r2

print(f"Model 1 - Income Level:")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

24/05/23 17:09:20 WARN Instrumentation: [313408ed] regParam is zero, which might cause numerical instability and overfitting.
24/05/23 17:09:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/23 17:09:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
24/05/23 17:09:21 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Model 1 - Income Level:
RMSE: 8.287832999788485
R2: 0.44264770575913026


In [4]:
#Define feature columns
feature_columns = ['Agriculture_CG']

# Combine feature vectors
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_agriculture = assembler.transform(df)

# Select features and target columns
df_agriculture = df_agriculture.select("features", "Incidence of malnutrition")

# Split the dataset
train_data, test_data = df_agriculture.randomSplit([0.8, 0.2], seed=42)

#Initialize linear regression model
lr = LinearRegression(featuresCol='features', labelCol='Incidence of malnutrition')

# Train the model on the training set
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Calculate model evaluation indicators
evaluator = RegressionEvaluator(labelCol="Incidence of malnutrition", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
r2 = lr_model.summary.r2

print(f"Model 2 - Agriculture_CG:")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

24/05/23 17:09:31 WARN Instrumentation: [0ca1fa5e] regParam is zero, which might cause numerical instability and overfitting.


Model 2 - Agriculture_CG:
RMSE: 11.387560101414723
R2: 0.13692118297939748


In [5]:
#Define feature columns
feature_columns = ['Total_Expenditure_CG']

# Combine feature vectors
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_total_expenditure = assembler.transform(df)

# Select features and target columns
df_total_expenditure = df_total_expenditure.select("features", "Incidence of malnutrition")

# Split the dataset
train_data, test_data = df_total_expenditure.randomSplit([0.8, 0.2], seed=42)

#Initialize linear regression model
lr = LinearRegression(featuresCol='features', labelCol='Incidence of malnutrition')

# Train the model on the training set
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Calculate model evaluation indicators
evaluator = RegressionEvaluator(labelCol="Incidence of malnutrition", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
r2 = lr_model.summary.r2

print(f"Model 3 - Total_Expenditure_CG:")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")


24/05/23 17:09:50 WARN Instrumentation: [1b67ab38] regParam is zero, which might cause numerical instability and overfitting.


Model 3 - Total_Expenditure_CG:
RMSE: 12.091561504175314
R2: 0.13383624145976258
