# Build the first model: Linear Regression

In [4]:
import os

import findspark

# Point findspark at the Spark installation. An explicitly set, non-empty
# SPARK_HOME wins; otherwise fall back to the path this notebook was written
# against, so the cell behaves as before on the original machine.
findspark.init(
    os.environ.get("SPARK_HOME") or '/home/ubuntu/spark-3.2.1-bin-hadoop2.7'
)

# Imported after findspark.init(), in the same order as the original cell.
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

# Start (or reuse) the Spark session used by the cells below.
spark = SparkSession.builder.appName('linear_regression_adv').getOrCreate()

# Load the dataset from a local CSV file: the first row supplies the column
# names and Spark infers each column's type from the data.
data_path = "Fin.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Preview the first five rows.
df.show(5)


+---------------+-------+----+--------------+--------------------+-------------------------+------------------+--------------------------------+---------+------------+
|Area Code (FAO)|   Area|Year|Agriculture_CG|Total_Expenditure_CG|Incidence of malnutrition|GDP_Per_Capita_PPP|Children_Under_5_Wasting_Percent|GDP(mean)|Income Level|
+---------------+-------+----+--------------+--------------------+-------------------------+------------------+--------------------------------+---------+------------+
|              3|Albania|2017|         60.28|             2936.56|                      4.2|           12771.0|                             1.6| 13502.28|         3.0|
|              3|Albania|2018|         74.81|             3309.15|                      4.1|           13317.1|                             1.6| 13502.28|         3.0|
|              3|Albania|2019|         55.54|             3285.52|                      4.1|           13653.2|                             1.6| 13502.28|      

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Column the regression is trained to predict.
label_column = "Children_Under_5_Wasting_Percent"

# Input columns, in the order they are packed into the feature vector.
feature_columns = [
    'Area Code (FAO)',
    'Year',
    'Agriculture_CG',
    'Total_Expenditure_CG',
    'Incidence of malnutrition',
    'GDP_Per_Capita_PPP',
    'GDP(mean)',
    'Income Level',
]

# Pack the inputs into a single "features" vector column and keep only the
# two columns the model needs.
# NOTE: this overwrites `df`, so the cell cannot be re-run without reloading
# the CSV first.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df).select("features", label_column)

# Configure and fit an ordinary linear regression on the full dataset.
lr = LinearRegression(featuresCol='features', labelCol=label_column)
lr_model = lr.fit(df)

# Score the same rows the model was trained on (there is no held-out split).
predictions = lr_model.transform(df)

# Show predicted vs. actual values for the first five rows.
predictions.select("prediction", label_column).show(5)


24/05/23 14:02:10 WARN Instrumentation: [09896dc0] regParam is zero, which might cause numerical instability and overfitting.


+------------------+--------------------------------+
|        prediction|Children_Under_5_Wasting_Percent|
+------------------+--------------------------------+
|  3.48505875684981|                             1.6|
| 3.440212077545578|                             1.6|
|3.4871182437184167|                             1.6|
| 3.405726628231374|                             1.6|
|3.3107475703069333|                             1.6|
+------------------+--------------------------------+
only showing top 5 rows



# Build the second model: Decision Tree

In [15]:
from pyspark.sql import SparkSession

# Obtain a session for this section. getOrCreate() returns the session that
# is already active in the kernel when there is one, and only builds a new
# one otherwise.
spark = (
    SparkSession.builder
    .appName("DecisionTreeModel")
    .getOrCreate()
)

# Reload the raw CSV so `df` has all of its original columns again.
data_path = "Fin.csv"
df = spark.read.csv(path=data_path, inferSchema=True, header=True)

In [16]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Column the tree is trained to predict.
label_column = "Incidence of malnutrition"

# Input columns, in the order they are packed into the feature vector.
feature_columns = [
    'Area Code (FAO)',
    'Year',
    'Agriculture_CG',
    'Total_Expenditure_CG',
    'GDP_Per_Capita_PPP',
    'Children_Under_5_Wasting_Percent',
    'GDP(mean)',
    'Income Level',
]

# Pack the inputs into a single "features" vector column and keep only the
# two columns the model needs.
# NOTE: this overwrites `df`, so the cell cannot be re-run without reloading
# the CSV first.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df).select("features", label_column)

# Configure and fit a decision-tree regressor on the full dataset.
dt = DecisionTreeRegressor(featuresCol='features', labelCol=label_column)
dt_model = dt.fit(df)

# Score the same rows the model was trained on (there is no held-out split).
predictions = dt_model.transform(df)

# Show predicted vs. actual values for the first five rows.
predictions.select("prediction", label_column).show(5)

+-----------------+-------------------------+
|       prediction|Incidence of malnutrition|
+-----------------+-------------------------+
|3.878166666666666|                      4.2|
|3.878166666666666|                      4.1|
|3.878166666666666|                      4.1|
|3.878166666666666|                      4.1|
|3.878166666666666|                      4.1|
+-----------------+-------------------------+
only showing top 5 rows



# Build the third model: Random Forest

In [17]:
from pyspark.sql import SparkSession

# Obtain a session for this section. getOrCreate() returns the session that
# is already active in the kernel when there is one, and only builds a new
# one otherwise.
spark = (
    SparkSession.builder
    .appName("RandomForestModel")
    .getOrCreate()
)

# Reload the raw CSV so `df` has all of its original columns again.
data_path = "Fin.csv"
df = spark.read.csv(path=data_path, inferSchema=True, header=True)

In [18]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier

# Input columns, in the order they are packed into the feature vector.
feature_columns = [
     'Area Code (FAO)', 'Year', 'Agriculture_CG',
     'Total_Expenditure_CG', 'GDP_Per_Capita_PPP',
     'Children_Under_5_Wasting_Percent', 'GDP(mean)', 'Income Level'
]

# Pack the inputs into a single "features" vector column.
# NOTE: this cell overwrites `df`, so it cannot be re-run without reloading
# the CSV first.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# Turn the target into class indices. StringIndexer assigns one index per
# distinct value of "Incidence of malnutrition" (by default the most frequent
# value gets 0.0), so both "label" and "prediction" below are class indices,
# not malnutrition values.
indexer = StringIndexer(inputCol="Incidence of malnutrition", outputCol="label")
df = indexer.fit(df).transform(df)

# Keep only the two columns the model needs.
df = df.select("features", "label")

# Configure the random-forest classifier. The forest is built with random
# sampling, so the seed is fixed to make re-runs of this cell give the same
# model and predictions.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    numTrees=10,
    seed=42,
)

# Fit on the full dataset.
rf_model = rf.fit(df)

# Score the same rows the model was trained on (there is no held-out split).
predictions = rf_model.transform(df)

# Show predicted vs. actual class indices for the first five rows.
predictions.select("prediction", "label").show(5)


                                                                                

+----------+-----+
|prediction|label|
+----------+-----+
|       8.0|  2.0|
|       8.0|  8.0|
|       8.0|  8.0|
|       8.0|  8.0|
|       8.0|  8.0|
+----------+-----+
only showing top 5 rows

