In [7]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [1]:
import sys
sys.path.append("/usr/local/lib/python3.10/dist-packages")  

import findspark
findspark.init("/opt/spark/spark-3.5.4-bin-hadoop3")  # Path to your Spark installation
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler


In [2]:

spark = SparkSession.builder \
    .appName("Model Comparison") \
    .getOrCreate()


25/01/21 13:36:25 WARN Utils: Your hostname, abhay-ki-bandi resolves to a loopback address: 127.0.1.1; using 192.168.163.114 instead (on interface wlp0s20f3)
25/01/21 13:36:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/21 13:36:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/21 13:36:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/21 13:36:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:

# Load data
data = spark.read.csv("./synthetic_crop_yield_dataset.csv", header=True, inferSchema=True)
data.show()


+---------+---------+-------------+----------------+-------+--------------+-----------+------------+---------------+---------+
|   Region|Crop Type|Rainfall (mm)|Temperature (°C)|Soil pH|Nitrogen (ppm)|Sowing Date|Harvest Date|Yield (tons/ha)|Soil Type|
+---------+---------+-------------+----------------+-------+--------------+-----------+------------+---------------+---------+
|  Haryana|    Maize|        422.7|            22.0|    5.8|          22.3| 2023-03-11|  2023-11-13|            2.2|     Clay|
|  Haryana|     Rice|        394.9|            23.5|    7.0|          54.9| 2023-02-05|  2023-09-08|            3.6|    Loamy|
|  Haryana|    Wheat|        306.7|            29.8|    7.0|          54.4| 2023-01-07|  2023-08-19|            2.7|    Sandy|
|   Punjab|    Wheat|        592.3|            34.9|    6.5|          58.0| 2023-05-18|  2023-10-19|            3.7|     Clay|
|   Punjab|     Rice|        653.2|            22.7|    6.3|          24.7| 2023-05-23|  2023-12-09|           

In [4]:
from pyspark.ml.feature import StringIndexer


state_indexer = StringIndexer(inputCol="Region", outputCol="state_index")
data = state_indexer.fit(data).transform(data)

crop_indexer = StringIndexer(inputCol="Crop Type", outputCol="crop_index")
data = crop_indexer.fit(data).transform(data)

soil_indexer = StringIndexer(inputCol="Soil Type", outputCol="soil_index")
data = soil_indexer.fit(data).transform(data)


In [5]:
from pyspark.sql.functions import col, floor
from pyspark.sql.types import IntegerType
# Convert label column to integer by flooring values (or round/mapping)

# Ensure indices are integers
data = data.withColumn("state_index", floor(col("state_index")))
data = data.withColumn("crop_index", floor(col("crop_index")))
data = data.withColumn("soil_index", floor(col("soil_index")))

#data = data.withColumn("label", col("Yield (tons/ha)").cast(IntegerType()))


In [6]:


feature_columns = ["state_index", "crop_index", "Rainfall (mm)", "Temperature (°C)", "Soil pH", "Nitrogen (ppm)"]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

data = assembler.transform(data).select("features", "Yield (tons/ha)")

data = data.withColumn("Yield (tons/ha)", col("Yield (tons/ha)").cast(IntegerType()))
#data = data.withColumn("features", col("features").cast(IntegerType()))



In [7]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yield (tons/ha): integer (nullable = true)



In [8]:

 #Define models
models = {
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="Yield (tons/ha)"),
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="Yield (tons/ha)"),
    "Decision Tree": DecisionTreeClassifier(featuresCol="features", labelCol="Yield (tons/ha)")
}


In [9]:
# Evaluator
#evaluator = MulticlassClassificationEvaluator(
 #   labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse"
#)


In [10]:
train_data.select("features").distinct().show()
test_data.select("Yield (tons/ha)").distinct().show()


+--------------------+
|            features|
+--------------------+
|[0.0,0.0,277.4,34...|
|[1.0,2.0,563.5,15...|
|[2.0,0.0,674.8,24...|
|[1.0,2.0,779.9,34...|
|[1.0,0.0,231.3,26...|
|[2.0,0.0,657.4,34...|
|[2.0,0.0,244.9,29...|
|[0.0,1.0,261.0,25...|
|[0.0,2.0,667.9,25...|
|[2.0,1.0,763.3,33...|
|[0.0,2.0,358.2,31...|
|[0.0,0.0,435.7,33...|
|[0.0,0.0,610.5,34...|
|[1.0,0.0,604.7,15...|
|[1.0,1.0,440.8,26...|
|[2.0,1.0,401.3,20...|
|[0.0,0.0,449.2,17...|
|[0.0,1.0,586.6,20...|
|[1.0,0.0,394.3,20...|
|[1.0,2.0,372.4,17...|
+--------------------+
only showing top 20 rows

+---------------+
|Yield (tons/ha)|
+---------------+
|              1|
|              3|
|              5|
|              4|
|              2|
+---------------+



In [11]:
#from pyspark.ml.classification import LogisticRegression



#lr = LogisticRegression(featuresCol='features', labelCol='Yield (tons/ha)')  # Replace 'your_target_column' with your actual label column


#model = lr.fit(train_data)

#predictions = model.transform(test_data)


#predictions.show()


#from pyspark.ml.evaluation import RegressionEvaluator

#evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

#rmse = evaluator.evaluate(predictions)
#print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")


In [14]:
# Train and evaluate models
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

results = {}
for name, model in models.items():
    # Train the model
    trained_model = model.fit(train_data)
    
    # Make predictions
    predictions = trained_model.transform(test_data)
    predictions.show(5)
    # Evaluate accuracy
    

    #evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

    #accuracy = evaluator.evaluate(predictions)
    #results[name] = accuracy
    #print(f"{name} Accuracy: {accuracy:.2f}")
spark.stop()


+--------------------+---------------+--------------------+--------------------+----------+
|            features|Yield (tons/ha)|       rawPrediction|         probability|prediction|
+--------------------+---------------+--------------------+--------------------+----------+
|[0.0,0.0,318.9,17...|              1|[-20.212949007303...|[4.31785408491820...|       2.0|
|[0.0,0.0,503.7,31...|              3|[-18.698850157816...|[2.57602318715272...|       3.0|
|[0.0,0.0,581.3,19...|              2|[-18.862777432167...|[2.08997963105298...|       3.0|
|[0.0,1.0,207.6,29...|              4|[-19.661360848696...|[8.42557102746699...|       2.0|
|[0.0,1.0,430.1,25...|              4|[-19.060890117291...|[1.67883559163286...|       2.0|
+--------------------+---------------+--------------------+--------------------+----------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data: 1.5718104959867516
+--------------------+---------------+--------------------+--------------------+---

In [140]:
# Sort results
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

print("\nModel Performance Comparison:")
for name, accuracy in sorted_results:
    print(f"{name}: {accuracy:.2f}")

# Stop the Spark session
spark.stop()



Model Performance Comparison:
Decision Tree: 0.29
Logistic Regression: 0.24
Random Forest: 0.24
