In [7]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [29]:
import sys
sys.path.append("/usr/local/lib/python3.10/dist-packages")  

import findspark
findspark.init("/opt/spark/spark-3.5.4-bin-hadoop3")  # Path to your Spark installation
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler


In [30]:

spark = SparkSession.builder \
    .appName("Model Comparison") \
    .getOrCreate()


In [31]:

# Load data
data = spark.read.csv("./synthetic_crop_yield_dataset.csv", header=True, inferSchema=True)
data.show()


+---------+---------+-------------+----------------+-------+--------------+-----------+------------+---------------+---------+
|   Region|Crop Type|Rainfall (mm)|Temperature (°C)|Soil pH|Nitrogen (ppm)|Sowing Date|Harvest Date|Yield (tons/ha)|Soil Type|
+---------+---------+-------------+----------------+-------+--------------+-----------+------------+---------------+---------+
|  Haryana|    Maize|        422.7|            22.0|    5.8|          22.3| 2023-03-11|  2023-11-13|            2.2|     Clay|
|  Haryana|     Rice|        394.9|            23.5|    7.0|          54.9| 2023-02-05|  2023-09-08|            3.6|    Loamy|
|  Haryana|    Wheat|        306.7|            29.8|    7.0|          54.4| 2023-01-07|  2023-08-19|            2.7|    Sandy|
|   Punjab|    Wheat|        592.3|            34.9|    6.5|          58.0| 2023-05-18|  2023-10-19|            3.7|     Clay|
|   Punjab|     Rice|        653.2|            22.7|    6.3|          24.7| 2023-05-23|  2023-12-09|           

In [32]:
from pyspark.ml.feature import StringIndexer


state_indexer = StringIndexer(inputCol="Region", outputCol="state_index")
data = state_indexer.fit(data).transform(data)

crop_indexer = StringIndexer(inputCol="Crop Type", outputCol="crop_index")
data = crop_indexer.fit(data).transform(data)

soil_indexer = StringIndexer(inputCol="Soil Type", outputCol="soil_index")
data = soil_indexer.fit(data).transform(data)


In [33]:
from pyspark.sql.functions import col, floor
from pyspark.sql.types import IntegerType
# Convert label column to integer by flooring values (or round/mapping)

# Ensure indices are integers
data = data.withColumn("state_index", floor(col("state_index")))
data = data.withColumn("crop_index", floor(col("crop_index")))
data = data.withColumn("soil_index", floor(col("soil_index")))

#data = data.withColumn("label", col("Yield (tons/ha)").cast(IntegerType()))


In [34]:


feature_columns = ["state_index", "crop_index", "Rainfall (mm)", "Temperature (°C)", "Soil pH", "Nitrogen (ppm)"]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

data = assembler.transform(data).select("features", "Yield (tons/ha)")

data = data.withColumn("Yield (tons/ha)", col("Yield (tons/ha)").cast(IntegerType()))
#data = data.withColumn("features", col("features").cast(IntegerType()))



In [35]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yield (tons/ha): integer (nullable = true)



In [36]:

 #Define models
models = {
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="Yield (tons/ha)"),
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="Yield (tons/ha)"),
    "Decision Tree": DecisionTreeClassifier(featuresCol="features", labelCol="Yield (tons/ha)")
}


In [37]:
# Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="accuracy"
)


In [38]:
train_data.select("features").distinct().show()
test_data.select("Yield (tons/ha)").show()


+--------------------+
|            features|
+--------------------+
|[0.0,0.0,277.4,34...|
|[1.0,2.0,563.5,15...|
|[2.0,0.0,674.8,24...|
|[1.0,2.0,779.9,34...|
|[1.0,0.0,231.3,26...|
|[2.0,0.0,657.4,34...|
|[2.0,0.0,244.9,29...|
|[0.0,1.0,261.0,25...|
|[0.0,2.0,667.9,25...|
|[2.0,1.0,763.3,33...|
|[0.0,2.0,358.2,31...|
|[0.0,0.0,435.7,33...|
|[0.0,0.0,610.5,34...|
|[1.0,0.0,604.7,15...|
|[1.0,1.0,440.8,26...|
|[2.0,1.0,401.3,20...|
|[0.0,0.0,449.2,17...|
|[0.0,1.0,586.6,20...|
|[1.0,0.0,394.3,20...|
|[1.0,2.0,372.4,17...|
+--------------------+
only showing top 20 rows

+---------------+
|Yield (tons/ha)|
+---------------+
|              1|
|              3|
|              2|
|              4|
|              4|
|              3|
|              4|
|              4|
|              2|
|              3|
|              4|
|              5|
|              5|
|              3|
|              3|
|              2|
|              1|
+---------------+



In [41]:
# from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize RandomForestRegressor
rf = RandomForestRegressor(featuresCol='features', labelCol='Yield (tons/ha)')

# Train the model
model = rf.fit(train_data)

# Make predictions
predictions = model.transform(test_data)
predictions.show()

# Save the trained model
model_path = "/home/abhay/Spark_dir/Research_Project/models/random_forest_model"
model.write().overwrite().save(model_path)
print(f"Model saved successfully at {model_path}")

# Evaluate model performance
evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")





+--------------------+---------------+------------------+
|            features|Yield (tons/ha)|        prediction|
+--------------------+---------------+------------------+
|[0.0,0.0,318.9,17...|              1| 2.199305194805195|
|[0.0,0.0,503.7,31...|              3| 2.666708754208754|
|[0.0,0.0,581.3,19...|              2|2.6546079192546586|
|[0.0,1.0,207.6,29...|              4|2.7465778403009393|
|[0.0,1.0,430.1,25...|              4| 2.947558361626729|
|[0.0,1.0,467.6,30...|              3| 3.271849758212176|
|[0.0,2.0,232.8,18...|              4| 2.750934099955772|
|[0.0,2.0,592.4,23...|              4| 2.386565054883742|
|[1.0,0.0,399.7,31...|              2|3.3258759037211716|
|[1.0,0.0,400.0,17...|              3| 2.698248196248196|
|[1.0,0.0,407.5,30...|              4| 2.660665094277899|
|[1.0,0.0,480.6,32...|              5|2.8392094135207477|
|[1.0,0.0,604.6,33...|              5|2.7432912153742888|
|[1.0,0.0,726.5,32...|              3| 3.084225700338505|
|[1.0,1.0,402.

In [42]:
# # Train and evaluate models
# from pyspark.ml.evaluation import RegressionEvaluator

# evaluator2 = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

# results = {}
# for name, model in models.items():
#     # Train the model
#     trained_model = model.fit(train_data)
    
#     # Make predictions
#     predictions = trained_model.transform(test_data)
#     predictions.show(5)
#     # Evaluate accuracy
    

#     #evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

#     rmse = evaluator2.evaluate(predictions)
#     print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

#     accuracy = evaluator.evaluate(predictions)
#     results[name] = accuracy
#     print(f"{name} Accuracy: {accuracy:.2f}")



In [43]:

# sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

# print("\nModel Performance Comparison:")
# for name, accuracy in sorted_results:
#     print(f"{name}: {accuracy:.2f}")

# Stop the Spark session
spark.stop()
