In [1]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [26]:
# Make an extra dist-packages directory importable from this kernel.
# NOTE(review): presumably this is where findspark/pyspark are installed — confirm.
import sys
sys.path.append("/usr/local/lib/python3.10/dist-packages")  

# findspark.init() is called with the local Spark installation directory
# before any pyspark module is imported below.
import findspark
findspark.init("/opt/spark/spark-3.5.4-bin-hadoop3")  # Path to your Spark installation
from pyspark.sql import SparkSession
# Classifiers, evaluator and feature assembler used by later cells.
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler


In [27]:

# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Model Comparison")
    .getOrCreate()
)


In [28]:

# Read the crops CSV: the first row is the header and column types are inferred.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./Synthetic_Crops_dataset.csv")
)
data.show()


+----------+---------+-------------+----------------+-------+--------------+-----------+------------+---------------+---------+
|    Region|Crop Type|Rainfall (mm)|Temperature (°C)|Soil pH|Nitrogen (ppm)|Sowing Date|Harvest Date|Yield (tons/ha)|Soil Type|
+----------+---------+-------------+----------------+-------+--------------+-----------+------------+---------------+---------+
|   Solapur|    Maize|        641.5|            25.9|    5.7|          30.6| 2023-07-08|  2023-11-30|           2.38|    Sandy|
|     Latur|    Wheat|        621.3|            31.7|    7.3|          50.2| 2023-01-03|  2023-06-11|           4.79|    Loamy|
|    Satara|    Wheat|       1037.4|            22.1|    5.8|          36.9| 2023-06-03|  2023-10-19|           3.62|     Clay|
|    Satara|     Toor|        731.3|            23.3|    7.4|          58.6| 2023-07-13|  2023-10-14|           3.53|     Clay|
|    Satara|    Bajra|       1040.6|            28.7|    6.2|          56.9| 2023-12-24|  2024-05-25|   

In [29]:
from pyspark.ml.feature import StringIndexer

# One indexer per categorical column; each writes its own *_index column.
state_indexer = StringIndexer(inputCol="Region", outputCol="state_index")
crop_indexer = StringIndexer(inputCol="Crop Type", outputCol="crop_index")
soil_indexer = StringIndexer(inputCol="Soil Type", outputCol="soil_index")

# Fit and apply the indexers one after another, in the order region, crop, soil.
for indexer in (state_indexer, crop_indexer, soil_indexer):
    data = indexer.fit(data).transform(data)


In [30]:
from pyspark.sql.functions import col, floor
from pyspark.sql.types import IntegerType

# Replace each index column with its floor so the indices are whole numbers.
# The column names stay the same; only the three index columns are touched.
for index_column in ("state_index", "crop_index", "soil_index"):
    data = data.withColumn(index_column, floor(col(index_column)))

#data = data.withColumn("label", col("Yield (tons/ha)").cast(IntegerType()))


In [32]:


# Columns packed into the model's feature vector, in this order.
feature_columns = [
    "state_index",
    "crop_index",
    "Rainfall (mm)",
    "Temperature (°C)",
    "Soil pH",
    "Nitrogen (ppm)",
    "soil_index",
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the feature vector and the yield column, then cast the yield to an
# integer label.
# NOTE(review): the integer cast drops the fractional part of the yield
# (e.g. 2.38 becomes 2) — confirm that is intended for the regressor trained later.
data = (
    assembler.transform(data)
    .select("features", "Yield (tons/ha)")
    .withColumn("Yield (tons/ha)", col("Yield (tons/ha)").cast(IntegerType()))
)
#data = data.withColumn("features", col("features").cast(IntegerType()))



In [33]:
# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=42)

# Show the schema of the assembled dataset.
data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yield (tons/ha): integer (nullable = true)



In [34]:

 #Define models
# Candidate classifiers keyed by display name; all read the same feature
# vector and the same label column.
models = {
    display_name: estimator_cls(featuresCol="features", labelCol="Yield (tons/ha)")
    for display_name, estimator_cls in (
        ("Logistic Regression", LogisticRegression),
        ("Random Forest", RandomForestClassifier),
        ("Decision Tree", DecisionTreeClassifier),
    )
}


In [35]:
# Accuracy evaluator comparing the "prediction" column with the yield label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Yield (tons/ha)",
    predictionCol="prediction",
    metricName="accuracy",
)


In [36]:
# Peek at the distinct feature vectors in the training split...
distinct_train_features = train_data.select("features").distinct()
distinct_train_features.show()

# ...and at the label column of the test split.
test_labels = test_data.select("Yield (tons/ha)")
test_labels.show()


+--------------------+
|            features|
+--------------------+
|[5.0,0.0,757.0,29...|
|[5.0,1.0,1082.7,3...|
|[5.0,2.0,1327.5,2...|
|[8.0,5.0,402.2,23...|
|[0.0,5.0,810.8,31...|
|[1.0,6.0,999.0,27...|
|[2.0,4.0,621.7,20...|
|[6.0,1.0,639.9,22...|
|[6.0,4.0,1068.9,3...|
|[7.0,3.0,438.4,28...|
|[9.0,3.0,659.4,24...|
|[0.0,1.0,635.4,23...|
|[1.0,1.0,921.9,22...|
|[1.0,0.0,730.7,32...|
|[2.0,4.0,699.8,22...|
|[5.0,6.0,859.2,27...|
|[7.0,0.0,893.3,24...|
|[7.0,2.0,1341.3,2...|
|[7.0,4.0,974.1,26...|
|[0.0,0.0,716.7,21...|
+--------------------+
only showing top 20 rows

+---------------+
|Yield (tons/ha)|
+---------------+
|              4|
|              2|
|              2|
|              3|
|              4|
|              4|
|              5|
|              2|
|              2|
|              2|
|              2|
|              4|
|              2|
|              2|
|              2|
|              4|
|              2|
|              4|
|              5|
|              4|
+-------

In [37]:
# from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Random-forest regressor on the assembled feature vector. The seed is fixed
# (same value as the train/test split) so the forest's random sampling is
# repeatable from one run of the notebook to the next.
rf = RandomForestRegressor(featuresCol='features', labelCol='Yield (tons/ha)', seed=42)

# Train the model on the training split
model = rf.fit(train_data)

# Make predictions on the held-out test split
predictions = model.transform(test_data)
predictions.show()

# Save the trained model, overwriting any model already stored at this path
model_path = "/home/abhay/Spark_dir/Research_Project/models/random_forest_model"
model.write().overwrite().save(model_path)
print(f"Model saved successfully at {model_path}")

# Evaluate model performance with RMSE. The evaluator gets its own name so the
# accuracy `evaluator` defined in an earlier cell is not silently replaced.
rmse_evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")



+--------------------+---------------+------------------+
|            features|Yield (tons/ha)|        prediction|
+--------------------+---------------+------------------+
|[0.0,0.0,680.6,22...|              4| 3.169499446086983|
|[0.0,0.0,838.7,23...|              2| 3.170009948709631|
|[0.0,0.0,1034.1,3...|              2| 3.294024933967022|
|[0.0,1.0,500.8,29...|              3|3.2395811976224955|
|[0.0,1.0,758.8,21...|              4| 3.335875011060582|
|[0.0,1.0,877.7,24...|              4|3.0989124214035937|
|[0.0,1.0,1019.0,2...|              5|3.2180977718038326|
|[0.0,2.0,1023.3,3...|              2| 3.034293577645074|
|[0.0,3.0,400.9,24...|              2|3.6855029378213358|
|[0.0,3.0,406.8,28...|              2| 3.312517742097809|
|[0.0,3.0,421.6,24...|              2|  3.63421986854299|
|[0.0,3.0,531.9,28...|              4|3.2866768448835635|
|[0.0,3.0,569.1,26...|              2|3.3241325344555728|
|[0.0,3.0,600.5,29...|              2|3.2291498458381946|
|[0.0,3.0,870.

In [14]:
# # Train and evaluate models
# from pyspark.ml.evaluation import RegressionEvaluator

# evaluator2 = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

# results = {}
# for name, model in models.items():
#     # Train the model
#     trained_model = model.fit(train_data)
    
#     # Make predictions
#     predictions = trained_model.transform(test_data)
#     predictions.show(5)
#     # Evaluate accuracy
    

#     #evaluator = RegressionEvaluator(labelCol="Yield (tons/ha)", predictionCol="prediction", metricName="rmse")

#     rmse = evaluator2.evaluate(predictions)
#     print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

#     accuracy = evaluator.evaluate(predictions)
#     results[name] = accuracy
#     print(f"{name} Accuracy: {accuracy:.2f}")



+--------------------+---------------+--------------------+--------------------+----------+
|            features|Yield (tons/ha)|       rawPrediction|         probability|prediction|
+--------------------+---------------+--------------------+--------------------+----------+
|[0.0,0.0,680.6,22...|              4|[-15.192059039953...|[2.95179166203231...|       2.0|
|[0.0,0.0,838.7,23...|              2|[-14.992491048902...|[3.92212200795026...|       2.0|
|[0.0,0.0,1034.1,3...|              2|[-14.859890398693...|[4.81972421271956...|       2.0|
|[0.0,1.0,500.8,29...|              3|[-15.608509220501...|[1.61152520045685...|       2.0|
|[0.0,1.0,758.8,21...|              4|[-15.022829330871...|[3.83011072462622...|       2.0|
+--------------------+---------------+--------------------+--------------------+----------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data: 1.3263707095903692
Logistic Regression Accuracy: 0.30
+--------------------+---------------+----------

In [43]:

# sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

# print("\nModel Performance Comparison:")
# for name, accuracy in sorted_results:
#     print(f"{name}: {accuracy:.2f}")

# Stop the Spark session now that the notebook is finished. Re-run the cell
# that creates `spark` before running any of the earlier cells again.
spark.stop()
