In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [3]:
# Get (or lazily create) the Spark session for this notebook; the application
# is registered under the name 'v1'. The bare name on the last line renders
# the session object as the cell output.
sesh = SparkSession.builder.appName('v1').getOrCreate()
sesh

In [4]:
# Load the crop-recommendation CSV: the first row is treated as the header
# and column types are inferred from the data.
df = (
    sesh.read
    .option('header', 'true')
    .csv('Crop_recommendation.csv', inferSchema=True)
)

In [5]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- N: integer (nullable = true)
 |-- P: integer (nullable = true)
 |-- K: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)



In [6]:
# Summary statistics (count / mean / stddev / min / max) for the raw frame.
raw_summary = df.describe()
raw_summary.show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|summary|                 N|                P|                 K|       temperature|          humidity|                ph|          rainfall|     label|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|  count|              2200|             2200|              2200|              2200|              2200|              2200|              2200|      2200|
|   mean|50.551818181818184|53.36272727272727| 48.14909090909091|25.616243851779533| 71.48177921778648| 6.469480065256369|103.46365541576832|      NULL|
| stddev|36.917333833756594|32.98588273858713|50.647930546660135|5.0637485999588545|22.263811589761104|0.7739376880298732|54.958388524878174|      NULL|
|    min|                 0|                5|                 5|       8.82567474

In [7]:
# Drop every row that contains at least one null value.
df_cleaned = df.dropna()

In [8]:
# Same summary on the null-free frame, for comparison with the raw statistics.
cleaned_summary = df_cleaned.describe()
cleaned_summary.show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|summary|                 N|                P|                 K|       temperature|          humidity|                ph|          rainfall|     label|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|  count|              2200|             2200|              2200|              2200|              2200|              2200|              2200|      2200|
|   mean|50.551818181818184|53.36272727272727| 48.14909090909091|25.616243851779533| 71.48177921778648| 6.469480065256369|103.46365541576832|      NULL|
| stddev|36.917333833756594|32.98588273858713|50.647930546660135|5.0637485999588545|22.263811589761104|0.7739376880298732|54.958388524878174|      NULL|
|    min|                 0|                5|                 5|       8.82567474

In [9]:
# Assemble the numeric predictors into a single vector column named "features".
#
# The input is df_cleaned (the null-free frame built earlier), not the raw df:
# previously the cleaned frame was computed but never used, and
# VectorAssembler's default handleInvalid setting raises on null inputs.
label_column = "label"
columns = df_cleaned.columns

# Exclude the target by name instead of assuming it is the last column, so a
# reordered CSV cannot silently leak the label into the feature vector.
feature_columns = [name for name in columns if name != label_column]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

df_assembled = assembler.transform(df_cleaned)
df_assembled.show()

+---+---+---+-----------+-----------------+------------------+------------------+-----+--------------------+
|  N|  P|  K|temperature|         humidity|                ph|          rainfall|label|            features|
+---+---+---+-----------+-----------------+------------------+------------------+-----+--------------------+
| 90| 42| 43|20.87974371|      82.00274423| 6.502985292000001|       202.9355362| rice|[90.0,42.0,43.0,2...|
| 85| 58| 41|21.77046169|      80.31964408|       7.038096361|       226.6555374| rice|[85.0,58.0,41.0,2...|
| 60| 55| 44|23.00445915|       82.3207629|       7.840207144|       263.9642476| rice|[60.0,55.0,44.0,2...|
| 74| 35| 40|26.49109635|      80.15836264|       6.980400905|       242.8640342| rice|[74.0,35.0,40.0,2...|
| 78| 42| 42|20.13017482|      81.60487287|       7.628472891|       262.7173405| rice|[78.0,42.0,42.0,2...|
| 69| 37| 42|23.05804872|      83.37011772|       7.073453503|       251.0549998| rice|[69.0,37.0,42.0,2...|
| 69| 55| 38|22.708

In [24]:
# Map each crop name in "label" to a numeric index stored in "label_indexed".
indexer = StringIndexer(inputCol="label", outputCol="label_indexed")
indexer_model = indexer.fit(df_assembled)
df_indexed = indexer_model.transform(df_assembled)

# Note: no one-hot encoding is applied to the label; the numeric index in
# "label_indexed" is what the rest of the notebook selects and trains on.

df_indexed.show()


+---+---+---+-----------+-----------------+------------------+------------------+-----+--------------------+-------------+
|  N|  P|  K|temperature|         humidity|                ph|          rainfall|label|            features|label_indexed|
+---+---+---+-----------+-----------------+------------------+------------------+-----+--------------------+-------------+
| 90| 42| 43|20.87974371|      82.00274423| 6.502985292000001|       202.9355362| rice|[90.0,42.0,43.0,2...|         20.0|
| 85| 58| 41|21.77046169|      80.31964408|       7.038096361|       226.6555374| rice|[85.0,58.0,41.0,2...|         20.0|
| 60| 55| 44|23.00445915|       82.3207629|       7.840207144|       263.9642476| rice|[60.0,55.0,44.0,2...|         20.0|
| 74| 35| 40|26.49109635|      80.15836264|       6.980400905|       242.8640342| rice|[74.0,35.0,40.0,2...|         20.0|
| 78| 42| 42|20.13017482|      81.60487287|       7.628472891|       262.7173405| rice|[78.0,42.0,42.0,2...|         20.0|
| 69| 37| 42|23.

In [25]:
# Keep only the two columns the model needs: the feature vector and the
# numeric label.
model_columns = ["features", "label_indexed"]
df_final = df_indexed.select(*model_columns)
df_final.show()

+--------------------+-------------+
|            features|label_indexed|
+--------------------+-------------+
|[90.0,42.0,43.0,2...|         20.0|
|[85.0,58.0,41.0,2...|         20.0|
|[60.0,55.0,44.0,2...|         20.0|
|[74.0,35.0,40.0,2...|         20.0|
|[78.0,42.0,42.0,2...|         20.0|
|[69.0,37.0,42.0,2...|         20.0|
|[69.0,55.0,38.0,2...|         20.0|
|[94.0,53.0,40.0,2...|         20.0|
|[89.0,54.0,38.0,2...|         20.0|
|[68.0,58.0,38.0,2...|         20.0|
|[91.0,53.0,40.0,2...|         20.0|
|[90.0,46.0,42.0,2...|         20.0|
|[78.0,58.0,44.0,2...|         20.0|
|[93.0,56.0,36.0,2...|         20.0|
|[94.0,50.0,37.0,2...|         20.0|
|[60.0,48.0,39.0,2...|         20.0|
|[85.0,38.0,41.0,2...|         20.0|
|[91.0,35.0,39.0,2...|         20.0|
|[77.0,38.0,36.0,2...|         20.0|
|[88.0,35.0,40.0,2...|         20.0|
+--------------------+-------------+
only showing top 20 rows



In [26]:
# 70/30 hold-out split; seed=42 is passed to randomSplit so the same
# partition is requested on every run.
train_ratio, test_ratio = 0.7, 0.3
split_weights = [train_ratio, test_ratio]

train_df, test_df = df_final.randomSplit(split_weights, seed=42)

In [27]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the assembled feature vector, predicting the indexed
# crop label. An explicit seed is supplied (the same value used for the
# train/test split) so that training is repeatable across kernel restarts;
# without it the metrics reported below can vary from run to run.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_indexed",
    seed=42,
)

# Fit on the training partition only; test_df is held out for evaluation.
rf_model = rf.fit(train_df)


In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out partition with the trained forest.
predictions = rf_model.transform(test_df)

# One evaluator is reused for all three metrics; setMetricName returns the
# evaluator itself, so each call switches the metric and then evaluates.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed",
    predictionCol="prediction",
)
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)



Accuracy: 0.973421926910299
Precision: 0.9789261664261663
Recall: 0.9734219269102988
