In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=4946774fe9f502a1f90e06c56bfeced1f18ab536a96c55bc68d9e8cada91df11
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev, corr

# Obtain the Spark session used by every cell below, named after this analysis
spark = (
    SparkSession.builder
    .appName("CropRecommendationStats")
    .getOrCreate()
)

In [3]:
# Read the CSV: the first row supplies column names and column types are inferred
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Crop_Recommendation.csv')
)

# Columns included in the statistics and correlation analysis
columns = [
    'Nitrogen',
    'Phosphorus',
    'Potassium',
    'Temperature',
    'Humidity',
    'pH_Value',
    'Rainfall',
]

In [4]:
# Compute basic statistics: mean and standard deviation.
# Every aggregate is requested in one select so the data is queried once,
# instead of issuing a separate select/collect per column and per statistic.
stat_exprs = []
for column in columns:
    stat_exprs.append(mean(col(column)).alias(f"mean_{column}"))
    stat_exprs.append(stddev(col(column)).alias(f"stddev_{column}"))

stats = {}
if stat_exprs:
    # A global aggregation (no groupBy) yields exactly one result row.
    stats_row = df.select(*stat_exprs).collect()[0]
    for column in columns:
        stats[column] = {
            "mean": stats_row[f"mean_{column}"],
            "stddev": stats_row[f"stddev_{column}"],
        }

# Compute correlation matrix.
# Correlation is symmetric, so each unordered pair is evaluated once (in a
# single select) and then mirrored into both (a, b) and (b, a) entries.
unique_pairs = [
    (left, right)
    for position, left in enumerate(columns)
    for right in columns[position + 1:]
]

pair_values = {}
if unique_pairs:
    corr_row = df.select(
        *[corr(left, right) for left, right in unique_pairs]
    ).collect()[0]
    # Row values come back in the same order as the select expressions.
    pair_values = dict(zip(unique_pairs, corr_row))

# Fill the matrix in the original nested-loop order so that iteration
# (and therefore any printed report) lists the pairs in the same sequence.
correlation_matrix = {}
for col1 in columns:
    for col2 in columns:
        if col1 != col2:
            lookup = (col1, col2) if (col1, col2) in pair_values else (col2, col1)
            correlation_matrix[(col1, col2)] = pair_values[lookup]

In [5]:
# Report the per-column summary statistics, one line per column
print("Basic Statistics (Mean and Standard Deviation):")
for column in stats:
    stat = stats[column]
    print(f"{column}: Mean = {stat['mean']}, StdDev = {stat['stddev']}")

# Report every pairwise correlation, in the order the pairs were stored
print("\nCorrelation Matrix:")
for pair in correlation_matrix:
    col1, col2 = pair
    corr_value = correlation_matrix[pair]
    print(f"Correlation between {col1} and {col2}: {corr_value}")

Basic Statistics (Mean and Standard Deviation):
Nitrogen: Mean = 50.551818181818184, StdDev = 36.917333833756594
Phosphorus: Mean = 53.36272727272727, StdDev = 32.98588273858713
Potassium: Mean = 48.14909090909091, StdDev = 50.647930546660135
Temperature: Mean = 25.616243851779533, StdDev = 5.0637485999588545
Humidity: Mean = 71.48177921778648, StdDev = 22.263811589761104
pH_Value: Mean = 6.469480065256368, StdDev = 0.7739376880298733
Rainfall: Mean = 103.46365541576832, StdDev = 54.958388524878174

Correlation Matrix:
Correlation between Nitrogen and Phosphorus: -0.23145957738457262
Correlation between Nitrogen and Potassium: -0.14051183844915763
Correlation between Nitrogen and Temperature: 0.026503796219081235
Correlation between Nitrogen and Humidity: 0.19068837919787315
Correlation between Nitrogen and pH_Value: 0.09668284622242826
Correlation between Nitrogen and Rainfall: 0.05902022369254324
Correlation between Phosphorus and Nitrogen: -0.23145957738457262
Correlation between Ph

In [6]:
# Stop the Spark session now that all results have been collected and printed.
# NOTE(review): cells that use `spark` or `df` presumably need the session
# re-created before they can be re-run after this point — confirm.
spark.stop()