In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Preprocessing and EDA ---

# Read the crop-recommendation data. The path is relative, so the CSV must sit
# in the notebook's working directory (otherwise supply the full path).
df = pd.read_csv('Crop_recommendation_NPK_simplified_ratios.csv')

# Quick look at the raw table: schema and dtypes first, then the first rows.
print("--- Initial Data Info ---")
df.info()
print("\n--- First 5 Rows of Data ---")
print(df.head())

# Remove every row whose label is one of the excluded crops. The surviving
# rows keep their original index values (no reset is performed).
excluded_crops = ['mango', 'jute']
rows_to_remove = df.index[df['label'].isin(excluded_crops)]
df = df.drop(index=rows_to_remove)

# Per-column count of missing values in the remaining rows.
print("\n--- Missing Values ---")
missing_per_column = df.isnull().sum()
print(missing_per_column)

--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            1700 non-null   int64  
 1   P            1700 non-null   int64  
 2   K            1700 non-null   int64  
 3   Soil_Type    1700 non-null   object 
 4   temperature  1700 non-null   float64
 5   humidity     1700 non-null   float64
 6   ph           1700 non-null   float64
 7   rainfall     1700 non-null   float64
 8   label        1700 non-null   object 
dtypes: float64(4), int64(3), object(2)
memory usage: 119.7+ KB

--- First 5 Rows of Data ---
   N  P  K Soil_Type  temperature   humidity        ph    rainfall label
0  3  1  2  Alluvial    20.879744  82.002744  6.502985  202.935536  rice
1  3  2  1      Clay    21.770462  80.319644  7.038096  226.655537  rice
2  2  2  1      Clay    23.004459  82.320763  7.840207  263.964248  rice
3  3  1  2  Alluvial

In [110]:
# List the distinct soil categories present in the data.
# Run this before the one-hot-encoding cell: that cell replaces 'Soil_Type'
# with soil_* indicator columns, after which this lookup raises KeyError.
soil_column = df['Soil_Type']
soil_column.unique()

KeyError: 'Soil_Type'

In [86]:
# Fit the encoder on the crop names, then swap the text labels for their
# integer class ids. `le` stays in scope so ids can be decoded back to names.
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])

# One-hot encode the soil type: 'Soil_Type' is replaced by one soil_<type>
# indicator column per category.
df = pd.get_dummies(df, prefix='soil', columns=['Soil_Type'])

In [93]:
# Target is the encoded crop label; every other column is a feature.
y = df['label']
X = df.drop(columns='label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [96]:
# --- 3. Model Selection and Training ---

# Model 1: Random Forest Classifier
# Build a 100-tree forest with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so the call can be chained).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predicted class ids for the held-out rows.
rf_predictions = rf_classifier.predict(X_test)

In [98]:
# --- 4. Model Evaluation ---

# Section banners for the printed output.
for heading in ("\n\n--- Model Evaluation ---", "\n--- Random Forest Classifier ---"):
    print(heading)

# Overall fraction of held-out rows predicted correctly.
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Accuracy: {rf_accuracy:.4f}")

# Per-class precision/recall/F1. Passing le.classes_ as target_names makes the
# report rows show the original crop names instead of the encoded integers.
print("Classification Report:")
rf_report = classification_report(y_test, rf_predictions, target_names=le.classes_)
print(rf_report)



--- Model Evaluation ---

--- Random Forest Classifier ---
Accuracy: 0.9933
Classification Report:
              precision    recall  f1-score   support

      banana       1.00      1.00      1.00        19
   blackgram       1.00      1.00      1.00        17
    chickpea       1.00      1.00      1.00        21
     coconut       1.00      1.00      1.00        24
      coffee       1.00      1.00      1.00        19
      cotton       1.00      1.00      1.00        21
      grapes       1.00      1.00      1.00        15
      lentil       0.94      1.00      0.97        15
       maize       1.00      1.00      1.00        15
   mothbeans       1.00      0.96      0.98        28
   muskmelon       1.00      1.00      1.00        22
      papaya       0.96      1.00      0.98        24
  pigeonpeas       1.00      1.00      1.00        20
        rice       1.00      1.00      1.00        20
  watermelon       1.00      0.95      0.97        20

    accuracy                     

In [99]:
# --- 5. Predict Crops for New Samples ---

# Hand-written samples; the comment on each one names the crop it was designed for.
new_data = [
    {  # Sample 1: Ideal for Rice
        'N': 4, 'P': 2, 'K': 2, 'temperature': 25.5, 'humidity': 82.0,
        'ph': 6.8, 'rainfall': 210.0, 'Soil_Type': 'Clay'
    },
    {  # Sample 2: Ideal for Maize (Corn)
        'N': 3, 'P': 1, 'K': 1, 'temperature': 28.2, 'humidity': 65.0,
        'ph': 6.2, 'rainfall': 80.0, 'Soil_Type': 'Alluvial'
    },
    {  # Sample 3: Ideal for Coffee
        'N': 2, 'P': 1, 'K': 3, 'temperature': 21.5, 'humidity': 75.0,
        'ph': 6.0, 'rainfall': 170.0, 'Soil_Type': 'Red'
    }
]

# Convert the new data into a pandas DataFrame
df_new = pd.DataFrame(new_data)
print("--- New Data Samples ---")
print(df_new)
print("\n")

# One-hot encode with the SAME prefix that was used on the training frame
# ('soil'), so the indicator columns are named soil_<type>. With the default
# prefix they would be named Soil_Type_<type>, match nothing in
# X_train.columns, and be discarded by the reindex below -- leaving every soil
# indicator at 0 and the soil type silently ignored by the model.
df_new_processed = pd.get_dummies(df_new, columns=['Soil_Type'], prefix='soil')

# reindex drops unknown columns without complaint, so report anything that is
# about to be discarded (e.g. a soil type that never appeared in training).
unseen_columns = df_new_processed.columns.difference(X_train.columns)
if len(unseen_columns) > 0:
    print(f"Warning: ignoring columns not seen during training: {list(unseen_columns)}")

# Align to the training layout: same columns, same order. Columns absent from
# the new data (soil types not present in these samples) are filled with 0.
df_new_aligned = df_new_processed.reindex(columns=X_train.columns, fill_value=0)

print("--- Making Predictions ---")
# Predict encoded class ids with the trained forest.
predictions_numeric = rf_classifier.predict(df_new_aligned)

# Decode the class ids back to the original crop names.
predictions_labels = le.inverse_transform(predictions_numeric)

# Display the results, numbering samples from 1.
for sample_number, prediction in enumerate(predictions_labels, start=1):
    print(f"Prediction for Sample {sample_number}: ==> {prediction}")


--- New Data Samples ---
   N  P  K  temperature  humidity   ph  rainfall Soil_Type
0  4  2  2         25.5      82.0  6.8     210.0      Clay
1  3  1  1         28.2      65.0  6.2      80.0  Alluvial
2  2  1  3         21.5      75.0  6.0     170.0       Red


--- Making Predictions ---
Prediction for Sample 1: ==> rice
Prediction for Sample 2: ==> maize
Prediction for Sample 3: ==> rice
