Import necessary libraries

In [29]:
!pip install fuzzywuzzy
!pip install python-Levenshtein  # Optional but speeds up fuzzy matching


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rap

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import process
import pickle

# Load the datasets

In [31]:
# Load soil data
soil_data = pd.read_csv('indian_soil_data.csv')

# Load crop production data
crop_production = pd.read_csv('crop_production.csv')

# Load crop characteristics data
crop_data = pd.read_csv('crop_data.csv')

Clean and Standardize Column Names

In [32]:
def standardize_columns(df):
    df.columns = df.columns.str.strip().str.lower()
    return df

crop_data = standardize_columns(crop_data)
crop_production = standardize_columns(crop_production)
soil_data = standardize_columns(soil_data)

 Handle Missing Values

In [33]:
crop_data = crop_data.dropna()
crop_production = crop_production.dropna()
soil_data = soil_data.fillna(soil_data.mean(numeric_only=True))


In [34]:
# Fuzzy logic function for matching districts
def fuzzy_match(df1, df2, col1, col2, threshold=90):
    matches = {}
    for item in df1[col1].unique():
        match, score = process.extractOne(item, df2[col2].unique())
        if score >= threshold:
            matches[item] = match
    return matches

# Apply fuzzy matching for district names
district_matches = fuzzy_match(soil_data, crop_production, 'district', 'district')
crop_production['district'] = crop_production['district'].replace(district_matches)


Encode crop labels for the target variable

In [35]:
le_crop = LabelEncoder()
crop_data['label_encoded'] = le_crop.fit_transform(crop_data['label'])

 Keep only relevant columns for merging and prediction

In [36]:
soil_data = soil_data[['district', 'zn %', 'fe%', 'cu %', 'mn %', 'b %', 's %']]
crop_production = crop_production[['state', 'district', 'crop', 'area', 'yield']]

Merge datasets

In [37]:
merged_data = pd.merge(crop_production, soil_data, on='district', how='inner')

Apply fuzzy matching for crop names

In [38]:
crop_matches = fuzzy_match(merged_data, crop_data, 'crop', 'label')
merged_data['crop'] = merged_data['crop'].replace(crop_matches)

Merge with crop_data to get the target variable

In [39]:
final_data = pd.merge(merged_data, crop_data, left_on='crop', right_on='label', how='inner')

In [48]:
# Drop any rows with missing values for simplicity
final_data = final_data.dropna()

In [57]:
# Check if the merged data is empty
if final_data.empty:
    raise ValueError("Merged data is empty. Please check your data and merging logic.")

# Prepare features and target for the model
features = ['zn %', 'fe%', 'cu %', 'mn %', 'b %','rainfall','humidity','temperature','ph', 's %', 'area', 'yield']
X = final_data[features]
y = final_data['label_encoded']

In [58]:
# Encode the target variable using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

Train-Test Split

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train the model

In [61]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

Make predictions

In [62]:
y_pred = model.predict(X_test)

Evaluate the model

In [63]:
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1587
           2       1.00      1.00      1.00      1153
           3       1.00      1.00      1.00       792
           5       1.00      1.00      1.00      1049
           8       1.00      1.00      1.00      2236
           9       1.00      1.00      1.00      3171
          13       1.00      1.00      1.00      3686

    accuracy                           1.00     13674
   macro avg       1.00      1.00      1.00     13674
weighted avg       1.00      1.00      1.00     13674

