In [None]:

#  Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import joblib


In [None]:

#    Load Dataset
data = pd.read_csv('preprocessed2.csv')
data.head()


Unnamed: 0.1,Unnamed: 0,State_Name,District_Name,Season,Crop
0,0,Andaman and Nicobar Islands,NICOBARS,Kharif,Arecanut
1,1,Andaman and Nicobar Islands,NICOBARS,Kharif,Other Kharif pulses
2,2,Andaman and Nicobar Islands,NICOBARS,Kharif,Rice
3,3,Andaman and Nicobar Islands,NICOBARS,Whole Year,Banana
4,4,Andaman and Nicobar Islands,NICOBARS,Whole Year,Cashewnut


In [None]:

#  Data Cleaning
data['Season'] = data['Season'].str.rstrip()  # Remove extra whitespace
if 'Unnamed: 0' in data.columns:
    del data['Unnamed: 0']  # Remove unnamed index column
data.head()


Unnamed: 0,State_Name,District_Name,Season,Crop
0,Andaman and Nicobar Islands,NICOBARS,Kharif,Arecanut
1,Andaman and Nicobar Islands,NICOBARS,Kharif,Other Kharif pulses
2,Andaman and Nicobar Islands,NICOBARS,Kharif,Rice
3,Andaman and Nicobar Islands,NICOBARS,Whole Year,Banana
4,Andaman and Nicobar Islands,NICOBARS,Whole Year,Cashewnut


In [None]:

#  Preparing Training and Testing Data
training_data = list(np.array(data))
testing_data = training_data[100:120]  # Taking sample test data
header = ['State_Name', 'District_Name', 'Season', 'Crop']


In [None]:

# Helper Function for crop count

def unique_vals(data, col):
    return set([row[col] for row in data])

def class_counts(data):
    counts = {}
    for row in data:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


In [None]:

#  Defined Question Class for decision tree split

class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value
    
    def match(self, example):
        val = example[self.column]
        return val == self.value
    
    def __repr__(self):
        return f"Is {header[self.column]} == {str(self.value)}?"


In [None]:

#  Partition the dataset to clasify decision tree

def partition(data, question):
    true_rows, false_rows = [], []
    for row in data:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows


In [None]:

#  Gini Impurity for best split

def gini(data):
    counts = class_counts(data)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(data))
        impurity -= prob_of_lbl**2
    return impurity

def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)


In [None]:

#  Find the Best Split

def find_best_split(data):
    best_gain = 0
    best_question = None
    current_uncertainty = gini(data)
    n_features = len(data[0]) - 1
    
    for col in range(n_features):
        values = unique_vals(data, col)
        
        for val in values:
            question = Question(col, val)
            true_rows, false_rows = partition(data, question)
            
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            
            if gain > best_gain:
                best_gain, best_question = gain, question
                
    return best_gain, best_question


In [None]:

# final node

class Leaf:
    def __init__(self, data):
        self.predictions = class_counts(data)


In [None]:

#  Decision Node store question

class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch


In [12]:

# 🛠️ Build the Decision Tree

def build_tree(data):
    gain, question = find_best_split(data)
    
    if gain == 0:
        return Leaf(data)
    
    true_rows, false_rows = partition(data, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    
    return Decision_Node(question, true_branch, false_branch)


In [13]:

# 📈 Visualize the Tree

def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return

    print(spacing + str(node.question))
    print(spacing + "--> True:")
    print_tree(node.true_branch, spacing + "  ")
    
    print(spacing + "--> False:")
    print_tree(node.false_branch, spacing + "  ")


In [14]:

# 🧠 Classify New Data

def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs


In [15]:

# 🚀 Train the Model
my_tree = build_tree(training_data)
print_tree(my_tree)


Is Season == Rabi?
--> True:
  Is State_Name == Karnataka?
  --> True:
    Is District_Name == DAKSHIN KANNAD?
    --> True:
      Predict {'Dry chillies': 10, 'Groundnut': 1, 'Horse-gram': 15, 'Paddy': 1, 'Rice': 18, 'Moong(Green Gram)': 17, 'Other  Rabi pulses': 11, 'Urad': 16, 'Gram': 1, 'Cowpea(Lobia)': 5}
    --> False:
      Is District_Name == UDUPI?
      --> True:
        Predict {'Groundnut': 15, 'Horse-gram': 16, 'Moong(Green Gram)': 16, 'Onion': 2, 'Rice': 17, 'Other  Rabi pulses': 9, 'Urad': 15, 'Dry chillies': 9, 'Gram': 3, 'Maize': 10, 'Rapeseed &Mustard': 1, 'Sunflower': 2, 'Cowpea(Lobia)': 5}
      --> False:
        Is District_Name == KODAGU?
        --> True:
          Predict {'Dry chillies': 1, 'Paddy': 1, 'Rice': 16, 'Groundnut': 6, 'Other  Rabi pulses': 11, 'Ragi': 9, 'Gram': 6, 'Horse-gram': 7, 'Maize': 8, 'Cowpea(Lobia)': 2, 'Peas & beans (Pulses)': 1}
        --> False:
          Is District_Name == KOLAR?
          --> True:
            Predict {'Dry chillie

In [16]:

# 💾 Save the trained model
joblib.dump(my_tree, 'decision_tree_model.pkl')
print("Model saved as 'decision_tree_model.pkl'!")


Model saved as 'decision_tree_model.pkl'!


In [18]:
# 🧪 Predict on a Random Sample from Dataset

import random

# Pick a random index (skip last column which is label)
random_index = random.randint(0, len(data) - 1)
sample = list(data.iloc[random_index])

# Separate input and true label
input_features = sample[:-1]  # ['State_Name', 'District_Name', 'Season']
true_label = sample[-1]       # 'Crop'

# Add dummy label at the end for structure
test_row = input_features + ["Unknown"]

# Predict
prediction = classify(test_row, my_tree)
predicted_crop = print_leaf(prediction)

# Display
print("\n🌾 Prediction on Random Sample from Dataset")
print(f"Input: {input_features}")
print(f"✅ True Crop: {true_label}")
print(f"🤖 Predicted Crop: {predicted_crop}")



🌾 Prediction on Random Sample from Dataset
Input: ['Uttarakhand', 'TEHRI GARHWAL', 'Kharif']
✅ True Crop: Dry chillies
🤖 Predicted Crop: {'Arhar/Tur': '8%', 'Horse-gram': '8%', 'Maize': '8%', 'Other Cereals & Millets': '5%', 'Other Kharif pulses': '8%', 'Potato': '6%', 'Ragi': '8%', 'Rice': '8%', 'Sesamum': '8%', 'Small millets': '5%', 'Soyabean': '8%', 'Urad': '8%', 'other oilseeds': '5%', 'Dry chillies': '0%', 'Ginger': '0%', 'Pulses total': '0%', 'Total foodgrain': '0%'}
