# FP Growth Implementation

Applies FP Growth Algorithm on the dataset to generate frequent itemsets.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
db_path = os.path.join('.','data','7_Preprocess_Final.csv')
db = pd.read_csv(db_path)
db.head()

Unnamed: 0,Season,Crop,Soil Type,Rainfall_Disc,Yield_Disc,Rainfall_Very_Low,Rainfall_Low,Rainfall_Medium,Rainfall_High,Rainfall_Very_High,Yield_Very_Low,Yield_Low,Yield_Medium,Yield_High,Yield_Very_High
0,Whole Year,Cashewnut,Red,Medium,Low,0,0,1,0,0,0,1,0,0,0
1,Summer,Maize,Red,Low,Low,0,1,0,0,0,0,1,0,0,0
2,Kharif,Cotton(lint),Red,Medium,Very_Low,0,0,1,0,0,1,0,0,0,0
3,Kharif,Sesamum,Alluvial,High,Very_Low,0,0,0,1,0,1,0,0,0,0
4,Rabi,Urad,Red,Low,Very_Low,0,1,0,0,0,1,0,0,0,0


### Preprocessing

Drop the binary columns because they are too numerous. We prefer to append the keywords _Rainfall_ and _Yield_ to the data to make it more readable to us.

In [3]:
new_db = db[['Season', 'Crop', 'Soil Type', 'Rainfall_Disc', 'Yield_Disc']]
new_db.head()

Unnamed: 0,Season,Crop,Soil Type,Rainfall_Disc,Yield_Disc
0,Whole Year,Cashewnut,Red,Medium,Low
1,Summer,Maize,Red,Low,Low
2,Kharif,Cotton(lint),Red,Medium,Very_Low
3,Kharif,Sesamum,Alluvial,High,Very_Low
4,Rabi,Urad,Red,Low,Very_Low


In [4]:
x1 = new_db['Rainfall_Disc'].values
x2 = new_db["Yield_Disc"].values
crops = list(set(new_db["Crop"].values))

In [5]:
for i in range(0, len(x1)):
    x1[i] = x1[i]+"_Rainfall"
for i in range(0, len(x2)):
    x2[i] = x2[i]+"_Yield"

In [6]:
new_db.head()

Unnamed: 0,Season,Crop,Soil Type,Rainfall_Disc,Yield_Disc
0,Whole Year,Cashewnut,Red,Medium_Rainfall,Low_Yield
1,Summer,Maize,Red,Low_Rainfall,Low_Yield
2,Kharif,Cotton(lint),Red,Medium_Rainfall,Very_Low_Yield
3,Kharif,Sesamum,Alluvial,High_Rainfall,Very_Low_Yield
4,Rabi,Urad,Red,Low_Rainfall,Very_Low_Yield


In [7]:
new_db.values

array([['Whole Year', 'Cashewnut', 'Red', 'Medium_Rainfall', 'Low_Yield'],
       ['Summer', 'Maize', 'Red', 'Low_Rainfall', 'Low_Yield'],
       ['Kharif', 'Cotton(lint)', 'Red', 'Medium_Rainfall',
        'Very_Low_Yield'],
       ...,
       ['Summer', 'Moong(Green Gram)', 'Alluvial', 'High_Rainfall',
        'Very_Low_Yield'],
       ['Whole Year', 'Onion', 'Alluvial', 'Low_Rainfall', 'High_Yield'],
       ['Whole Year', 'Peas & beans (Pulses)', 'Black',
        'Medium_Rainfall', 'Very_Low_Yield']], dtype=object)

Convert the dataset into a 2D list to ease the data preparation for the FP Tree generation

In [8]:
db = new_db.values.tolist()

## Generating the Frequent Itemsets

- We now prepare the dataset, keeping only the items which satisfy the minimum support threshold in our dataset
- Then we create an FP Tree using the items in this dataset
- Then we generate frequent itemsets by mining this FP Tree

In [9]:
from tree import Tree
from tree import dft
from pprint import pprint

support_count = {}
min_support = 50

# Determine the support count
for transaction in db:
    for item in transaction:
        if item in support_count.keys():
            support_count[item] += 1
        else:
            support_count[item] = 1

freq_items = {}

# Store the frequent items with their support count in a dictionary
for item in support_count.keys():
    if support_count[item] >= min_support:
        freq_items[item] = support_count[item]

# Sort this dictionary with support count as the key
{k: v for k, v in sorted(freq_items.items(), key=lambda x: x[1], reverse=True)}

# Prepare the dataset in the following way
# for each transaction, intersect it with the set of frequent items
# then sort the remaining items of the transaction based on the support counts
new_db = []
freq_item_list = set(freq_items.keys())

for transaction in db:
    transaction = list(set(transaction).intersection(freq_item_list))
    transaction = sorted(transaction, key=lambda x: freq_items[x], reverse=True)
    new_db.append(transaction)

db = new_db

# Create an FP Tree with the minimum support and the prepared dataset
fp_tree = Tree(min_support)
for transaction in db:
    fp_tree.insert_transaction(transaction)

# Mine the FP tree
frequent_itemsets = fp_tree.mine_fp_tree([], {})
# How many frequent itemsets are generated?
print("No. of frequent itemsets generated:", len(frequent_itemsets))

No. of frequent itemsets generated: 7551


We now collect all frequent itemsets of length 5 

In [10]:
useful_freq_itemsets = []
for i in frequent_itemsets.keys():
    if len(i) == 5:
        useful_freq_itemsets.append(i)

print("No. of frequent itemsets of length 5:", len(useful_freq_itemsets))

No. of frequent itemsets of length 5: 782


## Rule Generation

We now generate rules based on the frequent itemsets.

In our rules, we have _Yield_ and _Crops_ in the consequent and _Rainfall_, _Soil Type_ and _Season_ as the antecedent

In [11]:
import re

rules = []
for i in useful_freq_itemsets:
    consequent = []
    antecedent = []
    for j in i:
        if len(re.findall(r"._Yield", j)) > 0 or j in crops:
            consequent.append(j)
        else:
            antecedent.append(j)

    rules.append((antecedent, consequent))
print("No. of for rules generated:", len(rules))

No. of for rules generated: 782


We now calculate confidence of each rule

In [12]:
for k in range(0, len(rules)):
    antecedent, consequent = set(rules[k][0]), set(rules[k][1])
    sup_ante, sup_tot = 0, 0
    for i in useful_freq_itemsets:
        if antecedent.issubset(set(i)):
            sup_ante += frequent_itemsets[tuple(i)]
        if antecedent.union(consequent).issubset(set(i)):
            sup_tot += frequent_itemsets[tuple(i)]
            
    confidence = sup_tot/sup_ante
    rules[k] = (rules[k][0], rules[k][1], confidence)
rules[0:5]

[(['Red', 'Low_Rainfall', 'Whole Year'],
  ['Very_High_Yield', 'Bhindi'],
  0.02055045871559633),
 (['Medium_Rainfall', 'Alluvial', 'Rabi'],
  ['Very_High_Yield', 'Pineapple'],
  0.02360391479562464),
 (['Medium_Rainfall', 'Alluvial', 'Kharif'],
  ['Very_High_Yield', 'Papaya'],
  0.0073086058834277365),
 (['High_Rainfall', 'Whole Year', 'Mountain'],
  ['Low_Yield', 'Oilseeds total'],
  0.21727019498607242),
 (['Medium_Rainfall', 'Alluvial', 'Kharif'],
  ['Very_High_Yield', 'Orange'],
  0.006669102868627809)]

We now define a mapping which eases our work of sorting our results on the basis of yield 

In [13]:
mapping = {}
mapping['Very_Low_Yield'] = 0
mapping['Low_Yield'] = 1
mapping['Medium_Yield'] = 2
mapping['High_Yield'] = 3
mapping['Very_High_Yield'] = 4

The `give_crops` function requires the season, soil type and the rainfall as the inputs and it gives a set of suggested crops sorted on the basis of the yield.

We may take only the _very high_, _high_ and _medium_ yields for suggestions based on the number of results returned

In [14]:
def give_crops(season="kharif", soil_type="alluvial", rainfall="medium_rainfall", min_confidence=0):
#     season = input("season: ")
#     soil_type = input("soil type: ")
#     rainfall = input("rainfall (input as very_low_rainfall, etc.): ")
    
    lis = [season, soil_type, rainfall]
    found_set = []
    for rule in rules:
        comp = []
        if rule[2] < min_confidence:
            continue
        for i in rule[0]:
            comp.append(i.lower())
        if set(comp) == set(lis):
            found_set.append([rule[1], rule[2]])
    
    found_set = sorted(found_set, key=lambda x: mapping[x[0][0]], reverse=True)

    return found_set

Let us now see the recommendations for the crops given by the function

In [15]:
# Test example
# kharif
# alluvial
# medium_rainfall
crop_recommendations = give_crops("rabi", "alluvial", "very_low_rainfall", 0)
pprint(crop_recommendations)

[[['Very_High_Yield', 'Onion'], 0.07931262392597488],
 [['Very_High_Yield', 'Potato'], 0.043621943159286185],
 [['Very_High_Yield', 'Wheat'], 0.11368142762723067],
 [['Very_High_Yield', 'Garlic'], 0.03502974223397224],
 [['High_Yield', 'Wheat'], 0.0680766688697951],
 [['Medium_Yield', 'Rapeseed &Mustard'], 0.07534699272967614],
 [['Low_Yield', 'Jowar'], 0.06345009914077991],
 [['Low_Yield', 'Rapeseed &Mustard'], 0.09980171844018507],
 [['Low_Yield', 'Gram'], 0.15928618638466624],
 [['Low_Yield', 'Masoor'], 0.04758757435558493],
 [['Low_Yield', 'Peas & beans (Pulses)'], 0.04758757435558493],
 [['Low_Yield', 'Other  Rabi pulses'], 0.04824851288830139],
 [['Very_Low_Yield', 'Gram'], 0.04560475875743556],
 [['Very_Low_Yield', 'Coriander'], 0.037012557832121616],
 [['Very_Low_Yield', 'Linseed'], 0.036351619299405155]]


We now discard the _low_yield_ and _very_low_yield_ recommendations and sort the set according to the confidence rules

In [16]:
def give_recommendations(crop_recommendations):
    vh = []
    h = []
    m = []
    for r in crop_recommendations:
        if "Very_High" in r[0][0]:
            vh.append((r[0][1], r[1]))
        elif "High" in r[0][0]:
            h.append((r[0][1], r[1]))
        elif "Medium" in r[0][0]:
            m.append((r[0][1], r[1]))

    vh = sorted(vh, key=lambda x: x[1], reverse=True)
    h = sorted(h, key=lambda x:x[1], reverse=True)
    m = sorted(m, key=lambda x:x[1], reverse=True)
    
    # print(vh, h, m)

    final_suggestion = []
    if len(vh)+len(h)+len(m) == 0:
        print()
        print("Sorry, no crops to suggest.")
        return
    elif len(vh)+len(h) < 5:
        final_suggestion = [i[0] for i in vh]
        final_suggestion.extend([i[0] for i in h])
        final_suggestion.extend(i[0] for i in m)
    else:
        final_suggestion = [i[0] for i in vh]
        final_suggestion.extend([i[0] for i in h])

    print()
    print("Suggested Crops:", final_suggestion)

We can now test on more combinations of inputs

In [17]:
crops = give_crops("rabi", "alluvial", "very_low_rainfall", 0.02)
give_recommendations(crops)

crops = give_crops("kharif", "alluvial", "medium_rainfall", 0.02)
give_recommendations(crops)

crops = give_crops("kharif", "red", "very_low_rainfall", 0.02)
give_recommendations(crops)

crops = give_crops("kharif", "arid", "very_low_rainfall", 0.02)
give_recommendations(crops)


Suggested Crops: ['Wheat', 'Onion', 'Potato', 'Garlic', 'Wheat']

Suggested Crops: ['Jute', 'Rice', 'Mesta', 'Sugarcane', 'Rice']

Sorry, no crops to suggest.

Sorry, no crops to suggest.
