# Applying Apriori on the preprocessed dataset

In [1]:
# importing libraries
import os
import pandas as pd
# Global frequency dictionary shared by the functions below.
# loadDataSet() fills it: each key is one distinct transaction (the frozenset
# of a row's items) and the value is the number of dataset rows that produced
# that transaction.
# createLK() reads it to weight every distinct transaction by its row count
# when it accumulates support counts.
frequency = {}

In [2]:
# Build the Apriori input file from the preprocessed dataset.
filepath = os.path.join('.','7_Preprocess_Final.csv')
# Antecedent columns first (season, soil, rainfall), then crop and yield.
ordered_cols = ["Season", "Soil Type", "Rainfall_Disc", "Crop", "Yield_Disc"]
df = pd.read_csv(filepath, usecols=ordered_cols)
df = df.reindex(columns=ordered_cols)
for j, col in enumerate(df.columns):
    # Label every value as "<column position> <column name> = <value>".
    # The leading position makes values from different columns distinct items
    # and lets a plain string sort put an itemset back into column order.
    # One vectorised assignment per column replaces the per-cell df.at loop;
    # astype(str) keeps a non-string value from breaking the concatenation.
    df[col] = str(j) + ' ' + col + ' = ' + df[col].astype(str)
df.to_csv(r'AprioriInput.csv', index=False)

In [3]:
def loadDataSet(filepath='AprioriInput.csv'):
    """Read the Apriori input CSV and return its distinct transactions.

    Every row of the file becomes one transaction: the frozenset of the
    row's cell values. The module-level ``frequency`` dictionary is rebuilt
    so that it maps each distinct transaction to the number of rows that
    produced it.

    Parameters
    ----------
    filepath : str, optional
        CSV file to read. Defaults to 'AprioriInput.csv'.

    Returns
    -------
    list of frozenset
        The distinct transactions, in order of first appearance.
    """
    # Start from a clean slate. Without this, calling the function a second
    # time (for example by re-running the cell) would add to the counts left
    # by the previous call and inflate every support count.
    frequency.clear()

    df = pd.read_csv(filepath)
    for row in df.values.tolist():
        transaction = frozenset(row)
        frequency[transaction] = frequency.get(transaction, 0) + 1

    # The keys of frequency are exactly the distinct transactions.
    return list(frequency)

In [4]:
def createC1(dataSet):
    """Build C1, the candidate itemsets of size 1.

    Every distinct item found in any transaction of ``dataSet`` is wrapped
    in its own single-element frozenset.

    Returns a list of those frozensets; the order of the list is the
    iteration order of a set and therefore not meaningful.
    """
    singletons = {
        frozenset([item])
        for transaction in dataSet
        for item in transaction
    }
    return list(singletons)

In [5]:
def createLK(DS, Ck, minSupport):
    """Keep the candidates of Ck whose support reaches minSupport.

    DS is an iterable of transactions and Ck an iterable of candidate
    itemsets (frozensets). A candidate's support is the sum, over every
    transaction that contains it, of that transaction's count in the
    module-level ``frequency`` dictionary.

    Returns a tuple ``(Lk, supportData)``:
      * Lk          - list of the candidates with support >= minSupport,
                      in reverse order of when they were first counted
      * supportData - dict mapping each of those candidates to its support
    """
    counts = {}
    for transaction in DS:
        for candidate in Ck:
            if not candidate.issubset(transaction):
                continue
            # weight the match by how many dataset rows this transaction stands for
            counts[candidate] = counts.get(candidate, 0) + frequency[transaction]

    # only candidates that pass the minimum threshold survive
    supportData = {
        candidate: count
        for candidate, count in counts.items()
        if count >= minSupport
    }
    # newest-first, same order as inserting each survivor at the front
    Lk = list(supportData)[::-1]
    return Lk, supportData

In [6]:
def aprioriGen(Lkm1, k):
    """Generate the candidate itemsets Ck of size k from Lk-1.

    Two itemsets of Lkm1 are joined (set union) when, after sorting their
    items, their first k-2 items are equal. Pairs are visited in index
    order (i < j), and the returned list follows that order.
    """
    count = len(Lkm1)
    if count < 2:
        # nothing to pair up
        return []

    # sorted prefix (first k-2 items) of every itemset, computed once
    prefixes = [sorted(itemset)[:k-2] for itemset in Lkm1]

    return [
        Lkm1[i] | Lkm1[j]  # set union
        for i in range(count)
        for j in range(i + 1, count)
        if prefixes[i] == prefixes[j]
    ]

In [7]:
def apriori(dataSet, minSupport):
    """Run the level-wise Apriori search over dataSet.

    Returns a tuple ``(L, supportData)``:
      * L           - list of levels; L[i] holds the frequent itemsets of
                      size i + 1. The search stops after the first empty
                      level, so the last entry of L is always an empty list.
      * supportData - dict with the support count of every frequent itemset
                      found on any level
    """
    # level 1: frequent single items
    frequent_1, supportData = createLK(dataSet, createC1(dataSet), minSupport)
    L = [frequent_1]

    size = 2  # size of the itemsets the next level will hold
    while L[-1]:
        candidates = aprioriGen(L[-1], size)
        frequent_k, level_support = createLK(dataSet, candidates, minSupport)
        supportData.update(level_support)
        L.append(frequent_k)
        size += 1
    return L, supportData

In [8]:
def generateRules(itemsets, supportData, minConf):
    """Turn itemsets into association rules that reach minConf.

    Each itemset must be an indexable sequence. Its first three elements
    form the antecedent and its last two elements are reported as the
    consequent. The confidence of a rule is

        support(whole itemset) / support(antecedent)

    with both supports looked up in supportData.

    Returns a dict keyed by the antecedent (a frozenset); each value is a
    list of ``[itemset[-2], itemset[-1], confidence]`` entries, one per
    rule that passed the threshold.
    """
    Assoc_rules = {}
    for items in itemsets:
        rule_support = supportData[frozenset(items)]  # support of the entire rule
        antecedent = frozenset(items[:3])
        confidence = rule_support / supportData[antecedent]
        if confidence >= minConf:
            # group rules that share an antecedent under the same key
            Assoc_rules.setdefault(antecedent, []).append(
                [items[-2], items[-1], confidence]
            )
    return Assoc_rules

Load the dataset and compute the frequent itemsets, together with their support counts, by calling the `apriori` function

In [9]:
# minimum support, compared against the summed row counts of matching transactions
minsup = 50
# distinct transactions of the dataset
ds = loadDataSet()
# l: frequent itemsets grouped by size, s: support count of each frequent itemset
l, s = apriori(dataSet=ds, minSupport=minsup)
print('done')

done


Get the frequent itemsets of size 5 (one item from each of the five columns) and generate rules from them

In [10]:
# Rules are built from the frequent itemsets of size 5, which apriori stores
# at index 4 of l (index k holds the itemsets of size k + 1).
RULE_ITEMSET_LEVEL = 4
# apriori stops adding levels after the first empty one, so with a high
# minimum support l can be too short to contain this level; treat that as
# "no itemsets" instead of failing with an IndexError.
frequent5 = l[RULE_ITEMSET_LEVEL] if len(l) > RULE_ITEMSET_LEVEL else []
for i, itemset in enumerate(frequent5):
    # generateRules slices each itemset, so turn every frozenset into a
    # sorted list (in place, as before: l[4] ends up holding the lists)
    frequent5[i] = sorted(itemset)
minConf = 0 # set the minimum confidence
rules = generateRules(frequent5, s, minConf) # call generateRules to generate the rules
print(len(rules))

55


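Take the season, soil type and rainfall level as input from the user (each entered as the 1-based number of the option in its list) and
output every crop that the above rules associate with that combination, together with its yield class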

In [16]:
def _pick(options):
    """Read a 1-based option number from the user and return that option.

    Raises ValueError when the number is outside 1..len(options); without
    this check 0 or a negative number would silently select an option
    counted from the end of the list.
    """
    choice = int(input())
    if not 1 <= choice <= len(options):
        raise ValueError(
            'choice must be between 1 and {}, got {}'.format(len(options), choice)
        )
    return options[choice - 1]

# Rebuild the item labels exactly as they are written to AprioriInput.csv.
season = '0 Season = ' + _pick(['Kharif','Summer','Whole Year','Rabi','Winter','Autumn'])
soil = '1 Soil Type = ' + _pick(['Alluvial','Black','Red','Mountain','Laterite','Arid'])
rainfall = '2 Rainfall_Disc = ' + _pick(['Very_High','High','Medium','Low','Very_Low'])

# A combination that produced no rule is reported instead of raising KeyError.
crops = rules.get(frozenset([season, soil, rainfall]), [])
if not crops:
    print('No rules found for this combination of season, soil type and rainfall')

crop_prefix_len = len('3 Crop = ')  # strip the column label from the crop item
for crop in crops:
    print(crop[0][crop_prefix_len:], crop[1], sep = '\t\t')

4
1
5
Rapeseed &Mustard		4 Yield_Disc = Low
Peas & beans (Pulses)		4 Yield_Disc = Low
Coriander		4 Yield_Disc = Very_Low
Onion		4 Yield_Disc = Very_High
Linseed		4 Yield_Disc = Very_Low
Rapeseed &Mustard		4 Yield_Disc = Medium
Potato		4 Yield_Disc = Very_High
Gram		4 Yield_Disc = Low
Wheat		4 Yield_Disc = High
Jowar		4 Yield_Disc = Low
Other  Rabi pulses		4 Yield_Disc = Low
Masoor		4 Yield_Disc = Low
Gram		4 Yield_Disc = Very_Low
Garlic		4 Yield_Disc = Very_High
Wheat		4 Yield_Disc = Very_High
