In [7]:
import pandas as pd
import time
import numpy as np
import json
from nltk import ngrams
from sklearn.feature_extraction import text
from itsdangerous import URLSafeSerializer
import hashlib
import operator
import itertools
from itertools import combinations, chain
import import_ipynb
import EntropyBasedItemsetGenerationModule as entropy

In [3]:
class PreprocessApriori:
    def __init__(self, masterData, softwareToolGroupDict, columns):
        self.masterData = masterData
        self.softwareToolGroupDict = softwareToolGroupDict
        self.columns = columns
    
    def getSubsetNotNA(self, df, columnName):
        data = df.copy()
        data = data[data[columnName].notna()]
        return data
        
    def getSubsetForApriori(self, df, columnName, columnvalue):
        df.dropna(subset=[columnName], inplace=True)
        data = df.loc[df[columnName] == columnValue]
        return data
        
    def preprocessSoftwareToolsGroup(self, df, groups):
        data = df.copy()
        var = 'SoftwareTools'
        data[var]=data[var].str.split('##')
        for i,v in data.iterrows():
            #print(data['SoftwareTools'][i])
            l=[]
            if(isinstance(v[var],list)):
                for k in v['SoftwareTools']:
                    if(k!=''):
                        t=k.rstrip("\n")
                        h=t.strip()
                        #print(h)
                        l.append(h.lower())
            data['SoftwareTools'][i]=l
            
        for i,v in data.iterrows():
            l=[]
            for k in v['SoftwareTools']:
                if(k in groups['l1']):
                    l.append("Adobe")
                if(k in groups['l2']):
                    l.append("Cloud Computing and Big Data")
                if(k in groups['l3']):
                    l.append("Python tools")
                if(k in groups['l4']):
                    l.append("Microsoft Tools")
                if(k in groups['l5']):
                    l.append("DBMS Tools")
                if(k in groups['l6']):
                    l.append("Machine Learning Tools")
                if(k in groups['l7']):
                    l.append("Computer Network")
            l_str = '##'.join(l)
            data['SoftwareTools'][i]=l_str
        return data
    
    def preprocessTier(self,data):
        data['TierLevel']=data['TierLevel'].replace(to_replace ="1.0",value ="1")
        data['TierLevel']=data['TierLevel'].replace(to_replace ="2.0",value ="2")
        data['TierLevel']=data['TierLevel'].replace(to_replace ="3.0",value ="3")
        return data
    
    def columnwiseItemPreprocess(self,data,columns):
        df=data.copy()
        for column in columns:
            #print(column)
            df[column]=df[column].str.strip('##').str.split('##')
            for i,v in df.iterrows():
                l=[]
                #print(v)
                #print(type(v[column]))
                if(isinstance(v[column],list)):
                    for k in v[column]:
                        #print(k)
                        if(k!=''):
                            l.append(column+"_"+k)
                #print(l)
                df[column][i]=l
        return df
    
    def columnwiseItemGeneration(self,data,columns):
        df=data.copy()
        df['combinedItems_noCourse']=np.empty((len(df), 0)).tolist()
        for i,v in df.iterrows():
            l=df['combinedItems_noCourse'][i]
            for column in columns:
                #print(v)
                #print(type(v[column]))
                if(isinstance(v[column],list)):
                    l=l+v[column]
            #print(l)
            df['combinedItems_noCourse'][i]=l
        return df
    
    def allColumnsItemGeneration(self, df, columnName, columnValue, gradeColumnName, courseColumnName, addCourses = True):
        data_full = df.copy()
        self.generateValidCoursesItem(data_full, columnName, columnValue, gradeColumnName, courseColumnName)
        data_full1 = self.getSubsetNotNA(df, columnName)
        data_full1 = self.preprocessTier(data_full1)
        data_full2 = data_full1.merge(self.validCoursesData.drop_duplicates(subset=['USN']), how ='left')
        data_main = self.getSubsetForApriori(data_full2, columnName, columnValue)
        dataPreProcess = self.preprocessSoftwareToolsGroup(data_main, self.softwareToolGroupDict)
        dataPreProcess1 = self.columnwiseItemPreprocess(dataPreProcess, self.columns)
        dataPreProcess2 = self.columnwiseItemGeneration(dataPreProcess1, self.columns)
        if(addCourses):
            dataPreProcess2['combinedItemset'] =  dataPreProcess2['ValidCourses'] + dataPreProcess2['combinedItems_noCourse']
        else:
            dataPreProcess2['combinedItemset'] =  dataPreProcess2['combinedItems_noCourse']
        return dataPreProcess2
        
    def generateValidCoursesItem(self, data, columnName, columnValue, gradeColumnName, courseColumnName):
        obj = entropy.EntropyBasedItemset(data)
        self.validCoursesData = obj.getOptimalGradeForAllCourses(data, columnName, columnValue, gradeColumnName, courseColumnName)
        self.optimal10thMarks = obj.getEntropy10th(data, columnName, columnValue)
        self.optimal12thMarks = obj.getEntropy12th(data, columnName, columnValue)
    
    def getOptimal10thMarks(self):
        return self.optimal10thMarks
    
    def getOptimal10thMarks(self):
        return self.optimal12thMarks
    
    def getValidCoursesItemset(self):
        return self.validCoursesData

In [114]:
#MODULAR VERSION: 
class Apriori:
    def __init__(self, data, alpha, ls):
        self.data = data
        self.alpha = alpha
        self.ls = ls
        
    def findsubsets(self, uniqueItems, listFormattedPreviousRules, n):
        #return list(map(set, itertools.combinations(s, n)))
        #find combination the first time for all unique items 
        if(len(listFormattedPreviousRules)==0):
            return list(itertools.combinations(uniqueItems, n))
        else:
            #print("In combo")
            return self.getValidCombinations(uniqueItems, listFormattedPreviousRules, n)
        
    def getValidCombinations(self, uniqueItems, listFormattedPreviousRules, n):
        #print("\n\n\nin get valid combis\n\n\n")
        validCombinations = []
        for item in uniqueItems:
            for rule in listFormattedPreviousRules:
                if(item not in rule):
                    if(self.isCombinationValid(item, rule, listFormattedPreviousRules)):
                        temp =[]
                        temp.append(item)
                        l = rule + temp
                        l.sort()
                        validCombinations.append(tuple(l))
        return list(set(validCombinations))
          
    def isCombinationValid(self, item, rule, listFormattedPreviousRules):
        flag = True
        temp = []
        for prevRule in listFormattedPreviousRules:
            prevRule.sort()
            temp.append(prevRule)
        for i in rule:
            y = rule[:]  # fastest way to copy
            y.remove(i)
            y.append(item)
            y.sort()
            if(y not in temp):
                flag = False
                break
        return flag
            
        
    def findItemCount(self, columnName, data):
        item_count={}
        for v,k in data[columnName].iteritems():
            for i in k:
                if i in item_count:
                    item_count[i]=item_count[i]+1
                else:
                    item_count[i]=1
        print("itemcount\n", item_count)
        print("unique items:", len(item_count))
        return item_count
    
    def findLambda(self, item_count):
        lamb = 0
        for val in item_count.values(): 
            lamb += val 
        lamb = lamb / len(item_count) 
        return lamb
    
    def mis(self, item_count):
        sd=self.findLambda(item_count)*(1-self.alpha)
        mis=[]
        misdict={}
        mis_val={}
        for k, v in item_count.items():
            m=item_count[k]-sd
            if(m<self.ls):
                mis.append(self.ls)
                misdict[item_count[k]]=self.ls
                mis_val[k]=self.ls
            else:
                mis.append(m)
                misdict[item_count[k]]=m
                mis_val[k]=m
        return mis_val
    
    def getMinMIS(self, mis_val):
        return min(mis_val.values())
    
    def getIndividualItems(self, mis_val, item_count):
        L_2={}
        for k, v in item_count.items():
            if(item_count[k]>=mis_val[k]):
                L_2[k]=mis_val[k]
        uniqueItems = list(L_2.keys())
        temp= sorted(L_2.items(), key=operator.itemgetter(0))
        L_21=sorted(temp, key=operator.itemgetter(1), reverse=True)
        print("individual items size: ", len(L_21))
        #print("items:", L_21)
        print("L21 Keys: ",uniqueItems)
        return uniqueItems, L_21
    
    def getNItemsetRules(self, n, individualUniqueItems, listFormattedPreviousRules, mis_val, columnName, data, courseNameColumn):
        start_time = time.time()
        CN=self.findsubsets(individualUniqueItems, listFormattedPreviousRules, n)
        if(n>2):
            print("combination:", CN)
        print("time taken to get all combinations: ", time.time() - start_time)
        print("combination size", len(CN))
        print("combinations:",CN[:3])
        #count for each candidate itemset
        start_time = time.time()
        count = self.getSupportOfEachCombination(CN, data, columnName)
        print("time taken to get support of each combination : ", time.time() - start_time)
        #checking criteria for count
        start_time = time.time()
        itemSet={}
        indivUniqueItems =[]
        for k, v in count.items():
                m=count[k]*100/data.shape[0]
                if(m > self.getMinMIS(mis_val)):
                    itemSet[k]=count[k]
        
        indivUniqueItems, listFormattedRules = self.getUnqiueValidItemsAndListFormatteRules(list(itemSet.keys()))
        temp= sorted(itemSet.items(), key=operator.itemgetter(0))
        sortedRules=sorted(temp, key=operator.itemgetter(1), reverse=True)
        print("time taken to get valid combinations : ", time.time() - start_time)
        
        if(len(sortedRules)>4):
            print("N:", n,"No of rules:", len(sortedRules),"\nRules: ", sortedRules[:3], type(sortedRules[:3][0]))
        file=open('NoCourse'+ str(n)+'_'+str(self.alpha)+'_'+str(self.ls)+'_'+courseNameColumn+'_RulesNoCourse.txt','w+')
        for items in sortedRules:
            file.writelines([str(items)])
        file.close()
        return indivUniqueItems, listFormattedRules, sortedRules
    
    def getUnqiueValidItemsAndListFormatteRules(self, listOfItemSets):
        #keys will be strings separated by ,
        allItems = []
        listFormattedRules = []
        for itemSet in listOfItemSets:
            l = itemSet.split(',')
            allItems+= l
            listFormattedRules.append(l)
        return list(set(allItems)),listFormattedRules
    
    def getSupportOfEachCombination(self, CN, data, columnName):
        count={}
        for i in CN:
            for index, row in data.iterrows():
                if(set(i).issubset( row[columnName])):
                    strSet = ','.join(set(i))
                    if(strSet in count):
                        count[strSet] +=1
                    else:
                        count[strSet] = 1
        return count
            
    def generateAllRules(self, L1List, mis_val, columnName, data, start, end, courseNameColumn):
        rules = {}
        indivUniqueItems = L1List
        listFormattedRules = []
        for i in range(start,end+1):
            indivUniqueItems, listFormattedRules, rule = self.getNItemsetRules(i, indivUniqueItems, listFormattedRules, mis_val, columnName, data, courseNameColumn)
            print("for ", i, " number of rules:", len(rule))
            print("indivudal unique items left:", indivUniqueItems)
            print("number of unique items:", len(indivUniqueItems)) 
            print("rules in list format: ", listFormattedRules)
            if(len(rule)==0):
                break
            rules[i] = rule
        return rules
    
    def runApriori(self, columnName, df, start, end, courseNameColumn):
        data = df.copy()
        item_count = self.findItemCount(columnName, data)
        mis_val = self.mis(item_count)
        uniqueItems , L1Set = self.getIndividualItems(mis_val, item_count)
        print("L1 set: ", L1Set)
        rules = self.generateAllRules(uniqueItems, mis_val, columnName, data,start, end, courseNameColumn)
        print("total rules:", len(rules))
        return rules
        
                

In [112]:
def readJson(filename):
    data = {}
    with open(filename) as json_file:
        data = json.load(json_file)
    return data

In [116]:
#Parameter
#1: dataset
#2: Tier, Company name, etc
#3: LS
#4: Alpha
#5: list of column names
start_time = time.time()
softwareToolGroupDict = readJson('softwareToolGroups.json')
columns=['AwardsNLP',
       'ExternalCertificatesDomain', 'ExternalCertificatesKey',
       'GeneralSkills', 'InternshipCompany',
       'InternshipProjectDomain', 'LanguagesNLP',
       'ProjectDetailDomain',
       'PublicationNLP', 'ResearchDomain',
       'ScholarshipsNLP', 'VolunteeringWork',
       'WorkshopsDomain', 'WorkshopsOrg', 'SoftwareTools', 'ProgLanguagesNLP'
]
columnName = 'TierLevel'
columnValue = '1'

data_full= pd.read_csv(r"combinedMasterLatest.csv")
obj1 = PreprocessApriori(data_full, softwareToolGroupDict, columns)
a = obj1.allColumnsItemGeneration(data_full, columnName, columnValue,'ElectiveCourseGrade', 'ElectiveCourseName', False )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SoftwareTools'][i]=l
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SoftwareTools'][i]=l_str
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i]=l
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combinedItems_noCourse'][i]=l


In [None]:
objApriori = Apriori(a, 0.1, 4)
rules = objApriori.runApriori('combinedItemset',a, 2,15, 'ElectiveCourseName')
print("time taken overall ", time.time() - start_time)
        

itemcount
 {'AwardsNLP_iayp': 1, 'AwardsNLP_e-yantra': 1, 'GeneralSkills_Coding, Algorithms, Data Structure, ML and AI, Breathing': 1, 'InternshipProjectDomain_Image Processing': 220, 'InternshipProjectDomain_Algorithms': 29, 'LanguagesNLP_kannada': 76, 'LanguagesNLP_english': 118, 'LanguagesNLP_german': 5, 'ProjectDetailDomain_Cloud computing': 221, 'ProjectDetailDomain_Machine Learning': 119, 'ProjectDetailDomain_Web Development': 108, 'ProjectDetailDomain_Compiler Design': 92, 'ProjectDetailDomain_Big Data': 51, 'ProjectDetailDomain_Image Processing': 187, 'ProjectDetailDomain_Data Structures': 112, 'ProjectDetailDomain_Deep Learning': 81, 'ProjectDetailDomain_Artificial Intelligence': 28, 'ProjectDetailDomain_Micro Processors and Controllers': 16, 'ResearchDomain_Machine Learning': 7, 'ResearchDomain_Image Processing': 15, 'ResearchDomain_Deep Learning': 2, 'ScholarshipsNLP_cnr': 87, 'VolunteeringWork_Helped in organizing #Code 2020': 1, 'WorkshopsOrg_ieee': 4, 'GeneralSkills_Leade

In [119]:
rules

[('ProgLanguagesNLP_c,ProjectDetailDomain_Image Processing,ProjectDetailDomain_Cloud computing,ProgLanguagesNLP_javascript,ProgLanguagesNLP_python,ProgLanguagesNLP_html',
  62),
 ('ProgLanguagesNLP_c,ProjectDetailDomain_Cloud computing,ProgLanguagesNLP_javascript,ProgLanguagesNLP_python,ProgLanguagesNLP_php,ProgLanguagesNLP_html',
  59),
 ('ProgLanguagesNLP_c,ProjectDetailDomain_Image Processing,ProjectDetailDomain_Cloud computing,ProgLanguagesNLP_javascript,ProgLanguagesNLP_python,ProgLanguagesNLP_php',
  59),
 ('ProgLanguagesNLP_c,ProjectDetailDomain_Cloud computing,ProgLanguagesNLP_javascript,ProgLanguagesNLP_python,ProgLanguagesNLP_css,ProgLanguagesNLP_html',
  56),
 ('ProgLanguagesNLP_c,ProjectDetailDomain_Image Processing,ProgLanguagesNLP_javascript,ProgLanguagesNLP_python,ProgLanguagesNLP_php,ProgLanguagesNLP_html',
  55),
 ('ProgLanguagesNLP_c,InternshipProjectDomain_Image Processing,ProjectDetailDomain_Image Processing,ProjectDetailDomain_Cloud computing,ProgLanguagesNLP_javas

In [None]:
#softwar tool, 
#prg languaes,
#generalskills, volunteerin, cocurricular
#external, work, public

#combine ineternship and project domains