In [2]:
import pandas as pd
import numpy as np
import json
from nltk import ngrams
from sklearn.feature_extraction import text
from itsdangerous import URLSafeSerializer
import hashlib
import operator
import itertools
from itertools import combinations, chain
import import_ipynb
import EntropyBasedItemsetGenerationModule as entropy

importing Jupyter notebook from EntropyBasedItemsetGenerationModule.ipynb


In [3]:
class PreprocessApriori:
    def __init__(self, masterData, softwareToolGroupDict, columns):
        self.masterData = masterData
        self.softwareToolGroupDict = softwareToolGroupDict
        self.columns = columns
    
    def getSubsetNotNA(self, df, columnName):
        data = df.copy()
        data = data[data[columnName].notna()]
        return data
        
    def getSubsetForApriori(self, df, columnName, columnvalue):
        df.dropna(subset=[columnName], inplace=True)
        data = df.loc[df[columnName] == columnValue]
        return data
        
    def preprocessSoftwareToolsGroup(self, df, groups):
        data = df.copy()
        var = 'SoftwareTools'
        data[var]=data[var].str.split('##')
        for i,v in data.iterrows():
            #print(data['SoftwareTools'][i])
            l=[]
            if(isinstance(v[var],list)):
                for k in v['SoftwareTools']:
                    if(k!=''):
                        t=k.rstrip("\n")
                        h=t.strip()
                        #print(h)
                        l.append(h.lower())
            data['SoftwareTools'][i]=l
            
        for i,v in data.iterrows():
            l=[]
            for k in v['SoftwareTools']:
                if(k in groups['l1']):
                    l.append("Adobe")
                if(k in groups['l2']):
                    l.append("Cloud Computing and Big Data")
                if(k in groups['l3']):
                    l.append("Python tools")
                if(k in groups['l4']):
                    l.append("Microsoft Tools")
                if(k in groups['l5']):
                    l.append("DBMS Tools")
                if(k in groups['l6']):
                    l.append("Machine Learning Tools")
                if(k in groups['l7']):
                    l.append("Computer Network")
            l_str = '##'.join(l)
            data['SoftwareTools'][i]=l_str
        return data
    
    def preprocessTier(self,data):
        data['TierLevel']=data['TierLevel'].replace(to_replace ="1.0",value ="1")
        data['TierLevel']=data['TierLevel'].replace(to_replace ="2.0",value ="2")
        data['TierLevel']=data['TierLevel'].replace(to_replace ="3.0",value ="3")
        return data
    
    def columnwiseItemPreprocess(self,data,columns):
        df=data.copy()
        for column in columns:
            #print(column)
            df[column]=df[column].str.strip('##').str.split('##')
            for i,v in df.iterrows():
                l=[]
                #print(v)
                #print(type(v[column]))
                if(isinstance(v[column],list)):
                    for k in v[column]:
                        #print(k)
                        if(k!=''):
                            l.append(column+"_"+k)
                #print(l)
                df[column][i]=l
        return df
    
    def columnwiseItemGeneration(self,data,columns):
        df=data.copy()
        df['combinedItems_noCourse']=np.empty((len(df), 0)).tolist()
        for i,v in df.iterrows():
            l=df['combinedItems_noCourse'][i]
            for column in columns:
                #print(v)
                #print(type(v[column]))
                if(isinstance(v[column],list)):
                    l=l+v[column]
            #print(l)
            df['combinedItems_noCourse'][i]=l
        return df
    
    def allColumnsItemGeneration(self, df, columnName, columnValue, gradeColumnName, courseColumnName):
        data_full = df.copy()
        self.generateValidCoursesItem(data_full, columnName, columnValue, gradeColumnName, courseColumnName)
        data_full1 = self.getSubsetNotNA(df, columnName)
        data_full1 = self.preprocessTier(data_full1)
        data_full2 = data_full1.merge(self.validCoursesData.drop_duplicates(subset=['USN']), how ='left')
        data_main = self.getSubsetForApriori(data_full2, columnName, columnValue)
        dataPreProcess = self.preprocessSoftwareToolsGroup(data_main, self.softwareToolGroupDict)
        dataPreProcess1 = self.columnwiseItemPreprocess(dataPreProcess, self.columns)
        dataPreProcess2 = self.columnwiseItemGeneration(dataPreProcess1, self.columns)
        dataPreProcess2['combinedItemset'] =  dataPreProcess2['ValidCourses'] + dataPreProcess2['combinedItems_noCourse']
        #dataPreProcess2['combinedItemset'] =  dataPreProcess2['combinedItems_noCourse']
        return dataPreProcess2
        
    def generateValidCoursesItem(self, data, columnName, columnValue, gradeColumnName, courseColumnName):
        obj = entropy.EntropyBasedItemset(data)
        self.validCoursesData = obj.getOptimalGradeForAllCourses(data, columnName, columnValue, gradeColumnName, courseColumnName)
        self.optimal10thMarks = obj.getEntropy10th(data, columnName, columnValue)
        self.optimal12thMarks = obj.getEntropy12th(data, columnName, columnValue)
    
    def getOptimal10thMarks(self):
        return self.optimal10thMarks
    
    def getOptimal10thMarks(self):
        return self.optimal12thMarks
    
    def getValidCoursesItemset(self):
        return self.validCoursesData

In [4]:
#MODULAR VERSION: 
class Apriori:
    def __init__(self, data, alpha, ls):
        self.data = data
        self.alpha = alpha
        self.ls = ls
        
    def findsubsets(self,s, n):
        return list(map(set, itertools.combinations(s, n)))
        
    def findDomainCount(self, columnName, data):
        print("FOR ", columnName)
        print("LS = ",self.ls)
        print("\n")
        #finding frequency of domains
        domain_count={}
        for v,k in data[columnName].iteritems():
            for i in k:
                if i in domain_count:
                    domain_count[i]=domain_count[i]+1
                else:
                    domain_count[i]=1
        return domain_count
    
    def findLambda(self, domain_count):
        lamb = 0
        for val in domain_count.values(): 
            lamb += val 
        lamb = lamb / len(domain_count) 
        return lamb
    
    def mis(self, domain_count):
        sd=self.findLambda(domain_count)*(1-self.alpha)
        mis=[]
        misdict={}
        mis_val={}
        for k, v in domain_count.items():
            m=domain_count[k]-sd
            if(m<self.ls):
                mis.append(self.ls)
                misdict[domain_count[k]]=self.ls
                mis_val[k]=self.ls
            else:
                mis.append(m)
                misdict[domain_count[k]]=m
                mis_val[k]=m
        #print(mis_val)
        return mis_val
    
    def getMinMIS(self, mis_val):
        return min(mis_val.values())
    
    def getIndividualItems(self, mis_val, domain_count):
        L_2={}
        for k, v in domain_count.items():
            if(domain_count[k]>=mis_val[k]):
                L_2[k]=mis_val[k]
        temp= sorted(L_2.items(), key=operator.itemgetter(0))
        L_21=sorted(temp, key=operator.itemgetter(1), reverse=True)
        return L_21
    
    def getNItemsetRules(self, n, L1Set, mis_val, columnName, data, courseNameColumn):
        CN=self.findsubsets(L1Set, n)
        #count for each candidate itemset
        count={}
        for i in CN:
            for index, row in data.iterrows():
                flag=True
                for j in i:
                    if(j[0] in row[columnName]):
                        flag=True
                    else:
                        flag=False
                        break
                if(flag==True):
                    temp=list(i)
                    t=""
                    for k in temp:
                        t=t+","+k[0]
                    t=t[1:]
                    if(t in count):
                        count[t]=count[t]+1
                    else:
                        count[t]=1
        #checking criteria for count
        itemSet={}
        for k, v in count.items():
                m=count[k]*100/272
                if(m > self.getMinMIS(mis_val)):
                    itemSet[k]=count[k]

        temp= sorted(itemSet.items(), key=operator.itemgetter(0))
        sortedRules=sorted(temp, key=operator.itemgetter(1), reverse=True)
        print(n, sortedRules)
        file=open(str(n)+'_'+str(self.alpha)+'_'+str(self.ls)+'_'+courseNameColumn+'_Rules.txt','w')
        for items in sortedRules:
            file.writelines([str(items)])
        file.close()
        return sortedRules
    
    def generateAllRules(self, L1Set, mis_val, columnName, data, start, end, courseNameColumn):
        rules = {}
        for i in range(start,end+1):
            rule = self.getNItemsetRules(i, L1Set, mis_val, columnName, data, courseNameColumn)
            if(len(rules)==0):
                break
            rules[i] = rule
        return rules
    
    def runApriori(self, columnName, df, start, end, courseNameColumn):
        data = df.copy()
        domain_count = self.findDomainCount(columnName, data)
        mis_val = self.mis(domain_count)
        L1Set = self.getIndividualItems(mis_val, domain_count)
        rules = self.generateAllRules(L1Set, mis_val, columnName, data,start, end, courseNameColumn)
        return rules
        
                

In [5]:
def readJson(filename):
    data = {}
    with open(filename) as json_file:
        data = json.load(json_file)
    return data

In [63]:
#Parameter
#1: dataset
#2: Tier, Company name, etc
#3: LS
#4: Alpha
#5: list of column names
softwareToolGroupDict = readJson('softwareToolGroups.json')
columns=['AwardsNLP',
       'ExternalCertificatesDomain', 'ExternalCertificatesKey',
       'GeneralSkills', 'InternshipCompany',
       'InternshipProjectDomain', 'LanguagesNLP',
       'ProjectDetailDomain', 'PublicationNLP', 'ResearchDomain',
       'ScholarshipsNLP', 'VolunteeringWork',
       'WorkshopsDomain', 'WorkshopsOrg', 'SoftwareTools', 'ProgLanguagesNLP']
columnName = 'TierLevel'
columnValue = '1'

data_full= pd.read_csv(r"combinedMasterLatest.csv")
obj1 = PreprocessApriori(data_full, softwareToolGroupDict, columns)
a = obj1.allColumnsItemGeneration(data_full, columnName, columnValue,'CourseGrade', 'CourseName' )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SoftwareTools'][i]=l
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SoftwareTools'][i]=l_str
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i]=l
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combinedItems_noCourse'][i]=l


In [84]:
objApriori = Apriori(a, 0.1, 4)
objApriori.runApriori('combinedItemset',a, 'CourseName')

FOR  combinedItemset
LS =  4


2 [('CourseName_data base management systems::B and higher,CourseName_data structures::B and higher', 277), ('CourseName_data structures::B and higher,CourseName_digital design & computer organization::C and higher', 277), ('CourseName_data base management systems::B and higher,CourseName_digital design & computer organization::C and higher', 276), ('CourseName_data structures::B and higher,CourseName_design and analysis of algorithms::B and higher', 276), ('CourseName_data base management systems::B and higher,CourseName_design and analysis of algorithms::B and higher', 275), ('CourseName_problem solving with c::B and higher,CourseName_linear algebra and its applications::B and higher', 275), ('CourseName_digital design & computer organization::C and higher,CourseName_design and analysis of algorithms::B and higher', 274), ('CourseName_linear algebra and its applications::B and higher,CourseName_data structures::B and higher', 271), ('CourseName_data bas

{}

In [None]:
#Parameter
#1: dataset
#2: Tier, Company name, etc
#3: LS
#4: Alpha
#5: list of column names
softwareToolGroupDict = readJson('softwareToolGroups.json')
columns=['AwardsNLP',
       'ExternalCertificatesDomain', 'ExternalCertificatesKey',
       'GeneralSkills', 'InternshipCompany',
       'InternshipProjectDomain', 'LanguagesNLP',
       'ProjectDetailDomain', 'PublicationNLP', 'ResearchDomain',
       'ScholarshipsNLP', 'VolunteeringWork',
       'WorkshopsDomain', 'WorkshopsOrg', 'SoftwareTools', 'ProgLanguagesNLP']
columnName = 'TierLevel'
columnValue = '1'

data_full= pd.read_csv(r"combinedMasterLatest.csv")
obj1 = PreprocessApriori(data_full, softwareToolGroupDict, columns)
a = obj1.allColumnsItemGeneration(data_full, columnName, columnValue,'ElectiveCourseGrade', 'ElectiveCourseName' )
objApriori = Apriori(a, 0.1, 4)
objApriori.runApriori('combinedItemset',a, 4,4, 'ElectiveCourseName')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SoftwareTools'][i]=l
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SoftwareTools'][i]=l_str
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i]=l


In [17]:
a = (1,2,3,4)
str(a)

'(1, 2, 3, 4)'