In [4]:
import hashlib
import json
import re

import numpy as np
import pandas as pd
from itsdangerous import URLSafeSerializer
from nltk import ngrams
from sklearn.feature_extraction import text

In [5]:
class PreProcessing:
    """Numeric clean-up steps for the merged placement/academics/resume table.

    Every public method takes a DataFrame and returns the processed frame.
    The CTC/tier methods modify the frame they are given; the marks methods
    work on a copy.
    """

    # Fragments removed from the marks columns before the number is extracted.
    _TENTH_NOISE = ('CGPA', 'GPA', 'GRADE', ':', 'ICSE', 'CBSE', ' ')
    _TWELFTH_NOISE = (':', 'ICSE', 'CBSE', ' ')

    def __init__(self, masterData):
        """Keep a reference to the frame the notebook will preprocess."""
        self.masterData = masterData

    def fillCTCNa(self, df):
        """Forward-fill missing ``CTC`` values within each ``CompanyName`` group."""
        df['CTC'] = df.groupby('CompanyName')['CTC'].ffill()
        return df

    def fillTierNa(self, df):
        """Forward-fill missing ``TierLevel`` values within each ``CompanyName`` group."""
        df['TierLevel'] = df.groupby('CompanyName')['TierLevel'].ffill()
        return df

    @staticmethod
    def _normaliseCTC(value):
        """Return one CTC cell in the common unit used by the tier thresholds.

        Blank strings become 0, values above 100 are divided by 100000 and
        everything else (including NaN) is returned unchanged.
        """
        if isinstance(value, str) and not value.strip():
            return 0
        if float(value) > 100:
            return float(value) / 100000
        return value

    def PreprocessCTC(self, data):
        """Convert the ``CTC`` column of ``data`` to the standard format.

        The column is replaced on ``data`` itself, which is also returned.
        Raises ``ValueError`` for a non-blank cell that is not numeric.
        """
        # Built as a whole column instead of cell-by-cell ``set_value`` calls,
        # which no longer exist in pandas >= 1.0.
        data['CTC'] = [self._normaliseCTC(value) for value in data['CTC']]
        return data

    def TierCorrectionPreProcess(self, data):
        """Normalise ``CTC`` and derive ``TierLevel`` where it is missing.

        Mapping (2021 tier mapping): 0 < CTC <= 5 -> tier 3, 5 < CTC <= 10 ->
        tier 2, CTC > 10 -> tier 1.  Rows whose CTC is 0 or not numeric keep a
        missing tier.  Existing tier values are never overwritten.
        """
        df_ctc = self.PreprocessCTC(data)
        df_ctc['CTC'] = pd.to_numeric(df_ctc['CTC'], errors='coerce')
        ctc = df_ctc['CTC']
        untiered = df_ctc['TierLevel'].isnull()
        df_ctc.loc[untiered & (ctc > 0) & (ctc <= 5), 'TierLevel'] = 3
        df_ctc.loc[untiered & (ctc > 5) & (ctc <= 10), 'TierLevel'] = 2
        df_ctc.loc[untiered & (ctc > 10), 'TierLevel'] = 1
        return df_ctc

    @staticmethod
    def _extractMarks(column, noise):
        """Strip the ``noise`` fragments from ``column`` and parse the first number.

        Cells are converted to text first so that numeric cells in a mixed
        column are parsed instead of being turned into NaN by the ``.str``
        accessor.  Cells without any digits become NaN.
        """
        cleaned = column.astype(str)
        for fragment in noise:
            # Substring removal; a plain Series.replace would only match
            # cells that are exactly equal to the fragment.
            cleaned = cleaned.str.replace(fragment, '', regex=False)
        return cleaned.str.extract(r'(\d*\.?\d+)', expand=False).astype(float)

    def PreProcess10thMarks(self, df):
        """Return a copy of ``df`` with ``10thPercentage`` as a float percentage.

        Values in (1, 10] are treated as CGPA and multiplied by 9.5, values
        below 1 are treated as fractions and multiplied by 100, and missing
        values are replaced by the column mean.
        """
        df1 = df.copy()
        marks = self._extractMarks(df1['10thPercentage'], self._TENTH_NOISE)
        marks = marks.mask((marks > 1) & (marks <= 10), marks * 9.5)
        marks = marks.mask(marks < 1, marks * 100)
        df1['10thPercentage'] = marks.fillna(marks.mean())
        return df1

    def PreProcess12thMarks(self, df):
        """Return a copy of ``df`` with ``12thPercentage`` as a float percentage.

        Values below 1 are treated as fractions and multiplied by 100; missing
        values are replaced by the column mean.
        """
        df1 = df.copy()
        marks = self._extractMarks(df1['12thPercentage'], self._TWELFTH_NOISE)
        marks = marks.mask(marks < 1, marks * 100)
        df1['12thPercentage'] = marks.fillna(marks.mean())
        return df1
    

In [13]:
class NLPPreProcess:
    """Keyword and dictionary based tagging of the free-text resume columns.

    Two kinds of look-up are used throughout:

    * keyword lists - every keyword found as a substring of the lower-cased
      cell is appended to the output column as ``keyword##``;
    * label dictionaries - ``{label: pattern}`` or ``{label: [patterns]}``;
      every label with at least one pattern found in the cell is appended as
      ``label##`` (in dictionary order).

    All matching is positional, so the frame's index labels do not matter.
    """

    # (key in the ``groups`` mapping, label written for a tool listed under it)
    _SOFTWARE_GROUP_LABELS = (
        ('l1', 'Adobe'),
        ('l2', 'Cloud Computing and Big Data'),
        ('l3', 'Python tools'),
        ('l4', 'Microsoft Tools'),
        ('l5', 'DBMS TOOLS'),
        ('l6', 'Machine Learning Tools'),
        ('l7', 'Computer Network'),
    )

    # (placeholder searched for in the normalised text, tag written to the output)
    _PROG_LANGUAGE_TAGS = (
        ('plusplus', 'c++'), ('c', 'c'), ('rest', 'rest'), ('html', 'html'),
        ('webdesign', 'css'), ('php', 'php'), ('java-oop', 'java'),
        ('js', 'javascript'), ('sharp', 'csharp'), ('python', 'python'),
        ('rprog', 'r'), ('s@ala', 'scala'), ('ajax', 'ajax'), ('bash', 'bash'),
        ('unix', 'unix'), ('sql', 'sql'), ('go', 'go'), ('matlab', 'matlab'),
        ('arduino', 'arduino'),
    )

    def __init__(self, masterData, award_keywords, languages, company_dict, scholarships, project_domains, workshop_org, publications, extCertificates_keywords, extCertificates_domains, coCurr_dict, generalSkills_dict):
        """Store the frame and the look-up tables.

        Note: ``masterData`` is re-indexed in place to 0..n-1, so the caller's
        frame is affected as well.
        """
        self.masterData = masterData
        self.masterData.reset_index(drop=True, inplace=True)
        self.stop = text.ENGLISH_STOP_WORDS
        self.awards_keywords = award_keywords
        self.languages = languages
        self.company_dict = company_dict
        self.scholarships = scholarships
        self.project_domains = project_domains
        self.workshop_org = workshop_org
        self.publications = publications
        self.extCertificates_keywords = extCertificates_keywords
        self.extCertificates_domains = extCertificates_domains
        self.coCurr_dict = coCurr_dict
        self.generalSkills_dict = generalSkills_dict

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _joinTags(tags):
        """Concatenate tags, each followed by the ``##`` separator."""
        return ''.join(tag + '##' for tag in tags)

    @staticmethod
    def _lowerFilled(df, column):
        """Replace missing cells of ``column`` with '' and lower-case it in ``df``."""
        df[column] = df[column].fillna('').str.lower()
        return df[column]

    @classmethod
    def _keywordTags(cls, cells, keywords):
        """Return one tag string per cell listing the keywords it contains."""
        keywords = list(keywords)
        return [
            cls._joinTags(key for key in keywords if key in cell) if isinstance(cell, str) else ''
            for cell in cells
        ]

    @staticmethod
    def _matchingLabels(cell, mapping):
        """Return the labels of ``mapping`` whose pattern(s) occur in ``cell``.

        A mapping value may be a single string or a list of strings; values of
        any other type are ignored.
        """
        labels = []
        for label, patterns in mapping.items():
            if isinstance(patterns, str):
                patterns = [patterns]
            elif not isinstance(patterns, list):
                continue
            if any(pattern in cell for pattern in patterns):
                labels.append(label)
        return labels

    @classmethod
    def _mappingTags(cls, cells, mapping, nullValue):
        """Return one tag string per cell; missing cells yield ``nullValue``."""
        return [
            nullValue if pd.isnull(cell) else cls._joinTags(cls._matchingLabels(cell, mapping))
            for cell in cells
        ]

    # ------------------------------------------------------------------
    # Pipeline
    # ------------------------------------------------------------------
    def preProcessAllColumns(self, df, groups=None):
        """Run every tagging step on ``df`` and return the resulting frame.

        ``groups`` is the software-tool grouping passed to
        ``preprocessSoftwareToolsGroup`` (keys ``'l1'`` .. ``'l7'``).  When it
        is omitted the notebook-level global ``groups`` is used, which is what
        this method relied on before the parameter existed.

        Raises ``ValueError`` when no grouping is available.
        """
        if groups is None:
            groups = globals().get('groups')
        if groups is None:
            raise ValueError(
                "preProcessAllColumns needs the software tool groups: pass groups=... "
                "or define a global named 'groups' with keys 'l1'..'l7'"
            )
        df = self.preProcessAwards(df)
        df = self.preProcessLanguages(df)
        df = self.preProcessCompanyName(df)
        df = self.preProcessScholarships(df)
        df = self.preProcessProjectDomain(df)
        df = self.preProcessInternshipDomain(df)
        df = self.preProcessInternshipCompany(df)
        df = self.preProcessWorkshop(df)
        df = self.preProcessPublications(df)
        df = self.preProcessResearch(df)
        df = self.preProcessExternalCertificates(df)
        df = self.preProcessProgLanguages(df)
        df = self.preprocessSoftwareToolsGroup(df, groups)
        df = self.preProcessCoCurricular(df)
        df = self.preProcessGeneralSkills(df)
        return df

    # ------------------------------------------------------------------
    # Keyword-list columns
    # ------------------------------------------------------------------
    def preProcessAwards(self, df):
        """Add ``AwardsNLP``: award keywords found in ``Awards``, separated by ``##``.

        A keyword counts as found when it occurs in the lower-cased text or in
        that text with stop words removed.  The stop-word pass used to be
        computed and thrown away, so keywords written without stop words
        (e.g. 'centre innovation entrepreneur') could never match.
        """
        awards = self._lowerFilled(df, 'Awards')
        keywords = list(self.awards_keywords)
        tags = []
        for cell in awards:
            if not isinstance(cell, str):
                tags.append('')
                continue
            bare = ' '.join(word for word in cell.split() if word not in self.stop)
            tags.append(self._joinTags(key for key in keywords if key in cell or key in bare))
        df['AwardsNLP'] = tags
        return df

    def preProcessLanguages(self, df):
        """Add ``LanguagesNLP``: known spoken languages found in ``Languages``."""
        cells = self._lowerFilled(df, 'Languages')
        df['LanguagesNLP'] = self._keywordTags(cells, self.languages)
        return df

    def preProcessScholarships(self, df):
        """Add ``ScholarshipsNLP``: scholarship keywords found in ``Scholarships``."""
        cells = self._lowerFilled(df, 'Scholarships')
        df['ScholarshipsNLP'] = self._keywordTags(cells, self.scholarships)
        return df

    def preProcessPublications(self, df):
        """Add ``PublicationNLP``: publication keywords found in ``PublicationDetails``."""
        cells = self._lowerFilled(df, 'PublicationDetails')
        df['PublicationNLP'] = self._keywordTags(cells, self.publications)
        return df

    def preProcessExternalCertificates(self, df):
        """Add ``ExternalCertificatesKey`` and ``ExternalCertificatesDomain``."""
        cells = self._lowerFilled(df, 'ExternalCertificates')
        df['ExternalCertificatesKey'] = self._keywordTags(cells, self.extCertificates_keywords)
        df['ExternalCertificatesDomain'] = self._keywordTags(cells, self.extCertificates_domains)
        return df

    def preProcessWorkshop(self, df):
        """Add ``WorkshopsOrg`` (organiser keywords) and ``WorkshopsDomain`` (project domains)."""
        cells = self._lowerFilled(df, 'Workshops')
        df['WorkshopsOrg'] = self._keywordTags(cells, self.workshop_org)
        df['WorkshopsDomain'] = self._mappingTags(cells, self.project_domains, '')
        return df

    # ------------------------------------------------------------------
    # Programming languages and software tools
    # ------------------------------------------------------------------
    @staticmethod
    def _normaliseProgLanguages(cell):
        """Rewrite a lower-cased language list so substring search is less ambiguous.

        Names that would collide as substrings are replaced by placeholders
        (``c++`` -> ``plusplus``, ``c#`` -> ``sharp``, a stand-alone ``r`` ->
        ``rprog``, ``scala`` -> ``s@ala`` ...).  The list is wrapped in ``##``
        so that the first and the last entry are delimited like the others.
        """
        if not isinstance(cell, str):
            return ''
        out = cell.replace(' ', '')
        out = out.replace('c++', 'plusplus')
        out = out.replace('c#', 'sharp')
        out = out.replace('flask', 'rest')
        out = '##' + out.replace(',', '##') + '##'
        out = out.replace('css', 'webdesign')
        out = out.replace('#r#', 'rprog')
        # Regular expression on purpose: any single character between
        # "node" and "js" ("node.js", "node-js", ...).
        out = re.sub('node.js', 'node', out)
        out = out.replace('nodejs', 'node')
        # Collapse both spellings to "js" before "java" is rewritten, so that
        # JavaScript is not reported as Java.
        out = out.replace('js', 'javascript')
        out = out.replace('javascript', 'js')
        out = out.replace('java', 'java-oop')
        out = out.replace('scala', 's@ala')
        return out

    def preProcessProgLanguages(self, data):
        """Return a copy of ``data`` with ``ProgLanguagesNLP`` added.

        ``ProgLanguages`` is lower-cased (missing -> '') in the copy.  The tag
        order follows ``_PROG_LANGUAGE_TAGS``.  Matching stays substring
        based, so e.g. any remaining letter 'c' still produces the ``c`` tag.
        """
        df = data.copy()
        cells = self._lowerFilled(df, 'ProgLanguages')
        tags = []
        for cell in cells:
            normalised = self._normaliseProgLanguages(cell)
            tags.append(self._joinTags(
                tag for needle, tag in self._PROG_LANGUAGE_TAGS if needle in normalised
            ))
        df['ProgLanguagesNLP'] = tags
        return df

    def preprocessSoftwareToolsGroup(self, df, groups):
        """Return a copy of ``df`` whose ``SoftwareTools`` cells are lists of group labels.

        Each cell is split on ``##``; every non-empty entry is stripped and
        lower-cased and then looked up in ``groups['l1']`` .. ``groups['l7']``.
        One label is emitted per (tool, group) hit, so labels can repeat.
        Cells that are not strings produce an empty list.
        """
        data = df.copy()
        grouped = []
        for cell in data['SoftwareTools']:
            if not isinstance(cell, str):
                grouped.append([])
                continue
            tools = [
                entry.rstrip("\n").strip().lower()
                for entry in cell.split('##')
                if entry != ''
            ]
            grouped.append([
                label
                for tool in tools
                for key, label in self._SOFTWARE_GROUP_LABELS
                if tool in groups[key]
            ])
        # Explicit Series so that equal-length lists are stored as cell
        # values and never interpreted as a 2-D block.
        data['SoftwareTools'] = pd.Series(grouped, index=data.index, dtype=object)
        return data

    # ------------------------------------------------------------------
    # Label-dictionary columns
    # ------------------------------------------------------------------
    def preProcessCoCurricular(self, df):
        """Add ``CoCurricularActivitiesNLP``; rows without text get NaN."""
        df['CoCurricularActivities'] = df['CoCurricularActivities'].str.lower()
        df['CoCurricularActivitiesNLP'] = self._mappingTags(
            df['CoCurricularActivities'], self.coCurr_dict, np.nan)
        return df

    def preProcessGeneralSkills(self, df):
        """Add ``GeneralSkillsNLP``; rows without text get NaN."""
        df['GeneralSkills'] = df['GeneralSkills'].str.lower()
        df['GeneralSkillsNLP'] = self._mappingTags(
            df['GeneralSkills'], self.generalSkills_dict, np.nan)
        return df

    def preProcessCompanyName(self, df):
        """Add ``CompanyNameNLP``: the canonical company label for ``CompanyName``.

        Unlike the other columns this holds a single label without ``##``.
        When several labels match, the last one in dictionary order wins;
        no match gives '' and a missing name gives NaN.
        """
        df['CompanyName'] = df['CompanyName'].str.lower()
        canonical = []
        for cell in df['CompanyName']:
            if pd.isnull(cell):
                canonical.append(np.nan)
                continue
            labels = self._matchingLabels(cell, self.company_dict)
            canonical.append(labels[-1] if labels else '')
        df['CompanyNameNLP'] = canonical
        return df

    def preProcessProjectDomain(self, df):
        """Add ``ProjectDetailNLP`` (lower-cased text) and ``ProjectDetailDomain``."""
        df['ProjectDetailNLP'] = df['ProjectDetail'].str.lower()
        df['ProjectDetailDomain'] = self._mappingTags(
            df['ProjectDetailNLP'], self.project_domains, '')
        return df

    def preProcessInternshipDomain(self, df):
        """Add ``InternshipDetailNLP`` (lower-cased text) and ``InternshipProjectDomain``."""
        df['InternshipDetailNLP'] = df['InternshipDetail'].str.lower()
        df['InternshipProjectDomain'] = self._mappingTags(
            df['InternshipDetailNLP'], self.project_domains, '')
        return df

    def preProcessInternshipCompany(self, df):
        """Add ``InternshipCompany``: company labels found in the internship entries.

        ``InternshipDetailNLP`` is split into entries on ``##``; only the part
        of each entry before ``@@`` is searched.  If the column has not been
        created yet it is derived from ``InternshipDetail`` first.
        """
        if 'InternshipDetailNLP' not in df.columns:
            df['InternshipDetailNLP'] = df['InternshipDetail'].str.lower()
        companies = []
        for cell in df['InternshipDetailNLP']:
            if pd.isnull(cell):
                companies.append('')
                continue
            found = []
            for entry in cell.split('##'):
                found.extend(self._matchingLabels(entry.split('@@')[0], self.company_dict))
            companies.append(self._joinTags(found))
        df['InternshipCompany'] = companies
        return df

    def preProcessResearch(self, df):
        """Add ``ResearchDomain``: project domains found in ``PublicationDetails``.

        The text is used as it is; ``preProcessPublications`` is the step that
        lower-cases it, so run that first for case-insensitive matching.
        """
        df['ResearchDomain'] = self._mappingTags(
            df['PublicationDetails'], self.project_domains, '')
        return df

In [7]:
class MasterDataset:
    """Builds the master table from the placement/academics CSV and the resume workbook."""

    def __init__(self, placementAcademicFile, resumeFile, columnRename, dropColumn):
        """Remember the input paths, the column rename map and the columns to drop."""
        self.placementAcademicFile = placementAcademicFile
        self.resumeFile = resumeFile
        self.columnRename = columnRename
        self.dropColumn = dropColumn

    def setMasterDataFinal(self, df):
        """Store a copy of ``df`` as ``self.masterDataFinal``."""
        self.masterDataFinal = df.copy()

    def readCsv(self):
        """Load the placement CSV into ``self.placAcadDf`` and the resume workbook into ``self.resumeDf``."""
        self.placAcadDf = pd.read_csv(self.placementAcademicFile)
        self.resumeDf = pd.read_excel(self.resumeFile)

    def removeNull(self, df):
        """Return the rows that have both ``NumberOfOffers`` and ``CompanyName``.

        An independent copy is returned so that columns can be added to the
        result without touching (or warning about) ``df``.
        """
        keep = df['NumberOfOffers'].notnull() & df['CompanyName'].notnull()
        return df[keep].copy()

    @staticmethod
    def _courseSegments(raw):
        """Split one ``Courses`` cell into ``[core_text]`` or ``[core_text, elective_text]``.

        Single quotes are turned into double quotes and bare ``nan`` values
        become ``0`` so that the embedded course dictionaries are valid JSON.
        Only ``nan`` in a value position is replaced; course names that merely
        contain the letters "nan" are left intact.
        """
        cleaned = raw.replace("\'", "\"")
        cleaned = re.sub(r'([:\[,]\s*)nan(?=\s*[,}\]])', r'\g<1>0', cleaned)
        return cleaned.split('"Core Courses": ')[1].split(', "Y": ')

    @staticmethod
    def _parseCourseSegment(segment):
        """Return ``(names, grades)`` for one core or elective segment.

        NOTE(review): relies on the exact text layout of the ``Courses`` dump
        (a list of ``{"<course dict>": "<grade>"}`` entries) - confirm against
        the CSV producer before changing any slice below.
        """
        names, grades = [], []
        for chunk in segment[2:-2].split('}, {'):
            entry = "{" + str(chunk) + "}"
            parts = entry.split('}": "')
            names.append(json.loads(parts[0][2:] + '}')['Name'])
            # Closing braces of the surrounding structure can leave a quote
            # behind (rows without electives), so quotes are always removed.
            grades.append(parts[1][:-2].replace('"', ''))
        return names, grades

    def separateElecCourseNameGrade(self, df):
        """Return per-row lists of elective course names and grades.

        Rows without an elective (``"Y"``) section get two empty lists.
        """
        elec_name = []
        elec_grade = []
        for raw in df['Courses'].to_list():
            segments = self._courseSegments(raw)
            if len(segments) > 1:
                names, grades = self._parseCourseSegment(segments[1])
            else:
                names, grades = [], []
            elec_name.append(names)
            elec_grade.append(grades)
        return elec_name, elec_grade

    def separateCoreCourseNameGrade(self, df):
        """Return per-row lists of core course names and grades."""
        core_name = []
        core_grade = []
        for raw in df['Courses'].to_list():
            names, grades = self._parseCourseSegment(self._courseSegments(raw)[0])
            core_name.append(names)
            core_grade.append(grades)
        return core_name, core_grade

    def separateCourseNameGrade(self, df, core_name, core_grade, elec_name, elec_grade):
        """Concatenate core and elective lists row by row.

        ``df`` is accepted for backward compatibility and is not used.
        """
        rows = range(0, len(core_name))
        all_courses_name = [core_name[i] + elec_name[i] for i in rows]
        all_courses_grade = [core_grade[i] + elec_grade[i] for i in rows]
        return all_courses_name, all_courses_grade

    def addSepColumnsForCourseGrade(self, df):
        """Add the six course name/grade list columns to ``df`` and return it."""
        core_name, core_grade = self.separateCoreCourseNameGrade(df)
        elec_name, elec_grade = self.separateElecCourseNameGrade(df)
        all_courses_name, all_courses_grade = self.separateCourseNameGrade(df, core_name, core_grade, elec_name, elec_grade)
        df['CourseName'] = all_courses_name
        df['CourseGrade'] = all_courses_grade
        df['ElectiveCourseName'] = elec_name
        df['ElectiveCourseGrade'] = elec_grade
        df['CoreCourseName'] = core_name
        df['CoreCourseGrade'] = core_grade
        return df

    def combinePlacementAcademicsResume(self, dfResume, dfPlacementAcademics):
        """Inner-join placement/academics rows with resume rows on ``USN``.

        Note: the ``SRN`` column of ``dfResume`` is renamed to ``USN`` in
        place, so the caller's frame is changed too.
        """
        dfResume.rename(columns={'SRN': 'USN'}, inplace=True)
        usn_list = dfResume.USN.unique()
        academic_placement_res = dfPlacementAcademics.loc[dfPlacementAcademics['USN'].isin(usn_list)]
        return pd.merge(academic_placement_res, dfResume, on='USN')

    def renameCombinedColumns(self, df):
        """Drop ``self.dropColumn`` and apply ``self.columnRename``; returns a new frame."""
        df1 = self.dropColumns(self.dropColumn, df)
        # Uses the instance's map; this used to read a notebook global of the
        # same name and failed when that global was missing.
        for old, new in self.columnRename.items():
            df1 = df1.rename(columns={old: new})
        return df1

    def dropColumns(self, columnList, df):
        """Return ``df`` without the columns in ``columnList``."""
        return df.drop(columnList, axis=1)
        

In [8]:
def readJson(filename):
    """Parse the JSON document stored in ``filename`` and return the result."""
    with open(filename) as handle:
        return json.load(handle)

In [9]:
def combineDatasetsAcrossBatches(dfBatch1, dfBatch2):
    """Stack two batch frames and drop the index columns left over from CSV round-trips.

    Columns present in only one batch are kept (filled with NaN for the other
    batch) in first-seen order.  ``'Unnamed: 0'`` / ``'Unnamed: 0.1'`` are
    removed when present; their absence is not an error.
    """
    dfCombined = pd.concat([dfBatch1, dfBatch2], ignore_index=True, sort=False)
    return dfCombined.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [10]:
def encryptionDataset(data):
    """Return a copy of ``data`` with ``USN`` and ``Name`` replaced by MD5 hex digests.

    ``USN`` is moved to the first column.  Missing values stay missing and the
    frame passed in is left untouched.

    NOTE(review): an unsalted MD5 digest is pseudonymisation, not encryption;
    identifiers with a predictable format can be recovered by enumeration.
    The algorithm is kept so digests stay comparable with previously exported
    batches.
    """
    def _digest(value):
        if pd.isnull(value):
            return value
        return hashlib.md5(str(value).encode()).hexdigest()

    hashed = data.copy()
    for column in ('USN', 'Name'):
        hashed[column] = [_digest(value) for value in hashed[column]]
    ordered = ['USN'] + [col for col in hashed.columns if col != 'USN']
    return hashed[ordered]

In [11]:
def UsnFirstCol(data):
    """Return ``data`` with ``USN`` as the leading column; the other columns keep their order."""
    trailing = [name for name in data.columns.values if name != 'USN']
    return data[['USN'] + trailing]

In [6]:
# Build the combined master table for this batch.
# Inputs are read from the notebook's working directory.
columnRename = readJson('columnRenameJson.txt')  # mapping: existing column name -> new column name
placementAcademicFileName = 'Placement+Academics1516.csv'
resumeFileName = 'Combined (2).xlsm'
# Columns removed from the merged frame before the rename is applied.
dropColumns = ['StudentId','Name_y','CGPA_y','Mobile','Email','EarnedCredits', 'TotalCredits']
masterObj = MasterDataset(placementAcademicFileName, resumeFileName, columnRename, dropColumns)
# readCsv loads the CSV into masterObj.placAcadDf and the workbook into masterObj.resumeDf.
masterObj.readCsv()
# Join on USN, then drop/rename columns, discard rows without offers or company,
# and finally split the 'Courses' text into name/grade list columns.
combinedMaster = masterObj.combinePlacementAcademicsResume( masterObj.resumeDf, masterObj.placAcadDf)
combinedMaster = masterObj.renameCombinedColumns(combinedMaster)
combinedMaster = masterObj.removeNull(combinedMaster)
combinedMaster = masterObj.addSepColumnsForCourseGrade(combinedMaster)

In [12]:
# Numeric clean-up: marks -> percentages, CTC -> common unit, missing tiers derived from CTC.
masterDataPreProcObj = PreProcessing(combinedMaster)
masterDataPreProc = masterDataPreProcObj.PreProcess10thMarks(masterDataPreProcObj.masterData)
masterDataPreProc = masterDataPreProcObj.PreProcess12thMarks(masterDataPreProc)
masterDataPreProc = masterDataPreProcObj.fillCTCNa(masterDataPreProc)
# PreprocessCTC is deliberately not called here: TierCorrectionPreProcess runs
# it as its first step.  Calling it twice divided any salary that was still
# above 100 after the first pass by 100000 a second time.
masterDataPreProc = masterDataPreProcObj.TierCorrectionPreProcess(masterDataPreProc)
masterDataPreProc = masterDataPreProcObj.fillTierNa(masterDataPreProc)



In [21]:
# Look-up tables for the text tagging step.  Keywords are matched as
# substrings of lower-cased text, so every entry must be lower case.
award_keywords = [
    'distinction award', 'alcoding', 'hackerearth', 'olympiad', 'iisc',
    'outstanding', 'hackathon', 'hackerrank', 'rotary club', 'iayp', 'intel',
    'e-yantra', 'cnr', 'merit', 'robotics', 'machine learning', 'data science',
    'quiz', 'microsoft', 'pesu i/o', 'centre innovation entrepreneur',
    'model united nation', 'mun', 'math', 'chemistry', 'outgoing',
]
scholarships = ['cnr', 'mrd', 'distinction']
publications = ['ieee']
workshop_org = ['cie', 'ieee', 'iisc', 'bits', 'pes', 'dsc', 'nitk', 'b v jagadish', 'intuit', 'amazon', 'nokia', 'tedx']
# NOTE(review): 'internshaala' looks like a misspelling of the platform name - confirm against the resume text.
extCertificates_keywords = ['udemy', 'coursera', 'nptel', 'linkedin', 'mooc', 'hackerrank', 'pesu i/o', 'internshaala', 'german', 'harvard', 'ibm', 'stanford', 'google', 'microsoft']
extCertificates_domains = ['machine learning', 'deep learning', 'cloud', 'data science', 'neural networks', 'android', 'robotics', 'pattern recognition', 'cryptography', 'big data', 'natural language processing']
# NOTE(review): 'gujrati' will not match the spelling 'gujarati' - confirm which one the resumes use.
languages = ['kannada', 'hindi', 'english', 'tamil', 'bengali', 'marathi', 'telugu', 'gujrati', 'german', 'french', 'spanish', 'japanese', 'sanskrit', 'malayalam', 'urdu']
company_dict = readJson('companyNameJson.txt')
coCurr_dict = readJson('coCurricularActivities.json')
generalSkills_dict = readJson('generalSkills.json')

project_domain_dict = readJson('projectDomainList.txt')
# Columns exported to the master CSV.
reqdColumns = ['USN', 'Name', 'CGPA', 'NumberOfOffers',
       'TierLevel', 'EmploymentType', 'CTC', 'Stipend(K)', 'Branch',
       '10thPercentage', '12thPercentage', 'NoofProjects',
       'GitHubLink', 'LinkedInLink', 'GeneralSkills',
       'NoofInternships',
       'ProgLanguages', 'SoftwareTools', 'MinorAttended',  'VolunteeringWork',
       'CoCurricularActivities', 'OtherDetails', 'CourseName',
       'CourseGrade', 'ElectiveCourseName', 'ElectiveCourseGrade',
       'CoreCourseName', 'CoreCourseGrade', 'AwardsNLP', 'LanguagesNLP',
       'CompanyNameNLP', 'ScholarshipsNLP', 'GeneralSkillsNLP', 'CoCurricularActivitiesNLP',
       'ProjectDetailDomain', 'InternshipProjectDomain',
       'InternshipCompany', 'WorkshopsOrg', 'WorkshopsDomain',
       'PublicationNLP', 'ResearchDomain', 'ExternalCertificatesKey',
       'ExternalCertificatesDomain']
nlpMasterObj = NLPPreProcess(masterDataPreProc, award_keywords, languages, company_dict, scholarships, project_domain_dict, workshop_org, publications, extCertificates_keywords, extCertificates_domains, coCurr_dict, generalSkills_dict)
# NOTE(review): the software-tool grouping step reads a notebook-level variable
# named `groups` (keys 'l1'..'l7'); it is not defined in any cell shown here
# and must exist before this line runs.
masterDf = nlpMasterObj.preProcessAllColumns(nlpMasterObj.masterData)
# .copy() so the hashing step works on an independent frame, not on a view of masterDf.
masterDataNew = masterDf[reqdColumns].copy()
masterDataNewEncrypted = encryptionDataset(masterDataNew)
masterDataNewEncrypted = UsnFirstCol(masterDataNewEncrypted)
masterDataNewEncrypted.to_csv('MasterNew.csv')

In [75]:
newBatchData = pd.read_csv('MasterNew.csv')
oldBatchData = pd.read_csv('MasterOld.csv')
# Both files are expected to hold already-hashed USN/Name values, so they are
# used as they are.  These two names were previously assigned only in
# commented-out lines, which made this cell fail on a fresh kernel.
# If a file is NOT hashed yet, wrap it instead:
#     newBatchEncrypted = encryptionDataset(newBatchData)
#     oldBatchEncrypted = encryptionDataset(oldBatchData)
newBatchEncrypted = newBatchData
oldBatchEncrypted = oldBatchData
combinedFinalMasterDataset = combineDatasetsAcrossBatches(newBatchEncrypted,oldBatchEncrypted)
combinedFinalMasterDataset = UsnFirstCol(combinedFinalMasterDataset)

FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [77]:
# Write the combined multi-batch dataset to the working directory.  index=False
# is not passed, so the row index is written as an unnamed first column.
combinedFinalMasterDataset.to_csv('combinedFinalMasterDataset.csv')