clean datasets

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the employee export, reading the work_postal column as text.
df = pd.read_csv("2021_12_07.csv", dtype={'work_postal':'str'})
# Parse both date columns. Rows without an end date get the fixed placeholder
# 2022-01-01 (presumably "still employed" -- confirm against the data source).
df = df.assign(
    start_date=pd.to_datetime(df['start_date']),
    end_date=pd.to_datetime(df['end_date']).fillna(pd.to_datetime('2022-01-01')),
)

def retentionByGroup(dataframe, group, start, end):
    """Compute the retention rate separately for each value of a column.

    Rows whose ``group`` value is missing (NaN) are ignored.

    Args:
        dataframe: Employee records containing the ``group`` column plus the
            columns ``calcRetentionRate`` needs. Any index is accepted.
        group: Name of the column to split the records on.
        start: Start of the retention window (passed to calcRetentionRate).
        end: End of the retention window (passed to calcRetentionRate).

    Returns:
        dict mapping each distinct non-null group value, in order of first
        appearance, to the value ``calcRetentionRate`` returns for the rows
        of that group.
    """
    # unique() preserves first-appearance order and, unlike label lookups of
    # the form dataframe[group][i], does not require a 0..n-1 index.
    group_values = dataframe[group].dropna().unique()

    retentionGroups = {}
    for value in group_values:
        # Boolean indexing already yields a new frame; renumber it 0..n-1 so
        # downstream helpers can address its rows by position. drop=True
        # avoids inserting the old index as an extra column.
        subset = dataframe[dataframe[group] == value].reset_index(drop=True)
        retentionGroups[value] = calcRetentionRate(subset, start, end)

    return retentionGroups

def getEmpDates(dataframe):
    """Collapse per-row employment spans into one overall span per employee.

    Args:
        dataframe: Frame with ``emplid_sec``, ``start_date`` and ``end_date``
            columns. Any index is accepted; rows are read in positional order.

    Returns:
        dict mapping each ``emplid_sec`` value to a two-item list
        ``[earliest start_date, latest end_date]`` over that employee's rows.
    """
    records = {}

    # Walk the three columns in lockstep instead of looking rows up by label,
    # so frames with a filtered or otherwise non-0..n-1 index work as well.
    rows = zip(
        dataframe['emplid_sec'], dataframe['start_date'], dataframe['end_date']
    )
    for emplid, start_date, end_date in rows:
        record = records.get(emplid)
        if record is None:
            # First row seen for this employee: it defines the initial span.
            records[emplid] = [start_date, end_date]
            continue
        # Widen the stored span when this row starts earlier or ends later.
        if start_date < record[0]:
            record[0] = start_date
        if end_date > record[1]:
            record[1] = end_date

    return records

def partitionDFbyDate(dataframe, start, end):
    """Return one row per employee whose record spans both window endpoints.

    A row is kept when its ``start_date``/``end_date`` interval contains both
    ``start`` and ``end``. If an employee has several such rows, the one with
    the latest ``end_date`` is kept. The result is sorted by ``end_date`` and
    renumbered 0..n-1, with the previous index preserved in an ``index``
    column.
    """
    began_by_start = dataframe['start_date'] <= start
    open_at_start = dataframe['end_date'] >= start
    began_by_end = dataframe['start_date'] <= end
    open_at_end = dataframe['end_date'] >= end

    spanning = dataframe[began_by_start & open_at_start & began_by_end & open_at_end]

    return (
        spanning.sort_values(by="end_date")
        .drop_duplicates(subset=['emplid_sec'], keep="last")
        .reset_index()
    )

def calcRetentionRate(dataframe, start, end):
    """Return the percentage of employees active at ``start`` who remain at ``end``.

    Each employee's overall span comes from ``getEmpDates``. Only employees
    whose earliest start is on or before ``start`` are considered:

    * last end on or after ``end``             -> counted and retained
    * last end strictly between the two dates  -> counted, not retained

    Returns:
        The retention rate as a percentage (0-100), or the string
        "No employees in date range" when nobody is counted.
    """
    stayed = 0
    cohort = 0

    for first_start, last_end in getEmpDates(dataframe).values():
        if not first_start <= start:
            # Hired after the window opened: not part of the cohort.
            continue
        if last_end >= end:
            cohort += 1
            stayed += 1
        elif last_end > start and last_end < end:
            # Active when the window opened but left before it closed.
            cohort += 1

    if cohort == 0:
        return "No employees in date range"
    return (stayed/cohort) * 100

def getJobTitles(dataframe):
    """Build the empty per-job-title statistics table.

    Args:
        dataframe: Frame with a ``jobtitle`` column. Any index is accepted.

    Returns:
        dict whose first entry, under the key ``'index'``, holds the three
        column labels, followed by one entry per distinct job title (in order
        of first appearance) mapped to a fresh ``[0, 0, 0]`` placeholder for
        job count, share of total jobs and average comp rate.
    """
    jobTitles = {}
    jobTitles['index'] = ['job count', 'percentage of total jobs', 'average comp rate']

    # Iterate the column's values directly; label lookups such as
    # dataframe['jobtitle'][i] fail on frames without a 0..n-1 index.
    for title in dataframe['jobtitle']:
        if title not in jobTitles:
            jobTitles[title] = [0, 0, 0]

    return jobTitles

def jobTitleCountsRate(jobs, dataframe):
    """Fill in the job count and share-of-total for every job title, in place.

    Args:
        jobs: Table as produced by ``getJobTitles``: each job title maps to a
            three-item list, and the ``'index'`` key holds the column labels.
            Every title in ``dataframe`` must already be a key (KeyError
            otherwise).
        dataframe: Frame with a ``jobtitle`` column. Any index is accepted.

    Returns:
        The same ``jobs`` dict. Slot 0 of each title is incremented once per
        matching row; slot 1 is set to slot 0 divided by the number of rows
        (a fraction in 0..1). Slot 1 is left untouched when ``dataframe`` has
        no rows.
    """
    # Iterate values positionally; label lookups break on non-default indexes.
    for title in dataframe['jobtitle']:
        jobs[title][0] += 1

    total = len(dataframe)
    if total:
        for title, jobData in jobs.items():
            # Skip the label row by name rather than by assuming it is first.
            if title == 'index':
                continue
            jobData[1] = jobData[0] / total

    return jobs

def jobTitleCounts(jobs, dataframe):
    """Increment each job title's count (slot 0) once per matching row.

    Args:
        jobs: dict mapping job titles to mutable lists whose first item is
            the running count. Every title in ``dataframe`` must already be a
            key (KeyError otherwise).
        dataframe: Frame with a ``jobtitle`` column. Any index is accepted.

    Returns:
        The same ``jobs`` dict, updated in place.
    """
    # Iterate values positionally; label lookups break on non-default indexes.
    for title in dataframe['jobtitle']:
        jobs[title][0] += 1
    return jobs

def getAvgComprate(jobs, dataframe):
    """Compute the average comp rate per job title (slot 2), in place.

    Args:
        jobs: Table as produced by ``getJobTitles`` with slot 0 of every job
            title already holding its row count; a zero count leads to a
            division by zero. The ``'index'`` key (column labels) is skipped.
        dataframe: Frame with ``jobtitle`` and ``comprate`` columns. Any
            index is accepted.

    Returns:
        The same ``jobs`` dict, where slot 2 of each title is its previous
        value plus the sum of that title's ``comprate`` values, divided by
        slot 0.
    """
    # First pass: accumulate the comp-rate sum per title. Values are read
    # positionally so non-default indexes work.
    for title, comprate in zip(dataframe['jobtitle'], dataframe['comprate']):
        jobs[title][2] += comprate

    # Second pass: turn each sum into an average using the stored count.
    for title, jobData in jobs.items():
        # Skip the label row by name rather than by assuming it is first.
        if title == 'index':
            continue
        jobData[2] = jobData[2] / jobData[0]

    return jobs

def jobsAnalysis(dataframe, start, end):
    """Build the per-job-title statistics table for a date window.

    The records are first narrowed with ``partitionDFbyDate``; the table is
    then created by ``getJobTitles`` and filled in place by
    ``jobTitleCountsRate`` and ``getAvgComprate``.

    Returns:
        dict mapping each job title to ``[count, share of total, average
        comp rate]``, plus the ``'index'`` entry holding the column labels.
    """
    window_df = partitionDFbyDate(dataframe, start, end)
    job_table = getJobTitles(window_df)
    # Both helpers mutate job_table; counts must be filled before averages.
    jobTitleCountsRate(job_table, window_df)
    getAvgComprate(job_table, window_df)
    return job_table

def getJobRaiseRates(dataframe):
    """Return, per job title, the number of raises per employee stint.

    Rows are compared with the row immediately before them, so the frame is
    assumed to be ordered such that an employee's rows are adjacent and in
    chronological order -- TODO confirm with callers.

    A "stint" starts whenever the (employee, job title) pair differs from the
    previous row. A "raise" is a row that continues the previous row's stint
    with a strictly higher ``comprate``.

    Args:
        dataframe: Frame with ``emplid_sec``, ``jobtitle`` and ``comprate``
            columns. Any index is accepted.

    Returns:
        dict mapping job title to raises divided by stints, ordered by
        ascending rate.
    """
    raiseCounts = {}
    jobCounts = {}
    previous = None  # (emplid, jobtitle, comprate) of the preceding row

    # Read the columns positionally; label lookups such as
    # dataframe['comprate'][record - 1] break on non-default indexes.
    rows = zip(
        dataframe['emplid_sec'], dataframe['jobtitle'], dataframe['comprate']
    )
    for emplid, title, comprate in rows:
        raiseCounts.setdefault(title, 0)

        same_stint = (
            previous is not None
            and emplid == previous[0]
            and title == previous[1]
        )
        if same_stint:
            if comprate > previous[2]:
                raiseCounts[title] += 1
        else:
            # New employee or new title for this employee: a new stint.
            jobCounts[title] = jobCounts.get(title, 0) + 1

        previous = (emplid, title, comprate)

    # Every title in raiseCounts opened at least one stint, so the divisor
    # is never zero.
    rates = {title: raiseCounts[title] / jobCounts[title] for title in raiseCounts}

    return dict(sorted(rates.items(), key=lambda item: item[1]))
        
# Analysis window: the one-year period ending on 2021-12-07.
startdate = pd.Timestamp("2020-12-07")
enddate = pd.Timestamp("2021-12-07")
# One row per employee whose record spans the whole window.
new_df = partitionDFbyDate(df, startdate, enddate)
# Disabled: raise rates per job title over the window.
# raiseCounts = getJobRaiseRates(new_df)

#print(raiseCounts)
#divide by job counts

# Disabled: job-title statistics and the overall retention rate.
# jobs = jobsAnalysis(df, startdate, enddate)
# retentionRate = calcRetentionRate(df, startdate, enddate)
# print(retentionRate)
# Active analysis: retention rate for each value of highest_educ_lvl.
retentionGroup = retentionByGroup(df, "highest_educ_lvl", startdate, enddate)
print(retentionGroup)

#print(jobs)
#print(retentionRate)
#print(retentionEthnicity)
# Disabled: export of the job-title table as CSV rows (title, count, share, average comp rate).
# with open('rc_jobcounts.csv', 'w') as f:
#     for key in jobs.keys():
#         f.write("%s,%s,%s,%s\n"%(key,jobs[key][0],jobs[key][1],jobs[key][2]))

{'A': 87.69829392397486, 'G': 77.64705882352942, 'D': 81.31313131313132, 'I': 70.58823529411765, 'C': 76.13636363636364, 'J': 66.66666666666666, 'H': 80.55555555555556, 'E': 75.86206896551724, 'F': 76.66666666666667, 'K': 100.0, 'L': 100.0, 'B': 80.0}
