clean datasets

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

# Load the raw export; work_postal is forced to str so it is not parsed
# as a number.
df = pd.read_csv("10_05_2021.csv", dtype={'work_postal': 'str'})

# Parse both date columns. Rows with no end_date are given a fixed
# placeholder end date of 2021-11-01 so that date comparisons work on them.
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date']).fillna(
    pd.to_datetime('2021-11-01')
)

def retentionByGroup(dataframe, group, start, end):
    """Compute the retention rate separately for each value of a column.

    Args:
        dataframe: Employment records. Must contain the ``group`` column and
            the columns that ``calcRetentionRate`` needs.
        group: Name of the column whose distinct values define the groups.
        start: Start of the retention window, forwarded to
            ``calcRetentionRate``.
        end: End of the retention window, forwarded to ``calcRetentionRate``.

    Returns:
        dict mapping every distinct, non-missing value of
        ``dataframe[group]`` (in order of first appearance) to the result of
        ``calcRetentionRate`` for the rows belonging to that value.
    """
    retentionGroups = {}

    # dropna()/unique() read the column by position, so this works for any
    # index, not only the default 0..n-1 one (e.g. after filtering rows).
    for key in dataframe[group].dropna().unique():
        # drop=True discards the old index instead of inserting it as an
        # "index" column, which would fail if such a column already exists.
        members = dataframe[dataframe[group] == key].reset_index(drop=True)
        retentionGroups[key] = calcRetentionRate(members, start, end)

    return retentionGroups

def getEmpDates(dataframe):
    """Collapse each employee's rows into a single overall date span.

    Args:
        dataframe: Frame with ``emplid_sec``, ``start_date`` and ``end_date``
            columns. Any index is accepted.

    Returns:
        dict mapping each ``emplid_sec`` value (in order of first appearance)
        to a two-item list ``[start, end]``: the smallest ``start_date`` and
        the largest ``end_date`` seen across that employee's rows.
    """
    records = {}

    # Zipping the columns pairs the values by position, so the frame's index
    # labels are irrelevant (label lookups with 0..n-1 fail on filtered frames).
    rows = zip(
        dataframe['emplid_sec'],
        dataframe['start_date'],
        dataframe['end_date'],
    )
    for emplid, begin, finish in rows:
        record = records.get(emplid)
        if record is None:
            # First row for this employee seeds the span.
            records[emplid] = [begin, finish]
            continue
        # Widen the span in place; the list stored in the dict is updated.
        if begin < record[0]:
            record[0] = begin
        if finish > record[1]:
            record[1] = finish

    return records

def partitionDFbyDate(dataframe, start, end):
    """Return one row per employee for records covering both window edges.

    A row is kept when its [start_date, end_date] interval contains ``start``
    and also contains ``end``. Among the kept rows, only the row that sorts
    last by ``end_date`` is retained for each ``emplid_sec``. The result has a
    fresh 0..n-1 index, and the previous index is preserved as an ``index``
    column.
    """
    begins = dataframe['start_date']
    finishes = dataframe['end_date']

    covers_start = (begins <= start) & (finishes >= start)
    covers_end = (begins <= end) & (finishes >= end)

    return (
        dataframe[covers_start & covers_end]
        .sort_values(by="end_date")
        .drop_duplicates(subset=['emplid_sec'], keep="last")
        .reset_index()
    )

def calcRetentionRate(dataframe, start, end):
    """Percentage of employees present at ``start`` who are still there at ``end``.

    Each employee's overall span comes from ``getEmpDates``. An employee is
    counted when the span begins on or before ``start`` and ends strictly
    after ``start``; they count as retained when the span reaches ``end``.

    Returns:
        The retention rate as a percentage (0-100), or the string
        "No employees in date range" when nobody is counted.
    """
    spans = getEmpDates(dataframe)
    headcount = 0
    stayed = 0

    for first_day, last_day in spans.values():
        if not first_day <= start:
            # Not yet employed when the window opened.
            continue
        if last_day >= end:
            # Still employed when the window closed.
            headcount += 1
            stayed += 1
        elif start < last_day < end:
            # Employed at the start of the window but left before its end.
            headcount += 1

    if headcount == 0:
        return "No employees in date range"
    return (stayed / headcount) * 100

def getJobTitles(dataframe):
    """Build the empty per-job-title statistics table.

    Args:
        dataframe: Frame with a ``jobtitle`` column. Any index is accepted.

    Returns:
        dict whose first entry, ``'index'``, holds the three column labels,
        followed by one entry per distinct job title (in order of first
        appearance) initialised to ``[0, 0, 0]``: job count, share of all
        jobs, and average comp rate.
    """
    jobTitles = {
        'index': ['job count', 'percentage of total jobs', 'average comp rate'],
    }
    # Iterate the column values directly: label lookups with 0..n-1 raise
    # KeyError on frames whose index is not the default range.
    for title in dataframe['jobtitle']:
        # A fresh list per title so the rows never share state.
        jobTitles.setdefault(title, [0, 0, 0])

    return jobTitles

def jobTitleCounts(jobs, dataframe):
    """Fill in the row count and share of rows for every job title, in place.

    Args:
        jobs: dict mapping job title to a mutable ``[count, share, avg]``
            list, as built by ``getJobTitles``. An ``'index'`` entry holding
            column labels is left untouched wherever it sits in the dict.
        dataframe: Frame with a ``jobtitle`` column. Every title in it must
            already be a key of ``jobs``. Any index is accepted.

    Returns:
        None. ``jobs`` is updated in place: slot 0 is incremented once per
        matching row and slot 1 is set to ``count / len(dataframe)``.

    Raises:
        KeyError: If a job title in ``dataframe`` is missing from ``jobs``.
    """
    # Iterate values rather than labels so a non-default index works.
    for title in dataframe['jobtitle']:
        jobs[title][0] += 1

    total = len(dataframe)
    if total == 0:
        # No rows means no shares to compute, and avoids dividing by zero.
        return

    for key, jobData in jobs.items():
        # Skip the label row by name instead of assuming it is the first key.
        if key == 'index':
            continue
        jobData[1] = jobData[0] / total

def getAvgComprate(jobs, dataframe):
    """Fill in the average comp rate for every job title, in place.

    Args:
        jobs: dict mapping job title to a mutable ``[count, share, avg]``
            list whose count slot (index 0) is already populated, e.g. by
            ``jobTitleCounts``. An ``'index'`` entry holding column labels is
            left untouched wherever it sits in the dict.
        dataframe: Frame with ``jobtitle`` and ``comprate`` columns. Every
            title in it must already be a key of ``jobs``. Any index is
            accepted.

    Returns:
        The same ``jobs`` dict, with slot 2 of each job set to the sum of its
        comp rates divided by its count.

    Raises:
        KeyError: If a job title in ``dataframe`` is missing from ``jobs``.
        ZeroDivisionError: If a job's count slot is 0.
    """
    # First pass: accumulate the comp-rate sum per title in slot 2. Values are
    # paired by position, so the frame's index labels do not matter.
    for title, rate in zip(dataframe['jobtitle'], dataframe['comprate']):
        jobs[title][2] += rate

    # Second pass: turn each sum into an average using the count in slot 0.
    for key, jobData in jobs.items():
        # Skip the label row by name instead of assuming it is the first key.
        if key == 'index':
            continue
        jobData[2] = jobData[2] / jobData[0]

    return jobs

def jobsAnalysis(dataframe, start, end):
    """Summarise job titles for employees whose records span the whole window.

    The frame is first narrowed with ``partitionDFbyDate``. A per-title table
    is then created by ``getJobTitles`` and filled in place by
    ``jobTitleCounts`` (count and share) and ``getAvgComprate`` (average
    comp rate).

    Returns:
        The populated job-title dict.
    """
    snapshot = partitionDFbyDate(dataframe, start, end)

    summary = getJobTitles(snapshot)
    # Both helpers mutate ``summary`` in place; counts must be filled before
    # averages because the average divides by the count.
    jobTitleCounts(summary, snapshot)
    getAvgComprate(summary, snapshot)

    return summary
        
# Analysis window: July 2020 through July 2021.
startdate, enddate = pd.Timestamp("2020-07"), pd.Timestamp("2021-07")

# Job-title summary, overall retention, and retention broken down by ethnicity
# for that window.
jobs = jobsAnalysis(df, start=startdate, end=enddate)
retentionRate = calcRetentionRate(df, start=startdate, end=enddate)
retentionEthnicity = retentionByGroup(
    df, group="ethnicity", start=startdate, end=enddate
)

# Uncomment to inspect the results:
#print(jobs)
#print(retentionRate)
#print(retentionEthnicity)

# Uncomment to export the job-title summary as CSV:
# with open('rc_jobcounts.csv', 'w') as f:
#     for key in jobs.keys():
#         f.write("%s,%s,%s,%s\n"%(key,jobs[key][0],jobs[key][1],jobs[key][2]))

{'index': ['job count', 'percentage of total jobs', 'average comp rate'], 'M.H. PEER SPECIALIST': [5, 0.0025733401955738548, 19.8591572], 'SOCIAL SVCS PRACTITIONER II': [34, 0.017498713329902212, 25.976598382352936], 'PROGRAM SPECIALIST II': [21, 0.01080802882141019, 36.19805947619048], 'SOCIAL SVCS PRACTITIONER III': [286, 0.1471950591868245, 34.27455455594407], 'ELIGIBILITY TECHNICIAN II': [137, 0.07050952135872363, 24.656100423357653], 'OFFICE ASSISTANT III': [158, 0.08131755018013381, 20.955143322784817], 'ELIGIBILITY TECHNICIAN I': [27, 0.013896037056098817, 17.824191074074072], 'BEHAVIORAL HEALTH SVC SUPV': [5, 0.0025733401955738548, 46.366193599999995], 'ELIGIBILITY TECHNICIAN III': [54, 0.027792074112197633, 28.79560916666666], 'SECRETARY II': [13, 0.0066906845084920225, 26.48364576923077], 'DEP DIR OF PROGRAMS & OPS': [1, 0.000514668039114771, 68.2699], 'ADMIN SVCS ANALYST I': [1, 0.000514668039114771, 22.9428], 'SECRETARY I': [38, 0.0195573854863613, 23.204071605263163], 'ELI