In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

In [2]:
# making data frame from csv file 
df_sourcefile = pd.read_csv("Data/report_Atos_EmployeeData.csv")

print("Shape of dataframe is: {}".format(df_sourcefile.shape))
df_sourcefile.head()

Shape of dataframe is: (14718, 16)


Unnamed: 0,lastName,firstName,status,username,user-id,currency-code,paycompvalue,start-date,end-date,pay-component,frequency,date-of-birth,start-date.1,end-date.1,end-date.2,fte
0,Trombley,Francois,Active User,ftrombley,1,EUR,6500.0,1/30/2010,12/31/2013,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
1,Trombley,Francois,Active User,ftrombley,1,EUR,6763.26,1/1/2017,12/31/9999,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
2,Trombley,Francois,Active User,ftrombley,1,EUR,6696.3,1/1/2016,12/31/2016,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
3,Trombley,Francois,Active User,ftrombley,1,EUR,6630.0,1/1/2014,12/31/2015,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
4,Trombley,Francois,Active User,ftrombley,1,EUR,6500.0,1/30/2010,12/31/2013,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,1.0


# Data Cleaning

In [3]:
df = df_sourcefile.copy()
df.head()

Unnamed: 0,lastName,firstName,status,username,user-id,currency-code,paycompvalue,start-date,end-date,pay-component,frequency,date-of-birth,start-date.1,end-date.1,end-date.2,fte
0,Trombley,Francois,Active User,ftrombley,1,EUR,6500.0,1/30/2010,12/31/2013,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
1,Trombley,Francois,Active User,ftrombley,1,EUR,6763.26,1/1/2017,12/31/9999,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
2,Trombley,Francois,Active User,ftrombley,1,EUR,6696.3,1/1/2016,12/31/2016,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
3,Trombley,Francois,Active User,ftrombley,1,EUR,6630.0,1/1/2014,12/31/2015,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
4,Trombley,Francois,Active User,ftrombley,1,EUR,6500.0,1/30/2010,12/31/2013,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,1.0


In [4]:
# rename and delete columns
df = df.rename(columns={"start-date.1":"hire-date", "end-date.2":"termination-date", "end-date":"data-date"})
df.drop(['start-date', 'pay-component', 'end-date.1'], axis=1) # TODO: end-date.1 is not getting deleted
df.head()

Unnamed: 0,lastName,firstName,status,username,user-id,currency-code,paycompvalue,start-date,data-date,pay-component,frequency,date-of-birth,hire-date,end-date.1,termination-date,fte
0,Trombley,Francois,Active User,ftrombley,1,EUR,6500.0,1/30/2010,12/31/2013,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
1,Trombley,Francois,Active User,ftrombley,1,EUR,6763.26,1/1/2017,12/31/9999,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
2,Trombley,Francois,Active User,ftrombley,1,EUR,6696.3,1/1/2016,12/31/2016,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
3,Trombley,Francois,Active User,ftrombley,1,EUR,6630.0,1/1/2014,12/31/2015,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88
4,Trombley,Francois,Active User,ftrombley,1,EUR,6500.0,1/30/2010,12/31/2013,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,1.0


In [5]:
#drop dublicate if rows are identical
df.drop_duplicates( keep = 'first', inplace = True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (2839, 16)


In [6]:
#FTE should be lower equal 1
df.drop(df[df.fte > 1].index, inplace=True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (2805, 16)


In [7]:
# calculate daily rate
#TODO
# simplification one year has 12 months every month 4 weeks every week 5 working days
#if df['frequency'] = "MON":
    # 4 weeks with 5 working days for one FTE
    #df['daily-rate'] = ( df['paycompvalue'] / (4 * 5) ) * df['fte']
    # usw....
    
#print("Shape of dataframe is: {}".format(df.shape))

In [8]:
# delete combinations: "inactive user“ + „termination-date = NaN”
df.drop(df[(df.status == "Inactive User") & (pd.isnull(df["termination-date"]))].index, inplace=True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (2306, 16)


In [9]:
# delete status = x
df.drop(df[df.status == "x"].index, inplace=True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (2304, 16)


In [10]:
# delete data date = NaN
df.drop(df[pd.isnull(df["data-date"])].index, inplace=True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (2202, 16)


In [11]:
# overwrite data-date with other date format for sorting
table = df["data-date"].str.split(pat = "/")
df["test"]=df["data-date"]
for i in df["data-date"].index:
    s = pd.Series([table[i][2], table[i][1], table[i][0]])
    df["data-date"][i] = s.str.cat(sep='.')
print("Shape of dataframe is: {}".format(df.shape))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Shape of dataframe is: (2202, 17)


In [12]:
#take newest data.date and delete the rest
df.sort_values(by=['user-id', 'data-date'], ascending = [1, 0], inplace = True)
df.drop_duplicates(subset =["user-id"],  keep = 'first', inplace = True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (729, 17)


In [16]:
# delete date-of.birth = NaN
df.drop(df[pd.isnull(df["date-of-birth"])].index, inplace=True)
print("Shape of dataframe is: {}".format(df.shape))

Shape of dataframe is: (454, 17)


In [20]:
# delete not needed columns
df.drop(['data-date', 'pay-component', 'end-date.1', 'start-date'], axis=1) # TODO: end-date.1 is not getting deleted
df.head()

Unnamed: 0,lastName,firstName,status,username,user-id,currency-code,paycompvalue,start-date,data-date,pay-component,frequency,date-of-birth,hire-date,end-date.1,termination-date,fte,test
1,Trombley,Francois,Active User,ftrombley,1,EUR,6763.26,1/1/2017,9999.31.12,BASESAL_FR,MON,6/5/1978,1/30/2010,4/12/2017,,0.88,12/31/9999
132,Schmidt,Michael,Active User,mschmidt,1000971,AUD,8939.79,1/1/2018,9999.31.12,BASESAL_DE,MON,1/10/1975,3/7/2015,12/31/9999,,1.0,12/31/9999
48,Williams,John,Active User,jwilliams,100112,USD,16000.0,1/1/1996,9999.31.12,BASESAL_US,MON,12/1/1958,3/13/2017,12/31/9999,,1.0,12/31/9999
7787,Cooper,Mya,Inactive User,mcooper,100115,USD,11000.0,1/1/1996,9999.31.12,BASESAL_US,MON,11/30/1980,1/1/1996,12/31/9999,3/31/2019,1.0,12/31/9999
8907,Davis,Sarah,Inactive User,sdavis,100135,USD,24000.0,1/1/1996,9999.31.12,BASESAL_US,MON,4/15/1962,1/1/1996,12/31/9999,3/31/2019,1.0,12/31/9999


# Output

In [17]:
#Save data to XLSX
df.to_excel('Data/output.xlsx', )