## Introduction




In [2]:
# Importing relevant libraries -
import pandas as pd
import numpy as np

# Reading .CSV files into respective pandas datafame objects -
dete_survey = pd.read_csv('dete_survey.csv')
tafe_survey = pd.read_csv('tafe_survey.csv')

### Obserations:
1. Many columns in both datasets are questions of **Y/N** type. Melting our datasets in such a way that all these *question columns* are arranged in to a single column, and their *Y/N* values in another column, will decrease the column size represented by thise question-data to just 2 columns. This will also make our data much easier to interpret.
2. Many values that are supposed to be *NaN* are represented as `-` and `Not Stated` in `tafe_survey` and `dete_survey` respectively. These will need to be changed to `NaN` type.

In [23]:
dete_survey = pd.read_csv('dete_survey.csv',na_values = 'Not Stated')
dete_survey_updated = dete_survey.drop(dete_survey.columns[28:49], axis = 1)
dete_survey_updated.columns = dete_survey_updated.columns.str.lower().str.strip()
dete_survey_updated[:3]

Unnamed: 0,id,separationtype,cease date,dete start date,role start date,position,classification,region,business unit,employment status,...,work life balance,workload,none of the above,gender,age,aboriginal,torres strait,south sea,disability,nesb
0,1,Ill Health Retirement,08/2012,1984.0,2004.0,Public Servant,A01-A04,Central Office,Corporate Strategy and Peformance,Permanent Full-time,...,False,False,True,Male,56-60,,,,,Yes
1,2,Voluntary Early Retirement (VER),08/2012,,,Public Servant,AO5-AO7,Central Office,Corporate Strategy and Peformance,Permanent Full-time,...,False,False,False,Male,56-60,,,,,
2,3,Voluntary Early Retirement (VER),05/2012,2011.0,2011.0,Schools Officer,,Central Office,Education Queensland,Permanent Full-time,...,False,False,True,Male,61 or older,,,,,


In [22]:
tafe_survey_updated = tafe_survey.drop(tafe_survey.columns[17:66], axis = 1)
tafe_survey_updated.columns = tafe_survey_updated.columns.str.lower().str.strip()
tafe_survey_updated[:3]

Unnamed: 0,record id,institute,workarea,cessation year,reason for ceasing employment,contributing factors. career move - public sector,contributing factors. career move - private sector,contributing factors. career move - self-employment,contributing factors. ill health,contributing factors. maternity/family,...,contributing factors. study,contributing factors. travel,contributing factors. other,contributing factors. none,gender. what is your gender?,currentage. current age,employment type. employment type,classification. classification,lengthofserviceoverall. overall length of service at institute (in years),lengthofservicecurrent. length of service at current workplace (in years)
0,6.34133e+17,Southern Queensland Institute of TAFE,Non-Delivery (corporate),2010.0,Contract Expired,,,,,,...,,,,,Female,26 30,Temporary Full-time,Administration (AO),1-2,1-2
1,6.341337e+17,Mount Isa Institute of TAFE,Non-Delivery (corporate),2010.0,Retirement,-,-,-,-,-,...,-,Travel,-,-,,,,,,
2,6.341388e+17,Mount Isa Institute of TAFE,Delivery (teaching),2010.0,Retirement,-,-,-,-,-,...,-,-,-,NONE,,,,,,


In [10]:
dete_survey_updated.columns = dete_survey_updated.columns.str.strip().str.lower().str.replace(' ','_')

In [11]:
replace_dict = {'Record ID':'id',
                'CESSATION YEAR':'cease_date',
                'Reason for ceasing employment': 'separationtype',
                'Gender. What is your Gender?': 'gender',
                'CurrentAge. Current Age': 'age',
                'Employment Type. Employment Type': 'employment_status',
                'Classification. Classification': 'position',
                'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service',
                'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service'
               }

tafe_survey_updated.rename(replace_dict, axis = 1, inplace = True)
tafe_survey_updated.columns

Index(['id', 'Institute', 'WorkArea', 'cease_date', 'separationtype',
       'Contributing Factors. Career Move - Public Sector ',
       'Contributing Factors. Career Move - Private Sector ',
       'Contributing Factors. Career Move - Self-employment',
       'Contributing Factors. Ill Health',
       'Contributing Factors. Maternity/Family',
       'Contributing Factors. Dissatisfaction',
       'Contributing Factors. Job Dissatisfaction',
       'Contributing Factors. Interpersonal Conflict',
       'Contributing Factors. Study', 'Contributing Factors. Travel',
       'Contributing Factors. Other', 'Contributing Factors. NONE', 'gender',
       'age', 'employment_status', 'position', 'institute_service',
       'role_service'],
      dtype='object')

In [12]:
dete_survey_updated.columns

Index(['id', 'separationtype', 'cease_date', 'dete_start_date',
       'role_start_date', 'position', 'classification', 'region',
       'business_unit', 'employment_status', 'career_move_to_public_sector',
       'career_move_to_private_sector', 'interpersonal_conflicts',
       'job_dissatisfaction', 'dissatisfaction_with_the_department',
       'physical_work_environment', 'lack_of_recognition',
       'lack_of_job_security', 'work_location', 'employment_conditions',
       'maternity/family', 'relocation', 'study/travel', 'ill_health',
       'traumatic_incident', 'work_life_balance', 'workload',
       'none_of_the_above', 'gender', 'age', 'aboriginal', 'torres_strait',
       'south_sea', 'disability', 'nesb'],
      dtype='object')

In [13]:
tafe_resignations = tafe_survey_updated[tafe_survey_updated['separationtype']=='Resignation']
tafe_resignations['separationtype'].value_counts()

Resignation    340
Name: separationtype, dtype: int64

In [71]:
dete_resignations = dete_survey_updated[dete_survey_updated['separationtype'].str.contains(r'Resignation')]
dete_resignations['separationtype'].value_counts()

Resignation-Other reasons               150
Resignation-Other employer               91
Resignation-Move overseas/interstate     70
Name: separationtype, dtype: int64

In [72]:
# dete_resignations['cease_date'].str[-4:].astype(float)
dete_resignations.loc[:,'cease_date'] = dete_resignations['cease_date'].str[-4:].astype(float)
dete_resignations['cease_date']

3      2012.0
5      2012.0
8      2012.0
9      2012.0
11     2012.0
        ...  
808    2013.0
815    2014.0
816    2014.0
819    2014.0
821    2013.0
Name: cease_date, Length: 311, dtype: float64

In [73]:
dete_resignations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311 entries, 3 to 821
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   id                                   311 non-null    int64  
 1   separationtype                       311 non-null    object 
 2   cease_date                           300 non-null    float64
 3   dete_start_date                      283 non-null    float64
 4   role_start_date                      271 non-null    float64
 5   position                             308 non-null    object 
 6   classification                       161 non-null    object 
 7   region                               265 non-null    object 
 8   business_unit                        32 non-null     object 
 9   employment_status                    307 non-null    object 
 10  career_move_to_public_sector         311 non-null    bool   
 11  career_move_to_private_sector   

In [74]:
dete_resignations[['dete_start_date','cease_date']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311 entries, 3 to 821
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dete_start_date  283 non-null    float64
 1   cease_date       300 non-null    float64
dtypes: float64(2)
memory usage: 7.3 KB


In [79]:
dete_resignations.loc[:,'institute_service'] = dete_resignations['cease_date'] - dete_resignations['dete_start_date']

In [82]:
dete_resignations['institute_service'].value_counts(dropna = False).sort_index()

0.0     20
1.0     22
2.0     14
3.0     20
4.0     16
5.0     23
6.0     17
7.0     13
8.0      8
9.0     14
10.0     6
11.0     4
12.0     6
13.0     8
14.0     6
15.0     7
16.0     5
17.0     6
18.0     5
19.0     3
20.0     7
21.0     3
22.0     6
23.0     4
24.0     4
25.0     2
26.0     2
27.0     1
28.0     2
29.0     1
30.0     2
31.0     1
32.0     3
33.0     1
34.0     1
35.0     1
36.0     2
38.0     1
39.0     3
41.0     1
42.0     1
49.0     1
NaN     38
Name: institute_service, dtype: int64

In [85]:
dete_survey_updated.columns

Index(['id', 'separationtype', 'cease_date', 'dete_start_date',
       'role_start_date', 'position', 'classification', 'region',
       'business_unit', 'employment_status', 'career_move_to_public_sector',
       'career_move_to_private_sector', 'interpersonal_conflicts',
       'job_dissatisfaction', 'dissatisfaction_with_the_department',
       'physical_work_environment', 'lack_of_recognition',
       'lack_of_job_security', 'work_location', 'employment_conditions',
       'maternity/family', 'relocation', 'study/travel', 'ill_health',
       'traumatic_incident', 'work_life_balance', 'workload',
       'none_of_the_above', 'gender', 'age', 'aboriginal', 'torres_strait',
       'south_sea', 'disability', 'nesb'],
      dtype='object')

detesurveyupdated:

job_dissatisfaction

dissatisfaction_with_the_department

physical_work_environment

lack_of_recognition

lack_of_job_security

work_location

employment_conditions

work_life_balance

workload

In [99]:
print(dete_survey_updated['dissatisfaction_with_the_department'].value_counts(dropna = False))
print(dete_survey_updated['job_dissatisfaction'].value_counts(dropna = False))
print(dete_survey_updated['lack_of_recognition'].value_counts(dropna = False))
print(dete_survey_updated['lack_of_job_security'].value_counts(dropna = False))
print(dete_survey_updated['work_location'].value_counts(dropna = False))
print(dete_survey_updated['employment_conditions'].value_counts(dropna = False))
print(dete_survey_updated['work_life_balance'].value_counts(dropna = False))
print(dete_survey_updated['workload'].value_counts(dropna = False))
print(dete_survey_updated['physical_work_environment'].value_counts(dropna = False))

False    761
True      61
Name: dissatisfaction_with_the_department, dtype: int64
False    733
True      89
Name: job_dissatisfaction, dtype: int64
False    765
True      57
Name: lack_of_recognition, dtype: int64
False    794
True      28
Name: lack_of_job_security, dtype: int64
False    795
True      27
Name: work_location, dtype: int64
False    788
True      34
Name: employment_conditions, dtype: int64
False    605
True     217
Name: work_life_balance, dtype: int64
False    735
True      87
Name: workload, dtype: int64
False    806
True      16
Name: physical_work_environment, dtype: int64


In [93]:
dete_survey_updated[['dissatisfaction_with_the_department','job_dissatisfaction']]

Unnamed: 0,dissatisfaction_with_the_department,job_dissatisfaction
0,False,True
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
817,False,False
818,False,False
819,False,False
820,False,False


In [106]:
dissatisfied= [    'dissatisfaction_with_the_department',
                    'job_dissatisfaction',
                    'lack_of_recognition',
                    'lack_of_job_security',
                    'work_location',
                    'employment_conditions',
                    'work_life_balance',
                    'workload',
                    'physical_work_environment'
              ]
# dete_survey_updated[dissatisfied].isnull().sum() --> Checking for any Null values in BOOL type dissatisfaction rows
dete_survey_updated['dissatisfied'] = dete_survey_updated[dissatisfied].any(axis = 1, skipna = False)
dete_survey_updated['dissatisfied'] .value_counts()

False    447
True     375
Name: dissatisfied, dtype: int64

tafe_survey_updated:

Contributing Factors. Dissatisfaction

Contributing Factors. Job Dissatisfaction

In [112]:
tafe_survey_updated[['Contributing Factors. Dissatisfaction','Contributing Factors. Job Dissatisfaction']]

Unnamed: 0,Contributing Factors. Dissatisfaction,Contributing Factors. Job Dissatisfaction
0,,
1,-,-
2,-,-
3,-,-
4,-,-
...,...,...
697,-,-
698,-,-
699,-,-
700,,


In [121]:
print(tafe_survey_updated['Contributing Factors. Dissatisfaction'].head(25))

0                                        NaN
1                                          -
2                                          -
3                                          -
4                                          -
5                                          -
6                                          -
7                                          -
8                                          -
9                                          -
10                                         -
11                                       NaN
12                                       NaN
13                                         -
14    Contributing Factors. Dissatisfaction 
15                                         -
16                                       NaN
17                                         -
18                                       NaN
19                                         -
20    Contributing Factors. Dissatisfaction 
21                                         -
22        

In [128]:
def update_vals(value):
    if value=='-': 
        return False
    elif value=='NaN':
        return np.nan
    else:
        return True
    
tafe_survey_updated['dissatisfied'] = tafe_survey_updated[['Contributing Factors. Dissatisfaction','Contributing Factors. Job Dissatisfaction']].applymap(update_vals).any(axis = 1, skipna = False)

In [129]:
tafe_survey_updated['dissatisfied']

0       True
1      False
2      False
3      False
4      False
       ...  
697    False
698    False
699    False
700     True
701    False
Name: dissatisfied, Length: 702, dtype: bool

In [132]:
tafe_survey_up = tafe_survey_updated.copy()
dete_survey_up = dete_survey_updated.copy()

In [133]:
tafe_survey_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702 entries, 0 to 701
Data columns (total 24 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   id                                                   702 non-null    float64
 1   Institute                                            702 non-null    object 
 2   WorkArea                                             702 non-null    object 
 3   cease_date                                           695 non-null    float64
 4   separationtype                                       701 non-null    object 
 5   Contributing Factors. Career Move - Public Sector    437 non-null    object 
 6   Contributing Factors. Career Move - Private Sector   437 non-null    object 
 7   Contributing Factors. Career Move - Self-employment  437 non-null    object 
 8   Contributing Factors. Ill Health                     437 non-null    o

## COMBINED

In [171]:
tafe_survey_up['institute'] = 'TAFE'
dete_survey_up['institute'] = 'DETE'

# combined = merge()
tafe_survey_up.columns = tafe_survey_up.columns.str.lower()

dete_survey_up.iloc[:,:] = dete_survey_up.dropna(thresh = 500, axis = 1)
tafe_survey_up.iloc[:,:] = tafe_survey_up.dropna(thresh = 500, axis = 1)

In [175]:
tafe_survey_up = tafe_survey_up.drop(tafe_survey_up.columns[7], axis = 1)
tafe_survey_up

Unnamed: 0,id,institute,workarea,cease_date,separationtype,gender,age,dissatisfied,institute.1
0,6.341330e+17,TAFE,Non-Delivery (corporate),2010.0,Contract Expired,Female,26 30,True,TAFE
1,6.341337e+17,TAFE,Non-Delivery (corporate),2010.0,Retirement,,,False,TAFE
2,6.341388e+17,TAFE,Delivery (teaching),2010.0,Retirement,,,False,TAFE
3,6.341399e+17,TAFE,Non-Delivery (corporate),2010.0,Resignation,,,False,TAFE
4,6.341466e+17,TAFE,Delivery (teaching),2010.0,Resignation,Male,41 45,False,TAFE
...,...,...,...,...,...,...,...,...,...
697,6.350668e+17,TAFE,Delivery (teaching),2013.0,Resignation,Male,51-55,False,TAFE
698,6.350677e+17,TAFE,Non-Delivery (corporate),2013.0,Resignation,,,False,TAFE
699,6.350704e+17,TAFE,Delivery (teaching),2013.0,Resignation,Female,51-55,False,TAFE
700,6.350712e+17,TAFE,Non-Delivery (corporate),2013.0,Contract Expired,Female,41 45,True,TAFE


In [173]:
combined = pd.concat([tafe_survey_up,dete_survey_up], ignore_index = True)
combined.head()

ValueError: Plan shapes are not aligned