### Importing libraries

In [39]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from apyori import apriori

%matplotlib inline

### Reading input file

In [40]:
# Build the CSV path relative to the current working directory and load it.
input_file_name = 'Employee_skills_traits.csv'
path_to_input_file = os.path.join(os.getcwd(), input_file_name)
employee_skills_df = pd.read_csv(filepath_or_buffer=path_to_input_file)
# Bare last expression: renders the loaded frame as the cell output.
employee_skills_df

Unnamed: 0,ID,Employment period,Time in current department,Gender,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
0,6723,5,4,0,1,48,1,1,1,0,1,1,0,1
1,8923,5,3,1,1,26,1,1,0,0,1,1,0,0
2,2322,11,8,0,0,34,0,0,1,1,0,1,0,1
3,235,7,5,1,1,27,1,0,1,0,1,1,1,0
4,9523,18,8,0,0,38,0,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,6098,12,3,1,1,32,1,0,0,0,1,1,0,1
994,12133,16,8,1,0,36,1,1,1,0,0,0,1,1
995,854,11,4,0,1,34,1,0,1,0,1,1,0,0
996,13444,8,8,1,0,36,0,0,1,0,0,0,1,1


### Removing leading and trailing whitespace from column names

In [41]:
# Trim leading/trailing whitespace from every column label so later
# look-ups by name (e.g. 'Employment period') match exactly.
employee_skills_df.columns = employee_skills_df.columns.map(str.strip)
employee_skills_df.head(5)

Unnamed: 0,ID,Employment period,Time in current department,Gender,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
0,6723,5,4,0,1,48,1,1,1,0,1,1,0,1
1,8923,5,3,1,1,26,1,1,0,0,1,1,0,0
2,2322,11,8,0,0,34,0,0,1,1,0,1,0,1
3,235,7,5,1,1,27,1,0,1,0,1,1,1,0
4,9523,18,8,0,0,38,0,0,0,1,1,1,1,1


# Data Preprocessing

### Dropping duplicate records, if any

In [42]:
# Every row that has at least one exact copy (all copies included); kept in
# its own variable so the duplicates can be inspected.
duplicate_records = employee_skills_df[employee_skills_df.duplicated(keep=False)]

# keep='first' retains one representative of each duplicated record.
# keep=False would remove *all* copies, silently deleting those records from
# the data set instead of de-duplicating them.
# drop_duplicates is a no-op when there are no duplicates, so no guard is needed.
employee_skills_df = employee_skills_df.drop_duplicates(keep='first')

# (rows, columns) after de-duplication, shown as the cell output.
employee_skills_df.shape

(998, 14)

### Understanding correlation between data points

In [43]:
# Pairwise Pearson correlation (the pandas default, spelled out here)
# between all columns of the frame.
correlation = employee_skills_df.corr(method='pearson')
# Bare last expression: renders the correlation matrix as the cell output.
correlation

Unnamed: 0,ID,Employment period,Time in current department,Gender,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
ID,1.0,0.021231,0.045212,-0.002563,0.019724,0.058252,-0.003988,0.033203,0.027852,0.009756,0.001788,0.00747,-0.00904,0.006633
Employment period,0.021231,1.0,0.495719,0.033907,-0.048001,0.164389,-0.042311,-0.086372,-0.041851,0.050546,0.02932,0.031121,0.016359,-0.080846
Time in current department,0.045212,0.495719,1.0,0.009666,-0.005733,0.076987,-0.065801,-0.051051,-0.048757,0.005154,0.025756,0.049245,-0.006786,-0.017179
Gender,-0.002563,0.033907,0.009666,1.0,0.024301,0.010982,-0.008624,-0.027479,-0.045607,0.000482,0.062893,-0.053953,0.02418,-0.02482
Team leader,0.019724,-0.048001,-0.005733,0.024301,1.0,-0.010266,-0.037924,0.024444,0.002392,-0.034202,0.00236,0.015985,-0.002036,0.107978
Age,0.058252,0.164389,0.076987,0.010982,-0.010266,1.0,-0.052271,-0.002076,0.001022,-0.008468,0.012754,0.030642,0.022791,-0.007664
Member of professional organizations,-0.003988,-0.042311,-0.065801,-0.008624,-0.037924,-0.052271,1.0,-0.004938,0.009069,-0.017721,-0.071107,-0.019924,0.010102,0.008869
.Net,0.033203,-0.086372,-0.051051,-0.027479,0.024444,-0.002076,-0.004938,1.0,0.037948,0.008765,0.00198,-0.001729,-0.019877,-0.05227
SQL Server,0.027852,-0.041851,-0.048757,-0.045607,0.002392,0.001022,0.009069,0.037948,1.0,0.026858,-0.052332,0.000289,0.034301,-0.070264
HTML CSS Java Script,0.009756,0.050546,0.005154,0.000482,-0.034202,-0.008468,-0.017721,0.008765,0.026858,1.0,0.002714,-9.6e-05,0.005949,0.015373


### Removing columns which are not relevant to finding associations
#### Studying the correlation table, we see that ID and Gender have negative or negligible correlations with most of the other attributes and can be considered irrelevant to finding associations, so it is wise to drop them before applying the algorithm.

In [44]:
# Columns excluded from the association analysis.
irrelevant_columns = ['ID', 'Gender']
employee_skills_df = employee_skills_df.drop(columns=irrelevant_columns)
# Bare last expression: renders the reduced frame as the cell output.
employee_skills_df


Unnamed: 0,Employment period,Time in current department,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
0,5,4,1,48,1,1,1,0,1,1,0,1
1,5,3,1,26,1,1,0,0,1,1,0,0
2,11,8,0,34,0,0,1,1,0,1,0,1
3,7,5,1,27,1,0,1,0,1,1,1,0
4,18,8,0,38,0,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
993,12,3,1,32,1,0,0,0,1,1,0,1
994,16,8,0,36,1,1,1,0,0,0,1,1
995,11,4,1,34,1,0,1,0,1,1,0,0
996,8,8,0,36,0,0,1,0,0,0,1,1


# Data Conversion/Normalization
#### Here we will convert and normalize numeric attributes such as Employment Period, Age and Time in current department to categorical variables

In [45]:
# Discretizing employment period values through equal-width binning.
#
# pd.cut builds right-closed intervals and, by default, excludes the first
# edge: with edges [0, 5, ...] the first bin is (0, 5], so a value of exactly 0
# would become NaN even though the '0-5' label says it is covered.
# include_lowest=True closes the first bin on the left ([0, 5]).
# Values above the last edge (20) still fall outside every bin and become NaN.

employment_period_bin_label = ['0-5', '6-10', '11-15', '16-20']
cut_bins_employment_period = [0, 5, 10, 15, 20]
employee_skills_df['Employment period'] = pd.cut(
    employee_skills_df['Employment period'],
    bins=cut_bins_employment_period,
    labels=employment_period_bin_label,
    include_lowest=True,
)
employee_skills_df.head()


Unnamed: 0,Employment period,Time in current department,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
0,0-5,4,1,48,1,1,1,0,1,1,0,1
1,0-5,3,1,26,1,1,0,0,1,1,0,0
2,11-15,8,0,34,0,0,1,1,0,1,0,1
3,6-10,5,1,27,1,0,1,0,1,1,1,0
4,16-20,8,0,38,0,0,0,1,1,1,1,1


In [46]:
# Discretizing age values through equal-width binning.
#
# pd.cut builds right-closed intervals and, by default, excludes the first
# edge: with edges [20, 30, ...] the first bin is (20, 30], so an age of exactly
# 20 would become NaN even though the '20-30' label says it is covered.
# include_lowest=True closes the first bin on the left ([20, 30]).
# Ages below 20 or above 60 still fall outside every bin and become NaN.

age_bin_label = ['20-30', '31-40', '41-50', '51-60']
cut_bins_age = [20, 30, 40, 50, 60]
employee_skills_df['Age'] = pd.cut(
    employee_skills_df['Age'],
    bins=cut_bins_age,
    labels=age_bin_label,
    include_lowest=True,
)
employee_skills_df.head()


Unnamed: 0,Employment period,Time in current department,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
0,0-5,4,1,41-50,1,1,1,0,1,1,0,1
1,0-5,3,1,20-30,1,1,0,0,1,1,0,0
2,11-15,8,0,31-40,0,0,1,1,0,1,0,1
3,6-10,5,1,20-30,1,0,1,0,1,1,1,0
4,16-20,8,0,31-40,0,0,0,1,1,1,1,1


In [47]:
# Discretizing the employee's time in current department through equal-width binning.
#
# pd.cut builds right-closed intervals and, by default, excludes the first
# edge: with edges [0, 3, ...] the first bin is (0, 3], so a value of exactly 0
# would become NaN even though the '0-3' label says it is covered.
# include_lowest=True closes the first bin on the left ([0, 3]).
# Values above the last edge (12) still fall outside every bin and become NaN.

current_department_bin_label = ['0-3', '4-6', '7-9', '10-12']
cut_bins_curr_dept = [0, 3, 6, 9, 12]
employee_skills_df['Time in current department'] = pd.cut(
    employee_skills_df['Time in current department'],
    bins=cut_bins_curr_dept,
    labels=current_department_bin_label,
    include_lowest=True,
)
employee_skills_df.head()


Unnamed: 0,Employment period,Time in current department,Team leader,Age,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative
0,0-5,4-6,1,41-50,1,1,1,0,1,1,0,1
1,0-5,0-3,1,20-30,1,1,0,0,1,1,0,0
2,11-15,7-9,0,31-40,0,0,1,1,0,1,0,1
3,6-10,4-6,1,20-30,1,0,1,0,1,1,1,0
4,16-20,7-9,0,31-40,0,0,0,1,1,1,1,1


### Converting categorical variables to series of ones and zeros for quantification and comparison

In [48]:
# One-hot encode the three binned (categorical) columns; each bin label becomes
# its own '<column>_<label>' indicator column.
binned_columns = ['Employment period', 'Age', 'Time in current department']

# dtype=int pins the indicators to 0/1 integers, matching the other binary
# skill/trait columns. Without it, pandas >= 2.0 emits True/False (bool)
# columns, while older versions emit uint8.
employee_skills_df = pd.get_dummies(employee_skills_df, columns=binned_columns, dtype=int)
employee_skills_df.head()

Unnamed: 0,Team leader,Member of professional organizations,.Net,SQL Server,HTML CSS Java Script,PHP mySQL,Fast working,Awards,Communicative,Employment period_0-5,...,Employment period_11-15,Employment period_16-20,Age_20-30,Age_31-40,Age_41-50,Age_51-60,Time in current department_0-3,Time in current department_4-6,Time in current department_7-9,Time in current department_10-12
0,1,1,1,1,0,1,1,0,1,1,...,0,0,0,0,1,0,0,1,0,0
1,1,1,1,0,0,1,1,0,0,1,...,0,0,1,0,0,0,1,0,0,0
2,0,0,0,1,1,0,1,0,1,0,...,1,0,0,1,0,0,0,0,1,0
3,1,1,0,1,0,1,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,1,1,1,1,1,0,...,0,1,0,1,0,0,0,0,1,0
