### Importing libraries

In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx 

from apyori import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori #fpmax, fpgrowth

warnings.filterwarnings('ignore')

%matplotlib inline

### Reading input file

In [None]:
# Load the employee skills/traits CSV that sits in the current working directory.
input_file_name = 'Employee_skills_traits.csv'
working_directory = os.getcwd()
path_to_input_file = os.path.join(working_directory, input_file_name)

employee_skills_df = pd.read_csv(path_to_input_file)
employee_skills_df

### Removing whitespaces from column names

In [None]:
# Trim leading/trailing whitespace from every column label so later cells can
# reference columns by their clean names.
stripped_column_names = employee_skills_df.columns.str.strip()
employee_skills_df.columns = stripped_column_names
employee_skills_df.head()

# Data Preprocessing

### Dropping duplicate records, if any
##### Note: The dataset doesn't really have any duplicate records. Only the ID appears to be duplicated; all the other attributes differ even for rows that share an ID, so I don't think we can simply drop those records.


In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
#
# NOTE: only rows that are identical in every column count as duplicates here.
# The ID column on its own repeats across rows whose other attributes differ,
# so de-duplicating on ID alone would discard genuinely distinct records.
#
# keep='first' retains one copy of each duplicated row; keep=False would
# delete every copy, including the original record.
rows_before = len(employee_skills_df)
employee_skills_df.drop_duplicates(keep='first', inplace=True)
print(f"Dropped {rows_before - len(employee_skills_df)} duplicate records")
employee_skills_df.shape

### Understanding correlation between data points

In [None]:
# Pairwise correlation table between the columns of the data frame; the bare
# name on the last line displays the table as the cell output.
# NOTE(review): newer pandas releases may raise here if any column is
# non-numeric -- confirm every column in the CSV is numeric.
correlation = employee_skills_df.corr()
correlation

### Removing columns which are not relevant to finding associations
#### Studying the correlation table, we see that ID and Gender have negative or negligible correlations with most of the other attributes and can be considered irrelevant to finding associations. It is wise to drop them before applying the algorithm.

In [None]:
# ID and Gender are excluded from association mining; remove them in place.
irrelevant_columns = ['ID', 'Gender']
employee_skills_df.drop(columns=irrelevant_columns, inplace=True)
employee_skills_df

# Data Conversion/Normalization
#### Here we will convert and normalize numeric attributes such as Employment Period, Age and Time in current department to categorical variables

In [None]:
# Normalizing employment period values through equal width binning

# Print the value range so the bin edges below can be checked against the data
# (a bare expression in the middle of a cell displays nothing).
print(employee_skills_df['Employment period'].describe())

employment_period_bin_label = ['0-5', '6-10', '11-15', '16-20']
cut_bins_employment_period = [0, 5, 10, 15, 20]
# pd.cut bins are right-closed by default, so without include_lowest=True a
# value of exactly 0 falls outside the first bin and becomes NaN. Closing the
# first bin on the left makes the '0-5' label accurate.
employee_skills_df['Employment period'] = pd.cut(
    employee_skills_df['Employment period'],
    bins=cut_bins_employment_period,
    labels=employment_period_bin_label,
    include_lowest=True,
)
employee_skills_df.head()


In [None]:
# Normalizing age values through equal width binning

# Print the value range so the bin edges below can be checked against the data
# (a bare expression in the middle of a cell displays nothing).
print(employee_skills_df['Age'].describe())

age_bin_label = ['20-30', '31-40', '41-50', '51-60']
cut_bins_age = [20, 30, 40, 50, 60]
# pd.cut bins are right-closed by default, so without include_lowest=True an
# age of exactly 20 falls outside the first bin and becomes NaN. Closing the
# first bin on the left makes the '20-30' label accurate.
employee_skills_df['Age'] = pd.cut(
    employee_skills_df['Age'],
    bins=cut_bins_age,
    labels=age_bin_label,
    include_lowest=True,
)
employee_skills_df.head()


In [None]:
# Normalizing employee's time in current department values through equal width binning

# Print the value range so the bin edges below can be checked against the data
# (a bare expression in the middle of a cell displays nothing).
print(employee_skills_df['Time in current department'].describe())

current_department_bin_label = ['0-3', '4-6', '7-9', '10-12']
cut_bins_curr_dept = [0, 3, 6, 9, 12]
# pd.cut bins are right-closed by default, so without include_lowest=True a
# value of exactly 0 falls outside the first bin and becomes NaN. Closing the
# first bin on the left makes the '0-3' label accurate.
employee_skills_df['Time in current department'] = pd.cut(
    employee_skills_df['Time in current department'],
    bins=cut_bins_curr_dept,
    labels=current_department_bin_label,
    include_lowest=True,
)
employee_skills_df.head()


### Converting categorical variables to a series of ones and zeros for quantification and comparison

In [None]:
# One-hot encode the three categorical columns so that every category becomes
# its own indicator column.
categorical_columns = ['Employment period', 'Age', 'Time in current department']
employee_skills_df = pd.get_dummies(employee_skills_df, columns=categorical_columns)
employee_skills_df.head()

# Data Mining

### Finding frequent itemsets from the dataset. We start with a support value of 0.5 and stop when we have enough frequent itemsets to extract rules.

In [None]:
# Sweep the minimum support downwards and print the frequent itemsets found at
# each level. `apriori` is the mlxtend implementation (its import comes last
# and therefore shadows the apyori one).
# After the loop, `frequent_itemsets` holds the result for the last (lowest)
# support value in the list.
support_list = [0.5, 0.4, 0.3, 0.2, 0.1]

for support in support_list:
    frequent_itemsets = apriori(
        employee_skills_df,
        min_support=support,
        use_colnames=True,
    )
    # Record how many items each itemset contains.
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
    print(
        f"Minimum support is {support*100}%",
        frequent_itemsets,
        "-----------------------------------------------------------------------------",
        sep="\n",
    )


### Based on the observations above, we choose the frequent itemsets by keeping the min support as 0.10 as we get enough itemsets to extract meaningful rules

In [None]:
# Derive association rules from the frequent itemsets left over from the
# support sweep, keeping only rules whose lift is at least 1.
rule_metric = "lift"
rule_min_threshold = 1
rules = association_rules(
    frequent_itemsets,
    metric=rule_metric,
    min_threshold=rule_min_threshold,
)
rules.head()

In [None]:
# Report how many rules were generated (one row per rule).
print(f"Total rules generated are {len(rules)}")

### Filtering rules with lift >= 1, confidence >= 0.55 and support >= 0.15

In [None]:
# Keep only rules that satisfy all three thresholds, then order them by
# confidence, strongest first.
meets_thresholds = (
    (rules['confidence'] >= 0.55)
    & (rules['lift'] >= 1)
    & (rules['support'] >= 0.15)
)
# Sorting via a chained call returns a new frame; an in-place sort on a
# filtered slice triggers pandas' SettingWithCopyWarning.
rules = rules[meets_thresholds].sort_values(by='confidence', ascending=False)
rules

# Data Visualization

### Printing rules in the format [antecedents] ====> [consequents]

In [None]:
# Walk the rules in their current order and print each one as
# "[antecedent items] ====> [consequent items]".
for antecedent_items, consequent_items in zip(rules['antecedents'], rules['consequents']):
    left_side = list(antecedent_items)
    right_side = list(consequent_items)
    print(f"{left_side} ====> {right_side}")

### Scatter plot to look at support and confidence values of selected rules

In [None]:
# Scatter plot of support (x) against confidence (y) for the selected rules.
# DataFrame.as_matrix() was removed in pandas 1.0; Series.to_numpy() is the
# supported way to obtain the underlying values.
support = rules['support'].to_numpy()
confidence = rules['confidence'].to_numpy()
plt.scatter(support, confidence, alpha=0.5, marker="*")
plt.xlabel('support')
plt.ylabel('confidence')
plt.show()

### Using a directed graph to look at associations between the rules.

- Yellow dots represent the rules
- Green dots show antecedents and consequents
- Incoming arrows into a rule show the antecedents
- Outgoing arrows from a rule show the consequents

In [None]:
# Draw the top rules as a directed graph: each rule becomes a node "R<i>",
# with edges antecedent -> rule and rule -> consequent.

# Show at most 8 rules, but never index past the rules that actually survived
# filtering (rules.iloc[i] would raise IndexError otherwise).
rules_to_show = min(8, len(rules))

G1 = nx.DiGraph()

# One random numeric colour value per rule, shared by all edges of that rule.
colors = np.random.rand(rules_to_show)
# Rule node names are derived from rules_to_show so the yellow/green colouring
# below stays in sync with the number of rules drawn.
rule_nodes = [f"R{i}" for i in range(rules_to_show)]

for i, rule_node in enumerate(rule_nodes):
    rule = rules.iloc[i]
    G1.add_node(rule_node)

    # add_edge creates any missing endpoint node automatically.
    for a in rule['antecedents']:
        G1.add_edge(a, rule_node, color=colors[i], weight=2)

    for c in rule['consequents']:
        G1.add_edge(rule_node, c, color=colors[i], weight=2)

# Rule nodes are yellow; item (antecedent/consequent) nodes are green.
rule_node_set = set(rule_nodes)
color_map = ['yellow' if node in rule_node_set else 'green' for node in G1]

# Materialise the edge list once so colours and widths line up with the edges
# that are passed to the drawing call.
edges = list(G1.edges())
colors = [G1[u][v]['color'] for u, v in edges]
weights = [G1[u][v]['weight'] for u, v in edges]

pos = nx.spring_layout(G1, k=30, scale=1)
# The networkx keyword for restricting/ordering drawn edges is `edgelist`;
# `edges` is not a recognised argument.
nx.draw(G1, pos, edgelist=edges, node_color=color_map, edge_color=colors, width=weights, font_size=16, with_labels=False)

for p in pos:  # raise text positions so labels sit above their nodes
    pos[p][1] += 0.07
nx.draw_networkx_labels(G1, pos)
plt.show()