### Shelmith N. Kariuki
### 9th November, 2018
### dplython 💃 💃 💃 🙌 🙌 🙌

This script was motivated by my fondness for dplyr in R. A big shout-out to the author of the dplython package, Chris Riederererer (@yeahtwoERs) 👏 👏 👏 👏.

Something more interesting is that it combines dplyr and tidyr functions into one package. I can spread and gather as I always do in R. Pretty Awesome!!!!


In [1]:
#!pip install git+https://github.com/dodger487/dplython.git

In [2]:
import numpy as np
import pandas as pd
from dplython import (DplyFrame, X,select, sift,
  sample_n, sample_frac, head, arrange, mutate, group_by,if_else,
  summarize, DelayFunction,gather,spread)
##DplyFrame: Converts a dataframe to a form that is workable with dplyr-style functions
##select: Used for selecting the columns (variables) that we need
##sift: Used for filtering observations based on a certain condition
##arrange: Used for sorting/ordering observations.
##mutate: Used for generating new variables.
##group_by: Used for grouping observations. 
##summarize: Important for summary statistics

In [3]:
## Load the wafanyikazi dataset, leave out its first column (we do not need it),
## and wrap the result in a DplyFrame so that the dplython functions can be used on it
wf_csv_path = "/Users/shelmith/Documents/Personal Development/R/wafanyikazi.csv"
wf_df = DplyFrame(pd.read_csv(wf_csv_path).iloc[:, 1:])

## Preview the first five rows
wf_df.head(5)


Unnamed: 0,Sid,Gender,Age,Department,Role,Income,Marital_Status,County,Leave_Days,Promotion
0,10715,Male,31,Finance,Mid,5991,Single,Kisumu,11,No
1,17041,Male,48,Research Analyst,Junior,3387,Divorced,Wajir,8,Yes
2,16232,Male,35,Operations,Junior,3170,Married,Mombasa,0,No
3,19576,Female,41,Finance,Senior,5557,Married,Nyeri,8,No
4,13463,Female,43,Associate,Junior,1651,Married,Nairobi,2,Yes


In [4]:
## Keep only the demographic variables: Gender, Age and Marital_Status
demographic_columns = (X.Gender, X.Age, X.Marital_Status)
wf_demo = wf_df >> select(*demographic_columns)
wf_demo.head()


Unnamed: 0,Gender,Age,Marital_Status
0,Male,31,Single
1,Male,48,Divorced
2,Male,35,Married
3,Female,41,Married
4,Female,43,Married


In [5]:
## Members of the Data department who have already been promoted
in_data_team = X.Department == "Data"
already_promoted = X.Promotion == "Yes"
wf_data = wf_df >> sift(in_data_team & already_promoted)
wf_data.head(n=5)

Unnamed: 0,Sid,Gender,Age,Department,Role,Income,Marital_Status,County,Leave_Days,Promotion
6,10784,Male,45,Data,Mid,5340,Single,Nairobi,12,Yes
21,13823,Male,31,Data,Mid,8378,Divorced,Wajir,10,Yes
26,11251,Female,46,Data,Mid,5340,Single,Kisumu,15,Yes
36,15961,Female,25,Data,Senior,3604,Divorced,Wajir,7,Yes
55,15876,Female,20,Data,Junior,1434,Divorced,Laikipia,17,Yes


In [6]:
## How many single ladies come from Nyeri?
## Filter them out of the dataset and preview the first five matches.
is_female = X.Gender == "Female"
is_single = X.Marital_Status == "Single"
from_nyeri = X.County == "Nyeri"
wf_sl = wf_df >> sift(is_female & is_single & from_nyeri)
wf_sl.head(n=5)

Unnamed: 0,Sid,Gender,Age,Department,Role,Income,Marital_Status,County,Leave_Days,Promotion
33,18997,Female,32,Associate,Senior,5340,Single,Nyeri,14,No
71,11450,Female,24,Research Analyst,Mid,7510,Single,Nyeri,13,No
138,19243,Female,32,Data,Senior,8161,Single,Nyeri,11,No
142,19558,Female,40,Research Analyst,Senior,7293,Single,Nyeri,16,No
226,14877,Female,47,Finance,Senior,8595,Single,Nyeri,3,No


In [7]:
## Add a variable called Jinsia: the Gender variable translated to Swahili
## ("Female" becomes "Mke", every other value becomes "Mme")
## np.where is wrapped with DelayFunction so it can be called on X expressions
where = DelayFunction(np.where)
jinsia_swahili = where(X.Gender == "Female", "Mke", "Mme")
wf_df2 = wf_df >> mutate(Jinsia=jinsia_swahili)
wf_df2.head(5)


Unnamed: 0,Sid,Gender,Age,Department,Role,Income,Marital_Status,County,Leave_Days,Promotion,Jinsia
0,10715,Male,31,Finance,Mid,5991,Single,Kisumu,11,No,Mme
1,17041,Male,48,Research Analyst,Junior,3387,Divorced,Wajir,8,Yes,Mme
2,16232,Male,35,Operations,Junior,3170,Married,Mombasa,0,No,Mme
3,19576,Female,41,Finance,Senior,5557,Married,Nyeri,8,No,Mke
4,13463,Female,43,Associate,Junior,1651,Married,Nairobi,2,Yes,Mke


In [8]:
## Mean income for each Gender
income_by_gender = wf_df >> group_by(X.Gender)
avg_income_df = income_by_gender >> summarize(avg_income=X.Income.mean())
avg_income_df

Unnamed: 0,Gender,avg_income
0,Female,5371.130802
1,Male,5613.931559


In [9]:
## Mean income for every Gender/Department pair, ordered by Department
gender_department_groups = wf_df >> group_by(X.Gender, X.Department)
avg_income_df2 = (
    gender_department_groups
    >> summarize(avg_income=X.Income.mean())
    >> arrange(X.Department)
)

## Renumber the rows after sorting; reset_index() keeps the previous
## row labels in a column named "index"
avg_income_df2 = avg_income_df2.reset_index()
avg_income_df2

Unnamed: 0,index,Gender,Department,avg_income
0,0,Female,Associate,5345.046512
1,5,Male,Associate,5071.941176
2,1,Female,Data,5613.42
3,6,Male,Data,5270.396226
4,2,Female,Finance,5574.714286
5,7,Male,Finance,5936.75
6,3,Female,Operations,5043.285714
7,8,Male,Operations,6284.854167
8,4,Female,Research Analyst,5264.521739
9,9,Male,Research Analyst,5533.327273


In [10]:
## Are there more males than females in the dataset? What is the % gender gap?
## Count the rows per Gender, then express each count as a percentage of the
## total (rounded to a whole number) and keep only Gender and that percentage.
gender_counts = (
    wf_df
    >> group_by(X.Gender)
    >> summarize(gender_count=X.Gender.count())
)
share_of_total = ((X.gender_count / X.gender_count.sum()) * 100).round(0)
gender_gap = (
    gender_counts
    >> mutate(gender_perc=share_of_total)
    >> select(X.Gender, X.gender_perc)
)
gender_gap

Unnamed: 0,Gender,gender_perc
0,Female,47.0
1,Male,53.0


In [11]:
## Now we need to reshape this data, so that Female and Male become variables (columns),
## and the gender_perc values sit under them.
# gender_gap2 =(gender_gap >>
#               spread(X.Gender, X.gender_perc))
# gender_gap2

##Waiting for Chris Riederererer to help me debug this issue.