# PLOT FUNCTIONS

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import sklearn as sklearn
from scipy import stats
import xgboost as xgb

In [3]:
# Colors to use
# for one variable with two / many categories: blue ='#3B89F3'
# for two categories: red ='#E93356'// blue ='#3B89F3'
# for multiple categories: red = '#E93356' // blue = '#3B89F3' // yellow = '#ffdd00' // green = '#94c11c' // orange = '#f39100' // petrol = '#009bb4' // light green = '#C6D59F' // dark orange = '#BF7300'

In [4]:
# Read dataset
# Each CSV is loaded into its own module-level DataFrame. `data` is only a
# scratch variable holding the path of the file being read; it is overwritten
# before every read and ends up pointing at the last file.

# `train` is read by the table, basics, imbalance and count-plot functions.
data  = 'datasets/aug_train.csv'
train = pd.read_csv(data)

# `train_clean` is read by the frequency, proportion, pie, distribution and
# regression plot functions.
data = 'datasets/train_clean.csv'
train_clean = pd.read_csv(data)

# `train_model` is read by show_binary_data and the mean/median histograms.
data = 'datasets/train_model.csv'
train_model = pd.read_csv(data)

# `train_cat_graph` is read by cat_plot.
data = 'datasets/train_cat_graph.csv'
train_cat_graph = pd.read_csv(data)

### Print Version

In [5]:
import sys
def print_version():
    """Print the version of Python and of each main library imported by this notebook."""
    # (message template, version string) pairs, printed in this order.
    version_lines = (
        ("Python version: {}", sys.version),
        ("NumPy version: {}", np.__version__),
        ("pandas version: {}", pd.__version__),
        ("matplotlib version: {}", mpl.__version__),
        ("seaborn version: {}", sns.__version__),
        ("scikit-learn version: {}", sklearn.__version__),
        ("XGBoost version: {}", xgb.__version__),
    )
    for template, version in version_lines:
        print(template.format(version))

In [6]:
#print_version()

### Tables

In [7]:
def show_raw_data():
    """Display the first five rows of the raw training data (`train`)."""
    display(train.head(5))

In [8]:
def show_clean_data():
    """Display the first five rows of the cleaned training data (`train_clean`)."""
    display(train_clean.head(5))

In [9]:
def show_binary_data():
    """Display the first five rows of the modelling data (`train_model`)."""
    display(train_model.head(5))

### Basic data / data types / mean / median for variable

In [10]:
# Print the data types for each variable
#print("\nThese are the data types for each variable:")
#display(train.dtypes)

def show_basics(variable_name):
    """Print a short profile of one column of the raw ``train`` DataFrame.

    Printed / displayed, in order: the upper-cased column name, its dtype, the
    number of unique values, the value counts, the number and percentage of
    missing values, the most frequent value(s), and the percentage of
    non-missing rows that hold a most frequent value.

    Parameters
    ----------
    variable_name : str
        Name of a column of ``train``.

    Raises
    ------
    KeyError
        If ``variable_name`` is not a column of ``train``.
    """
    column = train[variable_name]
    n_missing = column.isna().sum()
    modes = column.mode()

    print("\n", variable_name.upper())
    print("\nDatatype", column.dtype)
    print("\nNumber of unique values:", column.nunique())
    display(column.value_counts())

    print("Occurrences of 'NaN':", n_missing)
    # Relate the missing values to the total number of rows, not to the
    # non-null count of an unrelated column ('enrollee_id').
    percent_nan = n_missing / len(train) * 100
    print("\nProportion of 'NaN':", percent_nan.round(2))
    print("\nMost frequent value:", modes)

    # Share of the non-missing rows that hold (one of) the most frequent value(s).
    prop = column.isin(modes).sum() / column.count() * 100
    print("\nProportion most frequent / all:", prop.round(2))
    print("____________________________________________________")

def show_all_basics():
    """Run ``show_basics`` for every listed column of the raw data, in a fixed order."""
    # A tuple rather than a set: iterating a set of strings can yield a different
    # order in every interpreter session, which made the report order unstable.
    variables = (
        'enrollee_id',
        'city',
        'city_development_index',
        'gender',
        'relevent_experience',
        'enrolled_university',
        'education_level',
        'major_discipline',
        'experience',
        'company_size',
        'company_type',
        'last_new_job',
        'training_hours',
        'target',
    )
    for variable in variables:
        show_basics(variable)

In [11]:
# Print the data types for each variable
#show_all_basics()

In [12]:
# Show standard deviation, mean and median for all cardinal variables (training_hours & city_development_index)
def name_median_mean_all_cardinal_data():
    """Print standard deviation, mean and median of the two cardinal columns of `train_clean`."""
    cardinal = train_clean[['city_development_index','training_hours']]
    summaries = (
        ("\nStandard deviation for cardinal scaled variables:\n", cardinal.std()),
        ("\nMean for cardinal scaled variables:\n", cardinal.mean()),
        ("\nMedian for cardinal scaled variables:\n", cardinal.median()),
    )
    for heading, values in summaries:
        print(heading, values)

In [13]:
# Show standard deviation, mean and median for all cardinal variables (training_hours & city_development_index)
#name_median_mean_all_cardinal_data()

### Overview of imbalanced variables

In [14]:
# Overview of imbalanced variables
# for highlighting the first highest bar: red '#E93356'

def _countplot_row(cols, title=None):
    """Draw one row of count plots of ``train``, one subplot per column in ``cols``.

    Bars are ordered by descending frequency. ``title``, when given, is used as
    the figure's suptitle. The figure is shown before returning.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(1, len(cols), figsize=(20, 5), squeeze=False)
    sns.despine(left=True, bottom=True)
    if title is not None:
        fig.suptitle(title)

    for ax, col in zip(axes[0], cols):
        # value_counts() already returns the categories in descending frequency.
        descending_order = train[col].value_counts().index
        # A single `color` paints every bar alike regardless of how many
        # categories the column has; the former fixed-length `palette` lists
        # had to be at least as long as the number of categories.
        sns.countplot(ax=ax, data=train, x=col, order=descending_order, color='#3B89F3')
        ax.set_title('Count of {}'.format(col))
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params('x', labelrotation=45)

    plt.show()


def plot_imbalance():
    """Show count plots of eight categorical columns of ``train`` in two rows of four.

    Only the first row carries the figure title 'Countplots of various variables'.
    """
    _countplot_row(
        ['gender', 'relevent_experience', 'enrolled_university', 'education_level'],
        title='Countplots of various variables',
    )
    _countplot_row(['major_discipline', 'experience', 'company_type', 'last_new_job'])

In [15]:
# Overview of imbalanced variables
#plot_imbalance()

### Plot: count

In [16]:
# Count plots standard

def countplot(variable_name):
    """Vertical bar chart of the ten most frequent values of the `train` column `variable_name`."""
    top_counts = train[variable_name].value_counts().head(10)
    top_counts = top_counts.sort_values(ascending=False)
    top_counts.plot(kind='bar', figsize=(5,5), color=['#3B89F3'])
    plt.xticks(rotation=45)
    sns.despine(left=True, bottom=True)

def h_countplot(variable_name):
    """Horizontal bar chart of the ten most frequent values of the `train` column `variable_name`."""
    top_counts = train[variable_name].value_counts().head(10)
    # Ascending sort, as barh draws the first row at the bottom.
    top_counts = top_counts.sort_values(ascending=True)
    top_counts.plot(kind='barh', figsize=(5,5), color=['#3B89F3'])
    sns.despine(left=True, bottom=True)

### Plot: frequency

In [17]:
# Frequency plots standard

def freq_candidate(variable_name):
    """Bar chart of candidates per category with the job changers overlaid.

    For the ten most frequent categories of ``train_clean[variable_name]`` a
    blue bar shows the number of rows in the category and a red bar, drawn on
    top, shows the number of those rows with ``target > 0``.

    Categories whose key is missing, or that contain no row with
    ``target > 0``, are not drawn (they are removed by ``dropna``).

    Side effect: the scratch columns 'freq', 'freq_candidate' and 'freq_target'
    are (re)written on the module-level ``train_clean`` frame.

    Parameters
    ----------
    variable_name : str
        Name of a column of ``train_clean``.
    """
    # Rows with target > 0 get their per-category count; other rows get NaN
    # because the assignment aligns on the index.
    positives = train_clean[train_clean['target'] > 0]
    train_clean['freq'] = positives.groupby([variable_name])[variable_name].transform('count')
    train_clean['freq_candidate'] = train_clean.groupby([variable_name])[variable_name].transform('count')
    # Spread the positive count to every row of the category.
    train_clean['freq_target'] = train_clean.groupby([variable_name])['freq'].transform('max')

    # One row per category. Built as a new frame (list indexer, no in-place
    # calls) so nothing is mutated through a slice of train_clean.
    freq_variable = (
        train_clean[[variable_name, 'freq_target', 'freq_candidate']]
        .drop_duplicates(keep='first')
        .dropna()
        .sort_values('freq_candidate', ascending=False)
        .head(10)
    )

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.bar(x=variable_name, height='freq_candidate', data=freq_variable,
           label="total candidates", color='#3B89F3')
    ax.bar(x=variable_name, height='freq_target', data=freq_variable,
           label="job change: yes", color="#E93356")

    ax.legend(loc="upper right")
    ax.set(xlabel=None, ylabel='frequency of "willing to change job"')
    ax.tick_params('x', labelrotation=45)
    sns.despine(left=True, bottom=True)
    

In [18]:
# Frequency plots non-standard

def _h_freq_candidate(variable_name):
    """Horizontal bar chart of candidates per category with job changers overlaid.

    Shared implementation of the ``h_freq_candidate_*`` functions. For the ten
    most frequent values of ``train_clean[variable_name]`` a blue bar shows the
    number of rows with that value and a red bar, drawn on top, shows the
    number of those rows with ``target > 0``.

    Values that are missing, or that have no row with ``target > 0``, are
    removed by ``dropna`` before plotting.

    Side effect: the scratch columns 'freq', 'freq_candidate' and 'freq_target'
    are (re)written on the module-level ``train_clean`` frame.
    """
    # Count the grouping column itself, so no other column (formerly 'city')
    # has to exist or be free of missing values.
    positives = train_clean[train_clean['target'] > 0]
    train_clean['freq'] = positives.groupby([variable_name])[variable_name].transform('count')
    train_clean['freq_candidate'] = train_clean.groupby([variable_name])[variable_name].transform('count')
    train_clean['freq_target'] = train_clean.groupby([variable_name])['freq'].transform('max')

    # One row per value, built as a new frame instead of mutating a slice.
    freq_variable = (
        train_clean[[variable_name, 'freq_target', 'freq_candidate']]
        .drop_duplicates(keep='first')
        .dropna()
    )
    # Bar order: the ten most frequent values, looked up through
    # `variable_name` rather than a hard-coded attribute.
    top_values = freq_variable.sort_values('freq_candidate', ascending=False)[variable_name].head(10)

    fig, ax = plt.subplots(figsize=(5, 5))

    # Plot the total candidates
    sns.barplot(x="freq_candidate", y=variable_name, data=freq_variable, orient='h',
                order=top_values, label="total candidates", color='#3B89F3', ax=ax)

    # Plot candidates that are willing to change job
    sns.barplot(x="freq_target", y=variable_name, data=freq_variable, orient='h',
                order=top_values, label="job change: yes", color="#E93356", ax=ax)

    # Adding a legend and informative axis label. The bars are horizontal, so
    # the frequency is read on the x axis and the y axis holds the categories.
    ax.legend(loc="lower right")
    ax.tick_params('x', labelrotation=45)
    ax.set(xlabel='frequency of "willing to change job"', ylabel='')
    sns.despine(left=True, bottom=True)


def h_freq_candidate_c(variable_name):
    """Horizontal frequency plot; used in this notebook with 'city'."""
    _h_freq_candidate(variable_name)


def h_freq_candidate_cdi(variable_name):
    """Horizontal frequency plot; used in this notebook with 'city_development_index'."""
    _h_freq_candidate(variable_name)


def h_freq_candidate_th(variable_name):
    """Horizontal frequency plot; used in this notebook with 'training_hours'."""
    _h_freq_candidate(variable_name)

### Plot: proportion

In [19]:
# Proportion plot standards

def prop_candidate(variable_name, width=5):
    """Bar chart of the mean of ``target`` per category of ``variable_name``.

    Categories appear in order of descending row count. The y axis is limited
    to 0-0.5 and formatted as a percentage.

    Side effect: the scratch columns 'freq' and 'freq_candidate' are
    (re)written on the module-level ``train_clean`` frame.

    Parameters
    ----------
    variable_name : str
        Name of a column of ``train_clean``.
    width : float, optional
        Figure width in inches (the height is fixed at 5). Defaults to 5, the
        value used by every call in this notebook.
    """
    fig, ax = plt.subplots(figsize=(width, 5))

    # 'freq' is not used by this plot; it is still written so the scratch
    # columns on train_clean stay the same as before.
    positives = train_clean[train_clean['target'] > 0]
    train_clean['freq'] = positives.groupby([variable_name])[variable_name].transform('count')
    train_clean['freq_candidate'] = train_clean.groupby([variable_name])[variable_name].transform('count')

    # sort_values returns a new frame, so no explicit copy is needed.
    prop_variable = train_clean.sort_values('freq_candidate', ascending=False)

    # NOTE(review): `ci=` is deprecated in recent seaborn releases in favour of
    # `errorbar=('ci', 95)`; kept as is because the installed version is unknown.
    sns.barplot(x=variable_name, y='target', data=prop_variable, color='#3B89F3', ci=95, ax=ax)
    ax.set_ylim(0, 0.5)
    ax.set(xlabel=None, ylabel='proportion of "willing to change job"')
    ax.tick_params('x', labelrotation=45)
    ax.yaxis.set_major_formatter(PercentFormatter(1.0))
    sns.despine(left=True, bottom=True)

In [20]:
# Combine frequency and proportion plots

def freq_and_prop_candidate(variable_name, width=5):
    """Draw the frequency plot and then the proportion plot for one variable.

    Parameters
    ----------
    variable_name : str
        Column name passed to ``freq_candidate`` and ``prop_candidate``.
    width : float, optional
        Figure width passed to ``prop_candidate``. Defaults to 5, the value
        used by the direct ``prop_candidate`` calls in this notebook.
    """
    freq_candidate(variable_name)
    # prop_candidate needs a width; calling it with the variable name only
    # raised a TypeError.
    prop_candidate(variable_name, width)

### Plot: distribution

In [21]:
# Displot for cardinal scale

def displot(variable_name):
    """Bivariate filled KDE plot of ``variable_name`` against ``target`` in ``train_clean``.

    Parameters
    ----------
    variable_name : str
        Name of a column of ``train_clean`` to put on the x axis.
    """
    grid = sns.displot(x=variable_name, y='target', data=train_clean,
                       kind='kde', color='#3B89F3', fill=True)
    grid.set(ylabel='distribution: willing to change job (0=no 1=yes)')
    # Remove the spines on the grid itself, after it exists. A bare
    # sns.despine() issued before displot acts on whatever figure is current,
    # not on the new one that displot creates.
    grid.despine(left=True, bottom=True)
    

In [22]:
# Descriptive statistic for training_hours

def displot_median_mean_th():
    """Distribution plot of `train_model.training_hours` with vertical lines at its mean and median."""
    hours = train_model['training_hours']
    sns.displot(hours, kde=False, color='#3B89F3')

    # (x position, line colour, label) for each reference line.
    reference_lines = (
        (hours.mean(), '#E93356', "mean"),
        (hours.median(), '#ffdd00', "median"),
    )
    for position, line_color, line_label in reference_lines:
        plt.axvline(x=position, linewidth=3, color=line_color, label=line_label)

    plt.ylabel("Count")
    plt.legend(["mean", "median"])
    plt.xticks(rotation=45)
    sns.despine(left=True, bottom=True)

In [23]:
# Descriptive statistic for city_development_index

def displot_median_mean_cdi():
    """Distribution plot of `train_model.city_development_index` with vertical lines at its mean and median."""
    index_values = train_model['city_development_index']
    sns.displot(index_values, kde=False, color='#3B89F3')

    # (x position, line colour, label) for each reference line.
    reference_lines = (
        (index_values.mean(), '#E93356', "mean"),
        (index_values.median(), '#ffdd00', "median"),
    )
    for position, line_color, line_label in reference_lines:
        plt.axvline(x=position, linewidth=3, color=line_color, label=line_label)

    plt.ylabel("Count")
    plt.legend(["mean", "median"])
    plt.xticks(rotation=45)
    sns.despine(left=True, bottom=True)

### Plot: pie

In [24]:
# Pie plot

def pie_plot(variable_name):
    """Pie chart of the `variable_name` value counts among `train_clean` rows with target == 1."""
    job_changers = train_clean[train_clean['target'] == 1]
    counts = job_changers[variable_name].value_counts()
    slice_colors = ('#3B89F3','#f39100','#009bb4', '#94c11c','#BF7300' ,'#ffdd00', '#C6D59F')

    fig, ax = plt.subplots(figsize=(8,8))
    plt.pie(x=counts, labels=counts.keys(), autopct="%.1f%%", pctdistance=0.9, colors=slice_colors)
    plt.title('')

In [25]:
# Pie plot for group "company size = no info"

def pie_plot_cs_no_info(variable_name):
    """Pie chart of the `variable_name` value counts among `train_clean` rows with company_size == "no info"."""
    no_size_info = train_clean[train_clean["company_size"] == "no info"]
    counts = no_size_info[variable_name].value_counts()
    slice_colors = ('#3B89F3','#f39100','#009bb4', '#94c11c','#BF7300' ,'#ffdd00', '#C6D59F')

    fig, ax = plt.subplots(figsize=(8,8))
    plt.pie(x=counts, labels=counts.keys(), autopct="%.2f%%", pctdistance=0.9, colors=slice_colors)
    plt.title('')

In [26]:
# Pie plot for group "last new job = never"

def pie_plot_lnj_zero(variable_name):
    """Pie chart of the `variable_name` value counts among `train_clean` rows with last_new_job == "never"."""
    never_changed = train_clean[train_clean["last_new_job"] == "never"]
    counts = never_changed[variable_name].value_counts()
    slice_colors = ('#3B89F3','#f39100','#009bb4', '#94c11c','#BF7300' ,'#ffdd00', '#C6D59F')

    fig, ax = plt.subplots(figsize=(8,8))
    plt.pie(x=counts, labels=counts.keys(), autopct="%.2f%%", pctdistance=0.9, colors=slice_colors)
    plt.title('')

In [27]:
#pie_plot_lnj_zero('experience')

### Plot: regression (for cardinal variables)

In [28]:
# Regression plot for cardinal scale

def regplot(variable_name):
    """Logistic regression plot of ``target`` against ``variable_name`` in ``train_clean``.

    The scatter points are drawn in white with a small vertical jitter, and
    the fitted curve is drawn in red without a confidence band.

    Parameters
    ----------
    variable_name : str
        Name of a column of ``train_clean`` to put on the x axis.
    """
    grid = sns.lmplot(x=variable_name, y='target', data=train_clean,
                      scatter_kws={"color": "white"}, ci=None, y_jitter=.02,
                      logistic=True, truncate=False, line_kws={'color': 'red'})
    grid.set(ylabel='distribution: willing to change job')
    # Remove the spines on the grid itself, after it exists. A bare
    # sns.despine() issued before lmplot acts on whatever figure is current,
    # not on the new one that lmplot creates.
    grid.despine(left=True, bottom=True)
    

### Plot: category (for showing impact of City Dev Index over other variables )

In [29]:
# Category plot showing impact of City Dev Index over other variables ('Working_Experience' and 'DataScience_Experience')

def cat_plot(variable_name):
    """Grouped bar plot of `target` per 'City_Development_Index' of `train_cat_graph`, split by `variable_name`."""
    two_tone = ['#E93356','#3B89F3']
    grid = sns.catplot(
        data=train_cat_graph,
        kind='bar',
        x='City_Development_Index',
        y='target',
        hue=variable_name,
        palette=two_tone,
    )
    # Widen the figure to 12 inches.
    grid.fig.set_figwidth(12)

# Plots and tables based on functions

In [30]:
# Creates head of table of raw data 
#show_raw_data()

In [31]:
# Creates head of table of clean data 
#show_clean_data()

In [32]:
# Creates head of table of binary data 
#show_binary_data()

## Plot on target

In [33]:
#countplot('target')

## Plot on city development index

In [34]:
#h_freq_candidate_cdi('city_development_index')

In [42]:
#displot('city_development_index')

In [40]:
#displot_median_mean_cdi()

In [38]:
#regplot('city_development_index')

## Plot on company size

In [71]:
#freq_candidate('company_size')

In [72]:
#prop_candidate('company_size',5)

In [89]:
#pie_plot_cs_no_info('experience_group')

## Plot on company type

In [75]:
#freq_candidate('company_type')

In [135]:
#prop_candidate('company_type',5)

In [97]:
#cat_plot('Company Type')

## Plot on education level

In [64]:
#freq_candidate('education_level')

In [56]:
#prop_candidate('education_level',5)

In [95]:
#cat_plot('Highest Education')

## Plot on relevant experience

In [46]:
#freq_candidate('relevent_experience')

In [48]:
#prop_candidate('relevent_experience',5)

## Plot on last new job

In [79]:
#freq_candidate('last_new_job')

In [80]:
#prop_candidate('last_new_job',5)

## Plot on experience

In [66]:
#freq_candidate('experience')

In [68]:
#prop_candidate('experience_group',5)

## Plot on enrolled university

In [50]:
#freq_candidate('enrolled_university')

In [52]:
#prop_candidate('enrolled_university',5)

## Plot on others

In [42]:
#freq_candidate('gender')

In [44]:
#prop_candidate('gender',5)

In [36]:
#h_freq_candidate_c('city')

In [58]:
#pie_plot('major_discipline')

In [60]:
#freq_candidate('major_discipline')

In [62]:
#prop_candidate('major_discipline',5)

In [82]:
#h_freq_candidate_th('training_hours')

In [84]:
#displot('training_hours')

In [87]:
#displot_median_mean_th()

In [93]:
#regplot('training_hours')