In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scipy.stats as scs
from scipy import stats
from copy import deepcopy

# Data set

In [2]:
# Read the raw training data from disk into `train`.
# `data` holds the file path; the same name is reassigned to other paths in later cells.
data = 'datasets/aug_train.csv'
train= pd.read_csv(data)

In [3]:
def show_raw_data():
    """Display the first five rows of the raw `train` frame."""
    preview = train.head(n=5)
    display(preview)

In [4]:
# Uncomment the call below to display the first rows of the raw data
#show_raw_data()

### Cleaning data

In [5]:
# Start the cleaning stage from a copy, so the steps below operate on `train_clean` rather than on `train`.
train_clean = train.copy()

In [6]:
# Write the working frame to CSV (row index omitted).
path = 'datasets/train_clean.csv'
train_clean.to_csv(path, index = False)

In [7]:
# Read the CSV written above back into `train_clean`; read_csv infers the column dtypes from the file.
data = 'datasets/train_clean.csv'
train_clean= pd.read_csv(data)

In [8]:
# Prep 1:
# Remove the literal 'city_' prefix from the values of column 'city'.
# A prefix-anchored replacement is used because str.lstrip('city_') treats its
# argument as a set of characters to strip rather than as a prefix; the
# vectorised .str accessor also lets missing values pass through unchanged.
train_clean['city'] = train_clean['city'].str.replace(r'^city_', '', regex=True)

In [9]:
# Prep 2:
# Add 'cdi_round': city_development_index rounded to one decimal place.
train_clean['cdi_round'] = np.round(train_clean['city_development_index'], 1)

In [10]:
# Prep 3:
# Fill missing values of the nominal variable 'gender' with the category 'no info'.
# fillna is used (rather than a regex-enabled replace) because NaN is a
# missing-value marker, not a text pattern.
train_clean['gender'] = train_clean['gender'].fillna('no info')

In [11]:
# Prep 6:
# Fill missing values of the nominal variable 'major_discipline' with the category 'no info'.
# fillna is used (rather than a regex-enabled replace) because NaN is a
# missing-value marker, not a text pattern.
train_clean['major_discipline'] = train_clean['major_discipline'].fillna('no info')

In [12]:
# Prep 8:
# Derive 'experience_group' by merging the raw 'experience' labels into ranges.
# Labels that are not listed (such as ">20") and missing values pass through unchanged.
train_clean['experience_group'] = (
    train_clean['experience']
    .replace(["<1", "1", "2", "3", "4"], "<5")
    .replace(["5", "6", "7", "8", "9", "10"], "5-10")
    .replace(["11", "12", "13", "14", "15"], "11-15")
    .replace(["16", "17", "18", "19", "20"], "16-20")
)

In [13]:
# Prep 9 (annotated "raus", German for "out", in the original notes):
# Fill missing values of the nominal variable 'company_size' with the category 'no info'.
# fillna is used (rather than a regex-enabled replace) because NaN is a
# missing-value marker, not a text pattern.
train_clean['company_size'] = train_clean['company_size'].fillna('no info')

In [14]:
# Prep 10:
# Fill missing values of the nominal variable 'company_type' with the category 'no info'.
# fillna is used (rather than a regex-enabled replace) because NaN is a
# missing-value marker, not a text pattern.
train_clean['company_type'] = train_clean['company_type'].fillna('no info')

In [15]:
# Prep 12
# Bin 'training_hours' into 40-hour steps: each value is floored to the next lower multiple of 40.
# Note: despite its name, 'training_ten_hours' uses a bin width of 40, not 10.
train_clean['training_ten_hours'] = (
    np.floor(train_clean['training_hours'].div(40))
    .mul(40)
    .astype(int)
)

In [16]:
# Prep 4 / Prep 5 / Prep 7 / Prep 9 / Prep 11:
# Remove the rows that still contain missing values (intended for the ordinally
# scaled variables; dropna() without a subset checks every column), then
# renumber the index from 0.
train_clean = train_clean.dropna().reset_index(drop=True)

In [17]:
# Overwrite the CSV with the cleaned frame (row index omitted).
path = 'datasets/train_clean.csv'
train_clean.to_csv(path, index = False)

In [18]:
def show_clean_data():
    """Print the shape of `train_clean` and display its first five rows."""
    dimensions = train_clean.shape
    print("Number of rows and columns in data set:", dimensions)
    display(train_clean.head(5))

In [19]:
# Uncomment the call below to display the first rows of the cleaned data
#show_clean_data()

### Transforming data: train_cleaned

In [20]:
# Branch off a copy of the cleaned frame for the one-hot encoding stage.
train_clean_next = train_clean.copy()

In [21]:
# Write the copy to CSV (row index omitted).
path = 'datasets/train_clean_next.csv'
train_clean_next.to_csv(path, index = False)

In [22]:
# Read the CSV written above back into `train_clean_next`; read_csv infers the column dtypes from the file.
# NOTE(review): later cells compare 'city' with the integer 21, which presumably relies on a
# reload like this one turning the digit strings left by Prep 1 into integers — confirm
# before removing any of the CSV round trips.
data = 'datasets/train_clean_next.csv'
train_clean_next= pd.read_csv(data)

In [23]:
# Columns to one-hot encode. A '<name>_original' copy of each one is added first,
# because get_dummies replaces the encoded columns with their indicator columns.
categorical_columns = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience_group',
    'company_size',
    'company_type',
    'last_new_job',
]

train_clean_next = train_clean_next.assign(
    **{column + '_original': train_clean_next[column] for column in categorical_columns}
)

# Each indicator column is named '<column>_<category>'.
train_clean_next = pd.get_dummies(
    train_clean_next,
    columns=categorical_columns,
    prefix=categorical_columns,
)

# Give the indicator columns readable names.
train_clean_next = train_clean_next.rename(columns={
    # gender
    "gender_Female": "Female",
    "gender_Male": "Male",
    "gender_Other": "Gender_other",
    "gender_no info": "Gender_no_info",

    # relevent_experience
    "relevent_experience_No relevent experience": "No_experience_in_data_science",
    "relevent_experience_Has relevent experience": "Experience_in_data_science",

    # enrolled_university
    "enrolled_university_no_enrollment": "No_course",
    "enrolled_university_Part time course": "Part_time_course",
    "enrolled_university_Full time course": "Full_time_course",

    # education_level
    "education_level_Primary School": "Primary_School",
    "education_level_High School": "High_School",
    "education_level_Masters": "Masters",
    "education_level_Graduate": "Graduate",
    "education_level_Phd": "Phd",

    # major_discipline
    "major_discipline_Arts": "Major_in_Arts",
    "major_discipline_Business Degree": "Major_in_Business_Degree",
    "major_discipline_Humanities": "Major_in_Humanities",
    "major_discipline_STEM": "Major_in_STEM",
    "major_discipline_Other": "Major_in_Other",
    "major_discipline_No Major": "No_Major",
    "major_discipline_no info": "Major_no_info",

    # company_type
    "company_type_Early Stage Startup": "Currently_in_Early_Stage_Startup",
    "company_type_Funded Startup": "Currently_in_Funded_Startup",
    "company_type_Public Sector": "Currently_in_Public_Sector",
    "company_type_Pvt Ltd": "Currently_in_Pvt_Ltd",
    "company_type_NGO": "Currently_in_NGO",
    "company_type_Other": "Currently_in_other_company_type",
    "company_type_no info": "Company_type_no_info",

    # last_new_job
    "last_new_job_never": "Last_new_job_never",
    "last_new_job_1": "1_year_between_previous_and_current_job",
    "last_new_job_2": "2_years_between_previous_and_current_job",
    "last_new_job_3": "3_years_between_previous_and_current_job",
    "last_new_job_4": "4_years_between_previous_and_current_job",
    "last_new_job_>4": "More_than_4_years_between_previous_and_current_job",

    # experience_group
    "experience_group_<5": "Work_experience_up_to_5_y",
    "experience_group_5-10": "Work_experience_5_to_10_y",
    "experience_group_11-15": "Work_experience_11_to_15_y",
    "experience_group_16-20": "Work_experience_16_to_20_y",
    "experience_group_>20": "Work_experience_more_than_20_y",

    # company_size
    "company_size_<10": "Company_size_up_to_10",
    "company_size_10/49": "Company_size_10_to_49",
    "company_size_50-99": "Company_size_50_to_99",
    "company_size_100-500": "Company_size_100_to_500",
    "company_size_500-999": "Company_size_500_to_999",
    "company_size_1000-4999": "Company_size_1000_to_4999",
    "company_size_5000-9999": "Company_size_5000_to_9999",
    "company_size_10000+": "Company_size_more_than_10000",
    "company_size_no info": "Company_size_no_info",
})



In [24]:
# Snapshot of the encoded frame under the name `train_cleaned`.
train_cleaned = train_clean_next.copy()

In [25]:
# Write `train_cleaned` to CSV (row index omitted).
path = 'datasets/train_cleaned.csv'
train_cleaned.to_csv(path, index = False)

### Transforming data: train_model (for XGBoost)

In [26]:
# Branch off a copy of `train_cleaned` from which columns will be removed.
train_cleaned_drop = train_cleaned.copy()

In [27]:
# Write the copy to CSV (row index omitted).
path = 'datasets/train_cleaned_drop.csv'
train_cleaned_drop.to_csv(path, index = False)

In [28]:
# Read the CSV written above back into `train_cleaned_drop`; read_csv infers the column dtypes from the file.
data = 'datasets/train_cleaned_drop.csv'
train_cleaned_drop= pd.read_csv(data)

In [29]:
# Columns removed from the frame: the 'no info' indicators for gender, company
# type and major, the enrollee id, the raw 'experience' labels and the
# '*_original' copies of the encoded columns.
# NOTE(review): 'Company_size_no_info' is not in this list, unlike the other
# '*_no_info' indicators — confirm that is intended.
to_drop = [
    'Gender_no_info',
    'Company_type_no_info',
    'Major_no_info',
    'enrollee_id',
    'experience',
    'gender_original',
    'relevent_experience_original',
    'enrolled_university_original',
    'education_level_original',
    'major_discipline_original',
    'experience_group_original',
    'company_size_original',
    'company_type_original',
    'last_new_job_original',
]
train_cleaned_drop = train_cleaned_drop.drop(columns=to_drop)

In [30]:
# Snapshot of the reduced frame under the name `train_model`.
train_model = train_cleaned_drop.copy()

In [31]:
# Write `train_model` to CSV (row index omitted).
path = 'datasets/train_model.csv'
train_model.to_csv(path, index = False)

In [32]:
# Show the (rows, columns) dimensions of `train_model`.
train_model.shape

(18014, 51)

### Transforming data: train_binary (for SHAP values)

In [33]:
# Branch off a copy of `train_model` for the binary-split features.
train_model_next = train_model.copy()

In [34]:
# Write the copy to CSV (row index omitted).
path = 'datasets/train_model_next.csv'
train_model_next.to_csv(path, index = False)

In [35]:
# Read the CSV written above back into `train_model_next`; read_csv infers the column dtypes from the file.
data = 'datasets/train_model_next.csv'
train_model_next= pd.read_csv(data)

In [36]:
# Replace 'city', 'city_development_index' and 'training_hours' by binary
# indicator columns.

# city_split: '1' for city 21, '0' for every other city (kept as strings).
# The comparison is made on the numeric value, so it matches whether 'city'
# was loaded as integers (21) or as digit strings ('21').
train_model_next['city_split'] = np.where(
    pd.to_numeric(train_model_next['city'], errors='coerce') == 21, '1', '0'
)

# cdi_split: 1 when the development index rounded to one decimal is <= 0.6, else 0.
# A threshold is used instead of a lookup table of float keys, so every value
# receives a 0/1 code. Assumes the index has no missing values (rows with NaN
# are dropped during cleaning).
train_model_next['cdi_round'] = train_model_next['city_development_index'].round(1)
train_model_next['cdi_split'] = (train_model_next['cdi_round'] <= 0.6).astype(int)

# training_split: 0 for fewer than 80 training hours (the 40-hour bins 0 and 40),
# 1 from 80 hours upwards. The threshold has no upper limit on the hours.
train_model_next['training_split'] = (train_model_next['training_hours'] >= 80).astype(int)

# The source columns are no longer needed once the splits exist.
train_model_next = train_model_next.drop(columns=[
    'city',
    'city_development_index',
    'cdi_round',
    'training_hours',
])

# One indicator column per split value, named '<column>_<value>'.
split_columns = ['city_split', 'cdi_split', 'training_split']
train_model_next = pd.get_dummies(
    train_model_next,
    columns=split_columns,
    prefix=split_columns,
)

# Readable names for the indicator columns.
# NOTE(review): "Training_up_to_40_hours" actually covers all rows below 80 hours,
# and the 0.6 boundary applies to the rounded index — confirm the labels.
train_model_next = train_model_next.rename(columns={
    "city_split_0": "City_is_not_City21",
    "city_split_1": "City_is_City21",
    "cdi_split_0": "City_development_index_more_than_0.6",
    "cdi_split_1": "City_development_index_up_to_0.6",
    "training_split_0": "Training_up_to_40_hours",
    "training_split_1": "Training_more_than_40_hours",
})



In [37]:
# Remove the 40-hour bin column 'training_ten_hours' from the frame.
to_drop = ['training_ten_hours']
train_model_next = train_model_next.drop(columns=to_drop)

In [38]:
# Snapshot of the binary-split frame under the name `train_binary`.
train_binary = train_model_next.copy()

In [39]:
# Write `train_binary` to CSV (row index omitted).
path = 'datasets/train_binary.csv'
train_binary.to_csv(path, index = False)

### Transforming data: train_chi (for Chi2-tests)

In [40]:
# Branch off a copy of `train_cleaned` for the chi-square features.
# Note: `train_cleaned_next` is a different frame from the similarly named `train_clean_next` used earlier.
train_cleaned_next = train_cleaned.copy()

In [41]:
# Write the copy to CSV (row index omitted).
path = 'datasets/train_cleaned_next.csv'
train_cleaned_next.to_csv(path, index = False)

In [42]:
# Read the CSV written above back into `train_cleaned_next`; read_csv infers the column dtypes from the file.
data = 'datasets/train_cleaned_next.csv'
train_cleaned_next= pd.read_csv(data)

In [43]:
def _one_vs_rest(values, categories, positive):
    """Encode `values` as 1 for `positive` and 0 for every other listed category.

    Parameters
    ----------
    values : pandas.Series
        Column holding the category labels.
    categories : list
        All labels that receive a 0/1 code.
    positive : object
        The label encoded as 1.

    Returns
    -------
    pandas.Series
        Codes aligned with `values`. Labels missing from `categories`
        (and missing values) come out as NaN, as with any ``Series.map``
        lookup table.
    """
    return values.map({category: int(category == positive) for category in categories})


# Category lists shared by more than one flag.
experience_groups = ["<5", "5-10", "11-15", "16-20", ">20"]
education_levels = ["Primary School", "High School", "Masters", "Graduate", "Phd"]

# Binary variable for city_development_index:
# 1 when the index rounded to one decimal is <= 0.6, else 0. Assumes the index
# has no missing values (rows with NaN are dropped during cleaning).
train_cleaned_next['cdi_round'] = train_cleaned_next['city_development_index'].round(1)
train_cleaned_next['cdi_split'] = (train_cleaned_next['cdi_round'] <= 0.6).astype(int)

# Binary variable for city: '1' for city 21, '0' otherwise (kept as strings).
# The comparison is made on the numeric value, so it matches whether 'city'
# was loaded as integers (21) or as digit strings ('21').
train_cleaned_next['city_split'] = np.where(
    pd.to_numeric(train_cleaned_next['city'], errors='coerce') == 21, '1', '0'
)

# Binary variable for experience: more than 20 years versus the rest.
train_cleaned_next['experience_split_20'] = _one_vs_rest(
    train_cleaned_next['experience_group_original'], experience_groups, ">20"
)

# Binary variable for experience: under 5 years versus the rest.
train_cleaned_next['experience_split_5'] = _one_vs_rest(
    train_cleaned_next['experience_group_original'], experience_groups, "<5"
)

# Binary variable for relevant experience: 1 = no relevant experience.
train_cleaned_next['relevant_experience_split'] = _one_vs_rest(
    train_cleaned_next['relevent_experience_original'],
    ["No relevent experience", "Has relevent experience"],
    "No relevent experience",
).astype(int)

# Binary variable for last_new_job: "never" versus the rest.
train_cleaned_next['last_new_job_split'] = _one_vs_rest(
    train_cleaned_next['last_new_job_original'],
    ["never", "1", "2", "3", "4", ">4"],
    "never",
)

# Binary variable for company type: "Pvt Ltd" versus the rest ('no info' counts as rest).
train_cleaned_next['company_type_split'] = _one_vs_rest(
    train_cleaned_next['company_type_original'],
    ["Early Stage Startup", "Funded Startup", "Public Sector", "Pvt Ltd", "NGO", "Other", "no info"],
    "Pvt Ltd",
)

# Binary variable for enrolled_university: no enrollment versus any course.
train_cleaned_next['enrolled_university_split'] = _one_vs_rest(
    train_cleaned_next['enrolled_university_original'],
    ["no_enrollment", "Part time course", "Full time course"],
    "no_enrollment",
)

# Binary variable for education level: "Graduate" versus the rest.
train_cleaned_next['education_level_split_g'] = _one_vs_rest(
    train_cleaned_next['education_level_original'], education_levels, "Graduate"
)

# Binary variable for education level: "High School" versus the rest.
train_cleaned_next['education_level_split_h'] = _one_vs_rest(
    train_cleaned_next['education_level_original'], education_levels, "High School"
)

# Binary variable for company size: 'no info' versus any stated size.
train_cleaned_next['company_size_split'] = _one_vs_rest(
    train_cleaned_next['company_size_original'],
    ["<10", "10/49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+", "no info"],
    "no info",
)





In [44]:
# Snapshot of the frame with the binary flags under the name `train_chi`.
train_chi = train_cleaned_next.copy()

In [45]:
# Write `train_chi` to CSV (row index omitted).
path = 'datasets/train_chi.csv'
train_chi.to_csv(path, index = False)

### Transforming data: train_graph (for cat plot)

In [46]:
# The plotting frame starts from a copy of the raw `train` frame, not from the
# cleaned frames: the 'city_' prefix, missing values and all rows are still present.
train_graph = train.copy()

In [47]:
# Round the development index to one decimal and flag values above 0.6:
# 0 for 0.0-0.6, 1 for 0.7-1.0; any other value becomes NaN.
train_graph['cdi_round'] = train_graph['city_development_index'].round(1)
train_graph['cdi_split'] = train_graph['cdi_round'].map(
    {tenth / 10: int(tenth >= 7) for tenth in range(11)}
)

# City_split keeps the label "city_21" and collapses every other value
# (missing ones included) into "not_city_21".
train_graph['City_split'] = train_graph['city'].where(
    train_graph['city'] == "city_21", "not_city_21"
)



In [48]:
# Label columns for the categorical plot.

# Company Type: "Pvt Ltd" versus every other listed type.
# NOTE(review): "no info" is listed, but train_graph is copied from the raw `train`
# frame, where missing company types were not replaced by 'no info' — missing
# values presumably stay NaN here; confirm this is intended.
train_graph['Company Type'] = (
    train_graph['company_type']
    .replace(
        ["Early Stage Startup", "Funded Startup", "Public Sector", "NGO", "Other", "no info"],
        "NOT employed in a Pvt Ltd company",
    )
    .replace("Pvt Ltd", "Employed in a Pvt Ltd company")
)

# Highest Education: "Graduate" stays as it is; the other listed levels are merged.
train_graph['Highest Education'] = train_graph['education_level'].replace(
    ["Primary School", "High School", "Masters", "Phd"], "NOT Graduate"
)

# City_Development_Index: text label for the 0/1 flag in 'cdi_split'.
train_graph['City_Development_Index'] = (
    train_graph['cdi_split']
    .replace(1, "> 0.6")
    .replace(0, "<= 0.6")
)


                            

In [49]:
# Further label columns for the categorical plot.

# DataScience_Experience: readable wording for the two 'relevent_experience' values.
train_graph['DataScience_Experience'] = (
    train_graph['relevent_experience']
    .replace("No relevent experience", "Novice in data science")
    .replace("Has relevent experience", "Experienced in data science")
)

# Working_Experience: every label from "<1" through "20" forms one group, ">20" the other.
# NOTE(review): the label "Under 20 years worked" also covers the value "20".
train_graph['Working_Experience'] = (
    train_graph['experience']
    .replace(["<1"] + [str(years) for years in range(1, 21)], "Under 20 years worked")
    .replace(">20", "Over 20 years worked")
)

# City_Development_Index: text label for the 0/1 flag in 'cdi_split'
# (rebuilt from 'cdi_split', so repeating it gives the same result).
train_graph['City_Development_Index'] = (
    train_graph['cdi_split']
    .replace(1, "> 0.6")
    .replace(0, "<= 0.6")
)

In [50]:
# Snapshot of the plotting frame under the name `train_cat_graph`.
train_cat_graph = train_graph.copy()

In [51]:
# Write `train_cat_graph` to CSV (row index omitted).
path = 'datasets/train_cat_graph.csv'
train_cat_graph.to_csv(path, index = False)