# HR analysis

## Data Pre processing

#### Import libraries

In [1]:
import pandas as pd
import numpy as np
from pre_cleaner import pre_cleaner, to_clean
from sklearn.preprocessing import OrdinalEncoder

In [2]:
data = pd.read_csv('aug_train_clean.csv', index_col=0)
data.head()

Unnamed: 0,city_development_index,enrolled_university,education_level,experience,company_size,last_new_job,training_hours,target,gender_Female,gender_Male,...,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd
0,0.92,no_enrollment,Graduate,>20,,1,36,1.0,0,1,...,0,0,0,1,0,0,0,0,0,1
1,0.776,no_enrollment,Graduate,15,50-99,>4,47,0.0,0,1,...,0,0,0,1,0,0,0,0,0,1
2,0.624,Full time course,Graduate,5,,never,83,0.0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,0.789,no_enrollment,Graduate,<1,,never,52,1.0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0.767,no_enrollment,Masters,>20,50-99,4,8,0.0,0,1,...,0,0,0,1,0,1,0,0,0,0


In [3]:
#pre_cleaner(data)

In [4]:
to_clean(data)

"enrolled_university", "education_level", "experience", "company_size", "last_new_job", 

### Cleaning "enrolled_university"

In [5]:
print(data.enrolled_university.describe(), '\n')
print(data.enrolled_university.isna().sum(), '\n')
print(data.enrolled_university.unique())

count             19158
unique                3
top       no_enrollment
freq              14203
Name: enrolled_university, dtype: object 

0 

['no_enrollment' 'Full time course' 'Part time course']


In [6]:
oe = OrdinalEncoder(categories=[['no_enrollment', 'Part time course', 'Full time course']])
data.enrolled_university = oe.fit_transform(pd.DataFrame(data.enrolled_university))
# Now 'no_enrollment' is mapped to 0, 'Part time course' to 1, and 'Full time course' to 2
print(data.enrolled_university.unique())
data.enrolled_university = data.enrolled_university.astype('int32')
print(data.enrolled_university.unique(), '\n')
print(data.enrolled_university.dtype)

[0. 2. 1.]
[0 2 1] 

int32


### Cleaning  "education_level"

In [7]:
print(data.education_level.describe(), '\n')
print(data.education_level.isna().sum(), '\n')
print(data.education_level.unique())

count        19158
unique           5
top       Graduate
freq         12058
Name: education_level, dtype: object 

0 

['Graduate' 'Masters' 'High School' 'Phd' 'Primary School']


In [8]:
oe = OrdinalEncoder(categories=[['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']])
data.education_level = oe.fit_transform(pd.DataFrame(data.education_level))
# Now 'Primary School is mapped to 0, 'High School' to 1, 'Graduate' to 2, 'Masters' to 3, and 'Phd' to 4
print(data.education_level.unique())
data.education_level = data.education_level.astype('int32')
print(data.education_level.unique(), '\n')
print(data.education_level.dtype)


[2. 3. 1. 4. 0.]
[2 3 1 4 0] 

int32


### Cleaning  "experience"


In [9]:
print(data.experience.describe(), '\n')
print(data.experience.isna().sum(), '\n')
print(data.experience.unique())

count     19158
unique       22
top         >20
freq       3351
Name: experience, dtype: object 

0 

['>20' '15' '5' '<1' '11' '13' '7' '17' '2' '16' '1' '4' '10' '14' '18'
 '19' '12' '3' '6' '9' '8' '20']


In [10]:
print(data.experience.unique(), '\n')
change = {'>20': 21, '<1': 0}
data.experience = data.experience.replace(change)
data.experience = pd.to_numeric(data.experience)
print(data.experience.unique())

['>20' '15' '5' '<1' '11' '13' '7' '17' '2' '16' '1' '4' '10' '14' '18'
 '19' '12' '3' '6' '9' '8' '20'] 

[21 15  5  0 11 13  7 17  2 16  1  4 10 14 18 19 12  3  6  9  8 20]


### Cleaning "company_size"

In [11]:
print(data.company_size.describe(), '\n')
print(data.company_size.isna().sum(), '\n')
print(data.company_size.unique())

count     13220
unique        8
top       50-99
freq       3083
Name: company_size, dtype: object 

5938 

[nan '50-99' '<10' '10000+' '5000-9999' '1000-4999' '10/49' '100-500'
 '500-999']


In [12]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data.company_size = imp.fit_transform(pd.DataFrame(data.company_size))
print(data.company_size.isna().sum())

0


In [13]:
print(data.company_size.describe(), '\n')
print(data.company_size.isna().sum(), '\n')
print(data.company_size.unique())

count     19158
unique        8
top       50-99
freq       9021
Name: company_size, dtype: object 

0 

['50-99' '<10' '10000+' '5000-9999' '1000-4999' '10/49' '100-500'
 '500-999']


In [14]:
oe = OrdinalEncoder(categories=[['<10', '10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+']])
data.company_size = oe.fit_transform(pd.DataFrame(data.company_size))

print(data.company_size.unique())
data.company_size = data.company_size.astype('int32')
print(data.company_size.unique(), '\n')
print(data.company_size.dtype)

[2. 0. 7. 6. 5. 1. 3. 4.]
[2 0 7 6 5 1 3 4] 

int32


### Cleaning "last_new_job"

In [15]:
print(data.last_new_job.describe(), '\n')
print(data.last_new_job.isna().sum(), '\n')
print(data.last_new_job.unique())

count     19158
unique        6
top           1
freq       8463
Name: last_new_job, dtype: object 

0 

['1' '>4' 'never' '4' '3' '2']


In [16]:
print(data.last_new_job.unique(), '\n')
change = {'>4': 5, 'never': 0}
data.last_new_job = data.last_new_job.replace(change)
data.last_new_job = pd.to_numeric(data.last_new_job)
print(data.last_new_job.unique())

['1' '>4' 'never' '4' '3' '2'] 

[1 5 0 4 3 2]


### Last look to the data

In [17]:
minimum = [data[col].min() for col in data]
maximum = [data[col].max() for col in data]

print('min \t max \t col')
for col, mi, mx in zip(data.columns, minimum, maximum):
    print(mi, '\t', mx, '\t', col)

min 	 max 	 col
0.448 	 0.949 	 city_development_index
0 	 2 	 enrolled_university
0 	 4 	 education_level
0 	 21 	 experience
0 	 7 	 company_size
0 	 5 	 last_new_job
1 	 336 	 training_hours
0.0 	 1.0 	 target
0 	 1 	 gender_Female
0 	 1 	 gender_Male
0 	 1 	 gender_Other
0 	 1 	 relevent_experience_No relevent experience
0 	 1 	 major_discipline_Arts
0 	 1 	 major_discipline_Business Degree
0 	 1 	 major_discipline_Humanities
0 	 1 	 major_discipline_No Major
0 	 1 	 major_discipline_Other
0 	 1 	 major_discipline_STEM
0 	 1 	 company_type_Early Stage Startup
0 	 1 	 company_type_Funded Startup
0 	 1 	 company_type_NGO
0 	 1 	 company_type_Other
0 	 1 	 company_type_Public Sector
0 	 1 	 company_type_Pvt Ltd


In [18]:
# almost all column are okay except "training_hours" and "target"
# At first dtype change in "target" column
print(data.target.unique())
data.target = data.target.astype('int32')
print(data.target.unique(), '\n')
print(data.target.dtype)

[1. 0.]
[1 0] 

int32


In [19]:
# Now is time to get "training_hours" column back in order
print(data.training_hours.nunique())
print(data.training_hours.unique())

241
[ 36  47  83  52   8  24  18  46 123  32 108  23  26 106   7 132  68  50
  48  65  13  22 148  72  40 141  82 145 206 152  42  14 112  87  20  21
  92 102  43  45  19  90  25  15  98 142  28 228  29  12  17  35   4 136
  27  74  86  75 332 140 182 172  33  34 150 160   3   2 210 101  59 260
 131 109  70  51  60 164 290 133  76 156 120 100  39  55  49   6 125 326
 198  11  41 114 246  81  31  84 105  38 178 104 202  88 218  62  10  80
  77  37 162 190  30  16   5  54  44 110 262 107 134 103  96  57 240  94
 113  56  64 320   9 129  58 126 166  95  97 204 116 161 146 302  53 143
 124 214 288 306 322  67  61 130 220  78 314 226 280  91 234 163 151  85
 256 168 144  66 128  73 122 154  63 292 188  71 135 138 184  89 157 118
 111 192 127 216 139 196  99 167 276 121  69 155 316 242 304 284 278 310
 222 212 250 180 258 330 158 149 165  79 194 176 174 312 200 328 300 153
 232 336 308 147 298 224 254 248 236 170 264 119 117 334 324   1 238 266
 282 268 244 272 294 270 286]


In [20]:
# This column has many values but all values are important, so they must be standarized or normalized
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

data.training_hours = sc.fit_transform(pd.DataFrame(data.training_hours))
print(f'min: {data.training_hours.min():.4f}')
print(f'max: {data.training_hours.max():.4f}')

min: -1.0718
max: 4.5063


## Random forest 

#### Import libraries

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Split data to test and trainig set 

In [22]:
X = data.drop('target', axis=1)
y = data.target

print(X.shape)
print(y.shape)

(19158, 23)
(19158,)


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification




In [25]:
fi = 10
ii = 5
ri = 5

score = [[0,0,0],0]

# n_classes(2) * n_clusters_per_class(2) must be smaller or equal 2**n_informative(1)=2

for f in range(fi):
    for i in range(ii):
        for r in range(ri):
            if 2*(i+2) <= 2**(i+1):
                X, y = make_classification(n_samples=500*(f+1), n_features=i+1,
                                           n_informative=i+1, n_redundant=0,
                                           random_state=0, shuffle=False)

                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

                clf = RandomForestClassifier(max_depth=4, random_state=1)
                clf.fit(X_train, y_train)
                score_now = clf.score(X_test, y_test)
                if score_now > score[1]:
                    score[1] = score_now
                    score[0] = [f,i,r]
                    
                print(f'Score ({f},{i},{r}): {score[f,i,r]}')
            


KeyboardInterrupt: 