# DATA

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hr-analytics-job-change-of-data-scientists/sample_submission.csv
/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv
/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv


In [2]:
train = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
test = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv')

# Data Exploratory

In [3]:
# function Exploratory Data Analysis
def eda(dfA, all=False, desc='Exploratory Data Analysis'):
    print(desc)
    print(f'\nShape:\n{dfA.shape}')
    print(f'\nDTypes - Numerics')
    print(dfA.select_dtypes(include=np.number).columns.tolist())
    print(f'\nDTypes - Categoricals')
    print(dfA.select_dtypes(include='object').columns.tolist())
    print(f'\nIs Null: {dfA.isnull().sum().sum()}')
    print(f'{dfA.isnull().mean().sort_values(ascending=False)}')
    dup = dfA.duplicated()
    print(f'\nDuplicated: \n{dfA[dup].shape}\n')
    try:
        print(dfA[dfA.duplicated(keep=False)].sample(4))
    except:
        pass
    if all:  # here you put yours prefered analysis that detail more your dataset
        
        print(f'\nDTypes - Numerics')
        print(dfA.describe(include=[np.number]))
        print(f'\nDTypes - Categoricals')
        print(dfA.describe(include=['object']))


In [4]:
eda(train)

Exploratory Data Analysis

Shape:
(19158, 14)

DTypes - Numerics
['enrollee_id', 'city_development_index', 'training_hours', 'target']

DTypes - Categoricals
['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

Is Null: 20733
company_type              0.320493
company_size              0.309949
gender                    0.235306
major_discipline          0.146832
education_level           0.024011
last_new_job              0.022080
enrolled_university       0.020148
experience                0.003393
target                    0.000000
training_hours            0.000000
relevent_experience       0.000000
city_development_index    0.000000
city                      0.000000
enrollee_id               0.000000
dtype: float64

Duplicated: 
(0, 14)



In [5]:
# function Fill NaN values
def cleanNaN(dfA):
  for col in dfA:
    if type(dfA[col]) == 'object':
        dfA[col] = dfA[col].fillna('unknow')
    else:
        dfA[col] = dfA[col].fillna(0)
  return dfA

In [6]:
treino = cleanNaN(train)

In [7]:
eda(treino)

Exploratory Data Analysis

Shape:
(19158, 14)

DTypes - Numerics
['enrollee_id', 'city_development_index', 'training_hours', 'target']

DTypes - Categoricals
['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

Is Null: 0
target                    0.0
training_hours            0.0
last_new_job              0.0
company_type              0.0
company_size              0.0
experience                0.0
major_discipline          0.0
education_level           0.0
enrolled_university       0.0
relevent_experience       0.0
gender                    0.0
city_development_index    0.0
city                      0.0
enrollee_id               0.0
dtype: float64

Duplicated: 
(0, 14)



In [8]:
# changing Categoricals to number
def catToNumeric(dfA):
    for x in dfA.select_dtypes(include='object').columns.tolist():
        ncol = 'cc_'+x 
        dfA[x] = pd.Categorical(dfA[x])
        dfA[ncol] = dfA[x].cat.codes
    return dfA

In [9]:
treino2 = catToNumeric(treino)
ncols = treino2.select_dtypes(include=np.number).columns.tolist()
ncols

['enrollee_id',
 'city_development_index',
 'training_hours',
 'target',
 'cc_city',
 'cc_gender',
 'cc_relevent_experience',
 'cc_enrolled_university',
 'cc_education_level',
 'cc_major_discipline',
 'cc_experience',
 'cc_company_size',
 'cc_company_type',
 'cc_last_new_job']

In [10]:
import plotly.graph_objects as go
import plotly.figure_factory as ff

**Correlation between variables**

In [11]:
def correlation(dfA, varT, minValue=0.5, showGraphic=True, title='Correlation between variables'):
    corr = dfA.corr()
    print(f'\nAnalysing features:\n'
          f'Target: {varT}\n'
          f'minValue de ref.: {minValue}\n'
          f'\nMain Features:')
    corrs = corr[varT]
    features = []
    for i in range(0, len(corrs)):
        if corrs[i] > minValue and corrs.index[i] != varT:
            print(corrs.index[i], f'{corrs[i]:.2f}')
            features.append(corrs.index[i])
    if showGraphic:     
        z_text = np.around(corr.values, decimals=2) 
        fig2 = ff.create_annotated_heatmap(corr.values, 
                                          annotation_text=z_text,
                                           x=corr.index.values.tolist(),
                                y=corr.columns.values.tolist(),
                                colorscale='Viridis',
                                hoverongaps = False)
        
        for i in range(len(fig2.layout.annotations)):
            fig2.layout.annotations[i].font.size = 8
            
        fig2.show()

        
        
    
    return features

In [12]:
varTarget = 'target'

In [13]:
varsFeatures = correlation(treino2, varT=varTarget, minValue=-0.999)


Analysing features:
Target: target
minValue de ref.: -0.999

Main Features:
enrollee_id 0.05
city_development_index -0.34
training_hours -0.02
cc_city 0.06
cc_gender -0.08
cc_relevent_experience 0.13
cc_enrolled_university -0.15
cc_education_level -0.08
cc_major_discipline 0.06
cc_experience -0.01
cc_company_size -0.19
cc_company_type -0.20
cc_last_new_job -0.03


# Regressors

In [14]:
# ML Algorithms sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyRegressor


In [15]:
regressors = [
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        SVR(),
        LinearRegression(),
        GradientBoostingRegressor(),
        PoissonRegressor(),
        DummyRegressor(),
        LogisticRegression(),
        GaussianNB()
    ]

In [16]:
teste = cleanNaN(test)

In [17]:
teste2 = catToNumeric(teste)

In [18]:
Xtreino = treino2[varsFeatures]
ytreino = treino2[varTarget]

In [19]:
Xteste = teste2[varsFeatures]

In [20]:
reg = []
sco = []
for regressor in regressors:
    modelo = regressor
    modelo.fit(Xtreino, np.array(ytreino))
    sco.append(round(modelo.score(Xtreino, ytreino),2))
    previsao = modelo.predict(Xteste)
    reg.append(regressor)


overflow encountered in exp


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in add


lbfgs failed to converge (status=2):
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [21]:
meuMae = pd.DataFrame(columns=['Regressor', 'score'])
meuMae['Regressor'] = reg
meuMae['score'] = sco

In [22]:
meuMae = meuMae.sort_values(by='score', ascending=False)
meuMae

Unnamed: 0,Regressor,score
0,DecisionTreeRegressor(),1.0
1,"(DecisionTreeRegressor(max_features='auto', ra...",0.89
7,LogisticRegression(),0.75
8,GaussianNB(),0.75
4,([DecisionTreeRegressor(criterion='friedman_ms...,0.28
3,LinearRegression(),0.17
5,PoissonRegressor(),-0.0
6,DummyRegressor(),0.0
2,SVR(),-0.12


In [23]:
f'Best Regressor: {meuMae["Regressor"].values[0]}'

'Best Regressor: DecisionTreeRegressor()'

In [24]:
modelo = DecisionTreeRegressor()
modelo.fit(Xtreino, np.array(ytreino))
previsao = modelo.predict(Xteste)

previsao

In [25]:
submission = pd.DataFrame()
submission['enrollee_id'] = Xteste.enrollee_id
submission['target'] = previsao

In [26]:
submission.head()

Unnamed: 0,enrollee_id,target
0,32403,0.0
1,9858,0.0
2,31806,1.0
3,27385,0.0
4,27724,0.0


In [27]:
submission.to_csv('submission.csv', index=False)

# Basic... but done!