In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled training data and the unlabelled test data, indexed by employee id
X_full = pd.read_csv('train.csv', index_col='Employee_ID')
X_test_full = pd.read_csv('test.csv', index_col='Employee_ID')

# Drop rows whose target is missing, then separate the target from the predictors.
# The frame is rebound at each step instead of being mutated with inplace=True.
X_full = X_full.dropna(axis=0, subset=['Attrition_rate'])
y = X_full['Attrition_rate']
X_full = X_full.drop(columns=['Attrition_rate'])

# Break off a validation set (20%) from the training data; fixed seed for repeatability
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0)

# "Cardinality" means the number of unique values in a column.
# Select text columns with relatively low cardinality (< 10: convenient but arbitrary).
# is_string_dtype() is handed the dtype (not the Series), so it accepts classic
# `object` columns as well as the dedicated string dtypes that newer pandas versions
# infer when reading text; a literal `dtype == "object"` comparison would be False for
# those and would silently drop every categorical feature.
categorical_cols = [cname for cname in X_train_full.columns if
                    pd.api.types.is_string_dtype(X_train_full[cname].dtype) and
                    X_train_full[cname].nunique() < 10]

# Select numerical columns (64-bit integer and float columns only)
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64', 'float64']]

# Keep the selected columns only; copies leave the *_full frames untouched
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Numeric features: fill missing entries with a constant placeholder
# (no fill_value is passed, so SimpleImputer's default for this strategy applies)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill missing entries with the column's most frequent value,
# then one-hot encode (handle_unknown='ignore' avoids errors on unseen categories)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random forest regressor: 100 trees, fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and the model so both are fitted and applied together
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training split; as the last expression, the fitted pipeline is echoed
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Age', 'Education_Level',
                                                   'Time_of_service',
                                                   'Time_since_promotio

In [4]:
# Numeric features: fill missing entries with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Categorical features: fill missing entries with a constant placeholder
# (no fill_value is passed, so SimpleImputer's default applies), then one-hot encode
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [10]:
# Display the selected training features (categorical columns first, then numerical)
X_train

Unnamed: 0_level_0,Gender,Relationship_Status,Hometown,Decision_skill_possess,Compensation_and_Benefits,Age,Education_Level,Time_of_service,Time_since_promotion,growth_rate,...,Post_Level,Pay_Scale,Work_Life_balance,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EID_10814,F,Married,Lebanon,Analytical,type3,34.0,3,5.0,4,40,...,3,4.0,4.0,3,-1.8176,-0.4537,2.0,2,8,3
EID_4658,F,Married,Franklin,Behavioral,type2,57.0,3,25.0,3,20,...,1,2.0,2.0,5,1.6081,-0.4537,2.0,4,7,1
EID_17936,F,Single,Franklin,Behavioral,type2,20.0,3,1.0,1,34,...,3,8.0,3.0,3,1.6081,-0.4537,1.0,4,9,4
EID_24477,F,Married,Franklin,Conceptual,type2,26.0,1,4.0,1,42,...,3,2.0,4.0,4,-0.1048,0.7075,2.0,2,5,3
EID_12200,F,Single,Lebanon,Conceptual,type2,39.0,3,11.0,2,66,...,2,8.0,2.0,4,0.7516,0.7075,3.0,4,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EID_23071,F,Married,Lebanon,Directive,type3,26.0,4,2.0,2,68,...,2,6.0,4.0,3,-0.1048,-0.4537,2.0,4,8,4
EID_15023,F,Married,Springfield,Conceptual,type3,,3,,2,29,...,1,6.0,4.0,3,,-0.4537,2.0,2,6,2
EID_3851,M,Married,Springfield,Analytical,type3,49.0,3,10.0,3,68,...,1,4.0,2.0,3,-0.1048,-0.4537,2.0,2,6,3
EID_7149,M,Married,Lebanon,Behavioral,type2,62.0,3,36.0,4,20,...,2,4.0,2.0,5,-0.1048,-0.4537,2.0,2,8,3


In [8]:

# Random forest regressor: 500 trees, fixed seed for repeatable results
model = RandomForestRegressor(random_state=100, n_estimators=500)

# Full workflow: the current preprocessor followed by the regressor
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit preprocessing and model together on the training split
my_pipeline.fit(X_train, y_train)

# Predict on the held-out validation split (preprocessing is applied automatically)
preds = my_pipeline.predict(X_valid)

# Report the mean absolute error on the validation split
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE:', score)

MAE: 0.1368813354056741


In [None]:
# Predict the target for the test set with the fitted pipeline
preds_test = my_pipeline.predict(X_test)

In [None]:
# Build the submission table (employee id + predicted attrition rate) and write it
# to CSV without the DataFrame's positional index
output = pd.DataFrame(
    data={
        'Employee_ID': X_test.index,
        'Attrition_rate': preds_test,
    }
)
output.to_csv(path_or_buf='my_submission.csv', index=False)