In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as sm
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the training and test tables, using the enrollee id as the row index.
X = pd.read_csv('aug_train.csv', index_col='enrollee_id')
X_test = pd.read_csv('aug_test.csv', index_col='enrollee_id')

# Keep only rows that have a target value, then split the frame into
# the label series (y) and the predictor columns (X).
X = X.dropna(axis=0, subset=['target'])
y = X['target']
X = X.drop(columns=['target'])

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the split repeatable.
# NOTE(review): X_train / X_valid / y_train / y_valid are reassigned by the
# later split on X_DM (cell In [8]) before they are first read.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [4]:
# To see the number of unique values, number of NAN values and percentage of NAN values for each column.
def get_info(df):
    """Summarise cardinality and missingness for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns:

        * ``num_of_unique``      - result of ``df.nunique()``
        * ``num_of_NAN``         - number of missing entries
        * ``col_NAN_percantage`` - share of missing entries in percent,
          rounded to two decimals

        The spelling of ``col_NAN_percantage`` is kept as-is because other
        cells index the result by that exact key.
    """
    missing = df.isna()
    info = pd.DataFrame({
        'num_of_unique': df.nunique(),
        'num_of_NAN': missing.sum(),
        # Scale to percent *before* rounding. Rounding the fraction first and
        # multiplying afterwards reintroduces binary floating-point noise
        # (e.g. 0.07 * 100 -> 7.000000000000001), which shows up in the table
        # and can perturb threshold comparisons on this column.
        'col_NAN_percantage': (missing.mean() * 100).round(2),
    })
    return info

In [5]:
# Per-column uniqueness / missing-value summary of the predictor frame.
X_info = get_info(df=X)

In [6]:
# Display summary statistics (count, mean, std, min, quartiles, max) for X.
X.describe()

Unnamed: 0,city_development_index,training_hours
count,19158.0,19158.0
mean,0.828848,65.366896
std,0.123362,60.058462
min,0.448,1.0
25%,0.74,23.0
50%,0.903,47.0
75%,0.92,88.0
max,0.949,336.0


In [7]:
# X_DM: predictors after (D)ropping the columns with too many (M)issing values.

# Columns whose missing-value percentage (from X_info) exceeds 20 are removed.
cols_with_hight_missing = X_info.loc[X_info.col_NAN_percantage > 20].index

# Build X_DM from X with a non-inplace drop so X itself is left untouched.
X_DM = X.drop(columns=cols_with_hight_missing)

# Rebind X_test instead of mutating it in place, and ignore labels that are
# already absent: X_test is loaded in an earlier cell, so an in-place drop
# makes a second run of this cell fail with KeyError.
X_test = X_test.drop(columns=cols_with_hight_missing, errors='ignore')

In [8]:
from sklearn.model_selection import train_test_split

# Split the reduced predictor frame (X_DM) 80/20 into training and validation
# parts, with a fixed seed so the partition is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_DM,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [12]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    col
    for col in X_train.select_dtypes(include=['object']).columns
    if X_train[col].nunique() < 10
]

# Numerical columns: those stored as int64 or float64
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Restrict all three frames to the selected columns (categorical first),
# copying so the originals stay untouched.
my_cols = [*categorical_cols, *numerical_cols]
X1_train = X_train.loc[:, my_cols].copy()
X1_valid = X_valid.loc[:, my_cols].copy()
X1_test = X_test.loc[:, my_cols].copy()

In [15]:
# Numerical features: impute missing entries with a constant
# (no fill_value is given, so the imputer's default constant applies).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: impute with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)


In [17]:
# Random forest with 100 trees and a fixed seed for repeatable results.
# NOTE(review): a regressor scored with MAE is used here; if `target` is a
# binary label (TODO confirm against the data), a classifier with a
# classification metric may be the better fit.
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and the model so both are fitted and applied together.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split (preprocessing is fitted as part of the pipeline).
my_pipeline.fit(X1_train, y_train)

# Predict on the validation split and report the mean absolute error.
preds = my_pipeline.predict(X1_valid)
validation_mae = mean_absolute_error(y_valid, preds)
print('MAE:', validation_mae)

MAE: 0.3138153450204159


In [19]:
# Apply the fitted pipeline (preprocessing + model) to the test features.
preds_test = my_pipeline.predict(X=X1_test)