In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [167]:
def _as_frame(values, like):
    """Wrap a transformed array in a DataFrame carrying ``like``'s columns and index."""
    return pd.DataFrame(values, columns=like.columns, index=like.index)


def clean_data(data, datatype="train"):
    """Impute missing values and ordinal-encode object columns.

    Args:
        data: DataFrame to clean. In ``"train"`` mode it must contain a
            ``target`` column, which is split off as the label.
        datatype: ``"train"`` or ``"test"``.

    Returns:
        ``"train"``: ``(X_train, X_valid, y_train, y_valid)`` from an 80/20
        split with ``random_state=0``, all cast to float. The feature frames
        keep the row index of the split so they stay label-aligned with the
        returned targets.
        ``"test"``: the cleaned feature frame cast to float, with the row
        index of ``data``.

    Raises:
        ValueError: if ``datatype`` is neither ``"train"`` nor ``"test"``.
    """
    # Fail loudly instead of implicitly returning None for a mistyped mode.
    if datatype not in ("train", "test"):
        raise ValueError(f"datatype must be 'train' or 'test', got {datatype!r}")

    if datatype == "train":
        predictors = data.drop(["target"], axis=1)
        target = data.target
        X_train, X_valid, y_train, y_valid = train_test_split(
            predictors,
            target,
            train_size=0.8,
            test_size=0.2,
            random_state=0
        )
        # Fill missing values with each column's most frequent value; the
        # imputer is fitted on the training split only.
        imputer = SimpleImputer(strategy="most_frequent")
        # The imputer returns a bare array, so restore column names and index.
        imputed_X_train = _as_frame(imputer.fit_transform(X_train), X_train)
        imputed_X_valid = _as_frame(imputer.transform(X_valid), X_valid)
        # Ordinal-encode the columns that were object-typed before imputation.
        cat_vars = [col for col in X_train.columns if X_train[col].dtype == "object"]
        if cat_vars:  # nothing to encode for an all-numeric frame
            encoder = OrdinalEncoder()
            imputed_X_train[cat_vars] = encoder.fit_transform(imputed_X_train[cat_vars])
            imputed_X_valid[cat_vars] = encoder.transform(imputed_X_valid[cat_vars])
        return (
            imputed_X_train.astype(float),
            imputed_X_valid.astype(float),
            y_train.astype(float),
            y_valid.astype(float),
        )

    # datatype == "test"
    # NOTE(review): the imputer and encoder below are fitted on the data passed
    # in here, not reused from the "train" call, so category codes are only
    # guaranteed to match training when both sets contain the same categories.
    imputer = SimpleImputer(strategy="most_frequent")
    imputed_data = _as_frame(imputer.fit_transform(data), data)
    cat_vars = [col for col in data.columns if data[col].dtype == "object"]
    if cat_vars:  # nothing to encode for an all-numeric frame
        encoder = OrdinalEncoder()
        imputed_data[cat_vars] = encoder.fit_transform(imputed_data[cat_vars])
    return imputed_data.astype(float)

In [168]:
def get_mae(X_train, X_valid, y_train, y_valid, n_estimators):
    """Fit a fresh ``XGBRegressor`` and return its MAE on the validation split.

    Args:
        X_train, y_train: data the regressor is fitted on.
        X_valid, y_valid: data passed as ``eval_set`` during fitting and used
            for the returned score.
        n_estimators: forwarded to ``XGBRegressor(n_estimators=...)``.

    Returns:
        ``mean_absolute_error(y_valid, predictions_on_X_valid)``.
    """
    fit_options = {
        "early_stopping_rounds": 5,
        "eval_set": [(X_valid, y_valid)],
        "verbose": False,
    }
    regressor = XGBRegressor(n_jobs=4, n_estimators=n_estimators)
    regressor.fit(X_train, y_train, **fit_options)
    return mean_absolute_error(y_valid, regressor.predict(X_valid))

def get_best_estimator(X_train, X_valid, y_train, y_valid, start=50, stop=500, step=1):
    """Search ``range(start, stop, step)`` for the ``n_estimators`` with the lowest MAE.

    Args:
        X_train, X_valid, y_train, y_valid: split data forwarded to ``get_mae``.
        start: first candidate; also the value returned when the range is
            empty or no candidate improves on it.
        stop: exclusive upper bound of the candidates.
        step: stride between candidates.

    Returns:
        The candidate with the smallest MAE; ties keep the earliest candidate.
    """
    best_estimator = start
    best_mae = None
    for candidate in range(start, stop, step):
        mae = get_mae(X_train, X_valid, y_train, y_valid, candidate)
        # Remember the incumbent's score rather than retraining it on every
        # iteration just to compare against it.
        if best_mae is None or mae < best_mae:
            best_estimator, best_mae = candidate, mae
    return best_estimator

In [181]:
class Model:
    """Wrapper around an ``XGBRegressor`` that cleans data, fits, and writes a submission file."""

    def __init__(self, n_estimators=50):
        """Create the underlying regressor.

        Args:
            n_estimators: forwarded to ``XGBRegressor(n_estimators=...)``.
        """
        self.n_estimators = n_estimators
        self.model = XGBRegressor(n_estimators=self.n_estimators, n_jobs=4)

    def train(self, data):
        """Clean ``data``, fit the regressor and report its validation MAE.

        Args:
            data: training DataFrame; must be accepted by
                ``clean_data(data, datatype="train")``.

        Returns:
            A status string containing the mean absolute error of the fitted
            model on the validation split.
        """
        X_train, X_valid, y_train, y_valid = clean_data(data, datatype="train")
        self.model.fit(
            X_train,
            y_train,
            early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
        # Score the model that was actually fitted above instead of training a
        # second, throwaway regressor only to obtain the number.
        predictions = self.model.predict(X_valid)
        mae = mean_absolute_error(y_valid, predictions)
        return f"Model trained with MEAN ABSOLUTE ERROR: {mae}"

    def save_prediction(self, data, path="submission.csv"):
        """Predict on ``data`` and write ``id``/``target`` columns to a CSV file.

        Args:
            data: test DataFrame containing an ``id`` column.
            path: destination CSV file.

        Returns:
            A status string naming the file that was written.
        """
        # NOTE(review): clean_data(..., datatype="test") fits its own imputer
        # and encoder on the test frame, so categorical codes may differ from
        # the ones seen during training - confirm this is acceptable.
        cleaned_data = clean_data(data, datatype="test")
        predictions = self.model.predict(cleaned_data)
        # Save test predictions to file
        output = pd.DataFrame({
            'id': cleaned_data["id"].astype(int),
            'target': predictions
        })
        output.to_csv(path, index=False)
        return f"Predictions saved to '{path}'"

In [182]:
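# Optional, slow hyper-parameter search (disabled). It needs the split frames,
# which are not defined at notebook level, so create them first:
# X_train, X_valid, y_train, y_valid = clean_data(pd.read_csv("train.csv"), datatype="train")
# best_estimator = get_best_estimator(X_train, X_valid, y_train, y_valid)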

In [183]:
# Alternative (disabled): build the model from a searched `best_estimator` value.
# model = Model(n_estimators=best_estimator)
# Active choice: a fixed n_estimators of 100.
model = Model(n_estimators=100)

In [184]:
# Load the labelled training set and fit the model; the status string returned
# by train() is the cell's displayed output.
train_data = pd.read_csv("train.csv")
model.train(train_data)

'Model trained with MEAN ABSOLUTE ERROR: 0.5769100038861545'

In [185]:
# Load the test set, predict with the fitted model and write the submission
# CSV; the returned status string is the cell's displayed output.
test_data = pd.read_csv("test.csv")
model.save_prediction(test_data)

"Predictions saved to 'submission.csv'"