In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [43]:
# Preprocessing fitted by the most recent clean_data(..., datatype="train") call.
# It lives at module level because clean_data's return values have no slot for
# it, yet the "test" branch has to reuse exactly the same fitted transformers.
_FITTED_PREPROCESSING = {}


def _make_transformers():
    """Return a fresh (imputer, one-hot encoder) pair with the notebook's settings."""
    imputer = SimpleImputer(strategy="constant")
    # NOTE(review): recent scikit-learn releases rename `sparse` to
    # `sparse_output` -- confirm against the installed version.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    return imputer, encoder


def _impute_frame(imputer, frame, fit):
    """Impute `frame` and return a DataFrame keeping its columns and row index."""
    values = imputer.fit_transform(frame) if fit else imputer.transform(frame)
    # The imputer returns a bare array; restore both labels so the result
    # stays aligned with the target series that came out of the same split.
    return pd.DataFrame(values, columns=frame.columns, index=frame.index)


def _one_hot_frame(encoder, frame, cat_vars, fit):
    """Swap the `cat_vars` columns of `frame` for one-hot columns; return floats."""
    if not cat_vars:
        # Nothing to encode, so the encoder is left untouched.
        return frame.astype(float)
    if fit:
        encoded = encoder.fit_transform(frame[cat_vars])
    else:
        encoded = encoder.transform(frame[cat_vars])
    # One-hot encoding drops the index; put it back so concat aligns row-wise.
    one_hot = pd.DataFrame(encoded, index=frame.index)
    numeric = frame.drop(cat_vars, axis=1)
    return pd.concat([numeric, one_hot], axis=1).astype(float)


def clean_data(data, datatype="train"):
    """Impute and one-hot encode a raw frame for modelling.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. Must contain an ``id`` column, plus a ``target`` column
        when ``datatype == "train"``.
    datatype : {"train", "test"}
        ``"train"``: drop ``id``/``target``, split 80/20 with
        ``random_state=0``, fit the imputer and encoder on the training part
        only, and apply them to both parts.
        ``"test"``: drop ``id`` and apply the transformers fitted by the most
        recent ``"train"`` call, so the output has the same columns the model
        was trained on.

    Returns
    -------
    ``"train"``: ``(X_train, X_valid, y_train, y_valid)``, all cast to float.
    ``"test"``: a single float DataFrame.
    Feature frames keep the row index of the rows they were built from.

    Raises
    ------
    ValueError
        If ``datatype`` is neither ``"train"`` nor ``"test"``.

    Notes
    -----
    If ``"test"`` is requested before any ``"train"`` call (or the feature
    columns differ from the ones seen in training), the transformers are
    fitted on the given data instead and a warning is emitted; the resulting
    one-hot columns are then not guaranteed to match the training features.
    """
    if datatype == "train":
        predictors = data.drop(["id", "target"], axis=1)
        target = data.target
        X_train, X_valid, y_train, y_valid = train_test_split(
            predictors,
            target,
            train_size=0.8,
            test_size=0.2,
            random_state=0
        )
        cat_vars = [col for col in X_train.columns if predictors[col].dtype == "object"]
        imputer, encoder = _make_transformers()

        # Fit on the training part only; the validation part is transformed
        # with the already-fitted imputer and encoder.
        imputed_X_train = _impute_frame(imputer, X_train, fit=True)
        imputed_X_valid = _impute_frame(imputer, X_valid, fit=False)
        OH_X_train = _one_hot_frame(encoder, imputed_X_train, cat_vars, fit=True)
        OH_X_valid = _one_hot_frame(encoder, imputed_X_valid, cat_vars, fit=False)

        # Remember the fitted preprocessing for later "test" calls.
        _FITTED_PREPROCESSING.clear()
        _FITTED_PREPROCESSING.update(
            columns=list(X_train.columns),
            cat_vars=cat_vars,
            imputer=imputer,
            encoder=encoder,
        )
        return OH_X_train, OH_X_valid, y_train.astype(float), y_valid.astype(float)

    if datatype == "test":
        features = data.drop(["id"], axis=1)
        fitted = _FITTED_PREPROCESSING
        if fitted and fitted["columns"] == list(features.columns):
            # Reuse the training-time transformers: categories unseen in
            # training become all-zero rows thanks to handle_unknown='ignore'.
            imputed = _impute_frame(fitted["imputer"], features, fit=False)
            return _one_hot_frame(fitted["encoder"], imputed, fitted["cat_vars"], fit=False)

        import warnings

        warnings.warn(
            "clean_data(datatype='test') found no matching preprocessing fitted on "
            "training data; fitting the imputer and encoder on the given data "
            "instead, so the output columns may not match the training features.",
            stacklevel=2,
        )
        cat_vars = [col for col in features.columns if features[col].dtype == "object"]
        imputer, encoder = _make_transformers()
        imputed = _impute_frame(imputer, features, fit=True)
        return _one_hot_frame(encoder, imputed, cat_vars, fit=True)

    raise ValueError(f"datatype must be 'train' or 'test', got {datatype!r}")

In [11]:
def get_mae(X_train, X_valid, y_train, y_valid, n_estimators):
    """Fit an XGBRegressor capped at `n_estimators` and return its validation MAE.

    The validation split is also passed to `fit` as the early-stopping
    evaluation set (``early_stopping_rounds=5``).
    """
    fit_options = dict(
        early_stopping_rounds=5,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )
    regressor = XGBRegressor(n_estimators=n_estimators, n_jobs=4)
    regressor.fit(X_train, y_train, **fit_options)
    return mean_absolute_error(y_valid, regressor.predict(X_valid))

def get_best_estimator(X_train, X_valid, y_train, y_valid,
                       min_estimators=50, max_estimators=500):
    """Return the `n_estimators` value with the lowest validation MAE.

    Candidates are ``min_estimators`` up to, but not including,
    ``max_estimators``. Each candidate is scored once via `get_mae`; on a tie
    the smallest candidate wins because only a strictly lower MAE replaces the
    current best.

    Parameters
    ----------
    X_train, X_valid, y_train, y_valid
        Split features/targets, forwarded unchanged to `get_mae`.
    min_estimators : int, default 50
        First candidate, also the initial best.
    max_estimators : int, default 500
        Exclusive upper bound of the candidate range.

    Returns
    -------
    int
        The best-scoring candidate.
    """
    best_estimator = min_estimators
    # Score the baseline once and carry the best score along, instead of
    # retraining the current best model on every comparison.
    best_mae = get_mae(X_train, X_valid, y_train, y_valid, best_estimator)
    for candidate in range(min_estimators + 1, max_estimators):
        mae = get_mae(X_train, X_valid, y_train, y_valid, candidate)
        if mae < best_mae:
            best_estimator, best_mae = candidate, mae
    return best_estimator

In [12]:
class Model:
    """Thin wrapper around an XGBRegressor: preprocess, fit, and write predictions."""

    def __init__(self, n_estimators=50):
        """Create the underlying regressor with the given `n_estimators` cap."""
        self.n_estimators = n_estimators
        self.model = XGBRegressor(n_estimators=self.n_estimators, n_jobs=4)

    def train(self, data):
        """Preprocess `data` with `clean_data`, fit the model, and report validation MAE.

        Returns a status string containing the mean absolute error of this
        fitted model on the validation split.
        """
        X_train, X_valid, y_train, y_valid = clean_data(data, datatype="train")
        self.model.fit(
            X_train,
            y_train,
            early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
        # Score the model that was just fitted rather than training a second,
        # throwaway model only to obtain the number to report.
        predictions = self.model.predict(X_valid)
        mae = mean_absolute_error(y_valid, predictions)
        return f"Model trained with MEAN ABSOLUTE ERROR: {mae}"

    def save_prediction(self, data, output_path='submission.csv'):
        """Predict on `data` and write an ``id,target`` CSV to `output_path`.

        `data` must contain an ``id`` column; it is cast to int in the output.
        Returns a status string naming the file that was written.
        """
        cleaned_data = clean_data(data, datatype="test")
        predictions = self.model.predict(cleaned_data)
        # Save test predictions to file
        output = pd.DataFrame({
            'id': data.id.astype(int),
            'target': predictions
        })
        output.to_csv(output_path, index=False)
        return f"Predictions saved to '{output_path}'"

In [6]:
# NOTE: if re-enabled, this must run after X_train, X_valid, y_train and y_valid
# have been assigned; in this notebook that happens in a later cell.
# best_estimator = get_best_estimator(X_train, X_valid, y_train, y_valid)

In [7]:
# model = Model(n_estimators=best_estimator)
# Fixed estimator cap, used instead of the commented-out best_estimator variant.
model = Model(n_estimators=100)

In [35]:
# Load the training CSV and fit the model; the status string returned by
# train() (including the validation MAE) is the cell's displayed output.
train_data = pd.read_csv("train.csv")
model.train(train_data)

'Model trained with MEAN ABSOLUTE ERROR: 0.5768163908385996'

In [44]:
# Load the test CSV and write predictions to 'submission.csv'; requires that
# `model` has already been trained by the preceding cell.
test_data = pd.read_csv("test.csv")
model.save_prediction(test_data)

"Predictions saved to 'submission.csv'"

In [34]:
# Run the preprocessing outside of Model so the split frames are available as
# notebook variables, then score a fresh fit capped at 200 estimators.
train_data = pd.read_csv("train.csv")
X_train, X_valid, y_train, y_valid = clean_data(train_data, datatype="train")
get_mae(X_train, X_valid, y_train, y_valid, 200)

0.5768163908385996

In [36]:
# Sanity check: shape of the single `cont1` feature column.
X_train.cont1.shape

(240000,)

In [37]:
# Sanity check: (rows, feature columns) of the preprocessed training frame.
X_train.shape

(240000, 70)

In [30]:
# Preview the first rows of the preprocessed training frame.
# NOTE(review): the saved output of this cell lists cat0..cat9 columns, which the
# current clean_data replaces with one-hot columns -- it looks stale; re-run to refresh.
X_train.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,4.0,8.0,...,0.277709,0.398071,0.487289,0.869034,0.439229,0.352714,0.228661,0.363903,0.331813,0.47468
1,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,2.0,5.0,...,0.281611,0.388998,0.176158,1.000809,0.27151,0.440423,0.311323,0.164655,0.279045,0.452538
2,1.0,1.0,0.0,0.0,1.0,2.0,0.0,4.0,4.0,14.0,...,0.278146,0.403402,0.502907,0.324734,0.511943,0.328226,0.543738,0.332158,0.31199,0.463616
3,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.0,6.0,5.0,...,0.686815,0.821791,0.538206,0.631913,0.405548,0.753464,0.903191,0.91115,0.611461,0.477071
4,0.0,0.0,0.0,2.0,1.0,3.0,0.0,3.0,2.0,6.0,...,0.257302,0.748708,0.520747,0.828223,0.996511,0.926008,0.693743,0.782965,0.819652,0.772523
