In [1]:
# Load the training and test CSV files, using the "ID" column as the row index.
import pandas as pd

train, test = (
    pd.read_csv(csv_path, index_col="ID")
    for csv_path in ("train.csv", "test.csv")
)

In [2]:
# Rows without a value in the target column cannot be used for training,
# so discard them before separating features from the target.
train = train.dropna(axis=0, subset=['Time_taken (min)'])

# Keep the target values in their own variable ...
target_col = train['Time_taken (min)']

# ... and leave only the feature columns in the training frame.
train = train.drop(columns=['Time_taken (min)'])

In [3]:
# Hold out 20% of the training rows as a validation set.
# random_state is fixed so the split is reproducible between runs.
from sklearn.model_selection import train_test_split

train_x, valid_x, train_target_y, valid_target_y = train_test_split(
    train,
    target_col,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Numeric features: columns whose dtype is int64 or float64.
numeric_cols = [
    cname
    for cname, col_dtype in train_x.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

# Categorical features: object-dtype columns with fewer than 10 distinct
# values (the threshold is convenient but arbitrary).
low_cardinality_cols = [
    cname
    for cname in train_x.columns
    if train_x[cname].dtype == "object" and train_x[cname].nunique() < 10
]

In [5]:
# Restrict every data set to the selected columns (categorical first, then
# numeric). Each result is an independent copy of the selected columns.
my_cols = [*low_cardinality_cols, *numeric_cols]
train_x1, valid_x1, test_x = (
    frame[my_cols].copy() for frame in (train_x, valid_x, test)
)

In [6]:
# One-hot encode the categorical columns (pandas is used to keep the code short).
train_x1 = pd.get_dummies(train_x1)
valid_x1 = pd.get_dummies(valid_x1)
test_x = pd.get_dummies(test_x)

# Each frame was encoded independently, so validation/test may lack dummy
# columns for categories they never contain (and may have extra ones).
# join='left' keeps exactly the training columns. fill_value=0 matters: a
# dummy column missing from validation/test means no row there has that
# category, so the indicator must be 0. Without it the new columns would be
# all-NaN. fill_value only applies to newly created columns; NaNs already
# present in the original feature columns are left untouched.
train_x1, valid_x1 = train_x1.align(valid_x1, join='left', axis=1, fill_value=0)
train_x1, test_x = train_x1.align(test_x, join='left', axis=1, fill_value=0)

In [8]:
# Build the model and wrap it in a pipeline.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# Gradient-boosted regressor; random_state is fixed for reproducibility.
model = XGBRegressor(
    n_estimators=1250,
    learning_rate=0.05,
    random_state=0,
)

# Two-step pipeline: replace missing values with the column median, then
# pass the result to the regressor.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', SimpleImputer(strategy='median')),
        ('model', model),
    ]
)



In [9]:
# Train the pipeline, predict on the validation set and report the MAE.

from sklearn.metrics import mean_absolute_error

# fit() returns the fitted pipeline itself, so training and validation
# prediction can be chained.
predictions = my_pipeline.fit(train_x1, train_target_y).predict(valid_x1)

# Mean absolute error between the held-out targets and the predictions.
mae = mean_absolute_error(valid_target_y, predictions)

print("Mean Absolute Error:", mae)

Mean Absolute Error: 3.4902959290872686


In [10]:
# Run the fitted pipeline on the encoded test features to get the
# predictions that will go into the submission file.

preds_test = my_pipeline.predict(X=test_x)

In [11]:
# Write the test predictions to a CSV file: one row per test ID, and the
# DataFrame's own row index is not written.
output = pd.DataFrame(
    data={
        'ID': test_x.index,
        'Time_taken (min)': preds_test,
    }
)
output.to_csv(path_or_buf='sample_submission.csv', index=False)