In [1]:
import xgboost
import pandas as pd

class Dataset():
    def __init__(self, data_path='/kaggle/input/titanic/', label_name='Survived'):
        
        self.train_data_full = pd.read_csv(data_path + 'train.csv')
        self.train_labels = self.train_data_full[label_name]
        self.train_data = self.train_data_full.drop(columns=[label_name])
        
        self.test_data = pd.read_csv(data_path + 'test.csv')
        
    def process_data(self, data):
        object_columns = data.select_dtypes(include=['object']).columns
        data[object_columns] = data[object_columns].astype('category')
        return data
    
    def fetch_data(self, set_name='train'):
        if set_name == 'train':
            return self.process_data(self.train_data), self.train_labels
        if set_name == 'test':
            return self.process_data(self.test_data) 
    
class Model():
    def __init__(self):
        self.model = xgboost.XGBRegressor(
                                    gamma=1,                 
                                    learning_rate=0.01,
                                    max_depth=3,
                                    n_estimators=100,                                                                    
                                    subsample=0.8,
                                    random_state=34,
                                    enable_categorical=True, # true, because we have categorical columns in our dataset
                                    tree_method='hist'
                                  )
    
    def fit(self, data, labels):
        """
        Input: takes dataset as input and fits model.
        Output: nothing.
        """
        self.model.fit(data, labels)
        
    def predict(self, data):
        """
        Input: takes test data to predict on.
        Output: returns probabilities.
        """
        output_labels = self.model.predict(data)
        return output_labels

def convert_probabilities_to_labels(probabilities, threshold=.5):
    """
    Logic: if predicted probability is greater than .5 we say they survived i.e label 1. 0 otherwise.
    """
    
    labels = list(probabilities > threshold)
    return labels
    
def generate_formated_csv(test_data, predicted_probabilities):
    """
    Input: takes test dataset, predicted probabilities
    Output: returns csv with two columns PassengerId,Survived. 
    """
    
    # it's given in dataset description that csv must contain 418 columns.
    assert len(test_data) == 418
    assert len(predicted_probabilities) == 418
    
    submission_dataframe = pd.DataFrame()
    submission_dataframe['PassengerId'] = test_data['PassengerId']
    submission_dataframe['Survived'] = convert_probabilities_to_labels(predicted_probabilities)
    
    return submission_dataframe
        
    
if __name__ == "__main__":
    #config 
    root_path = '/kaggle/input/titanic/'
    
    # preparing data using Dataset class we wrote above 
    dataset = Dataset(root_path)
    train_data, train_labels = dataset.fetch_data('train') 
    test_data = dataset.fetch_data('test')
    
    # initialising and training model
    model = Model()
    model.fit(train_data, train_labels)
    test_probabilities = model.predict(test_data)
    
    # converting probabilities to required format
    dataframe = generate_formated_csv(test_data, test_probabilities)
    dataframe.to_csv('submission.csv', index=False)

In [2]:
submission_file = pd.read_csv('submission.csv')
submission_file.head(418)

Unnamed: 0,PassengerId,Survived
0,892,False
1,893,True
2,894,False
3,895,False
4,896,True
...,...,...
413,1305,False
414,1306,True
415,1307,False
416,1308,False
