<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Exploration" data-toc-modified-id="Data-Exploration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Exploration</a></span><ul class="toc-item"><li><span><a href="#Load-and-check" data-toc-modified-id="Load-and-check-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load and check</a></span></li></ul></li><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Engineering</a></span></li><li><span><a href="#Algorithms" data-toc-modified-id="Algorithms-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Algorithms</a></span><ul class="toc-item"><li><span><a href="#Run-a-randomized-searchCV-with-XGBoostClassifier" data-toc-modified-id="Run-a-randomized-searchCV-with-XGBoostClassifier-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Run a randomized searchCV with XGBoostClassifier</a></span></li><li><span><a href="#Save-a-trained-model" data-toc-modified-id="Save-a-trained-model-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Save a trained model</a></span></li></ul></li><li><span><a href="#Challenges" data-toc-modified-id="Challenges-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Challenges</a></span><ul class="toc-item"><li><span><a href="#Making-a-submission" data-toc-modified-id="Making-a-submission-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Making a submission</a></span></li></ul></li></ul></div>

# Data Exploration

## Load and check

In [None]:
# pandas is imported in this cell because no earlier cell shown in the
# notebook brings `pd` into scope; without it a fresh kernel raises NameError.
import pandas as pd

# Load the challenge test set (path is relative to the current working directory).
xtest = pd.read_csv('xtest_challenge.csv')

# Total number of missing values in the whole dataframe:
# the first .sum() counts NaNs per column, the second adds the columns together.
print(xtest.isna().sum().sum())

# Feature Engineering

# Algorithms

## Run a randomized searchCV with XGBoostClassifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Base classifier; the hyperparameters sampled below override its defaults.
# NOTE(review): `silent` is deprecated in recent XGBoost releases in favour of
# `verbosity` — confirm against the installed version before changing it.
# NOTE(review): both the classifier and the search request n_jobs=-1, which may
# oversubscribe the CPU — consider keeping parallelism at one level only.
xgb = XGBClassifier(
    n_jobs=-1,
    silent=False)

# Candidate values for each hyperparameter explored by the randomized search
xgb_max_depth = [3, 5, 7, 10]             # Usual values between 3-10
xgb_learning_rate = [0.1, 0.5, 1, 1.2]    # Makes the model more robust by shrinking the weights on each step
xgb_n_estimators = [100, 200, 500, 1000, 1200]
xgb_booster = ['gbtree']                  # other options: 'gblinear', 'dart'
xgb_reg_lambda = [1, 2]                   # L2 used to reduce overfitting

# Search space handed to RandomizedSearchCV: parameter name -> list of candidates.
# `reg_lambda` must reference `xgb_reg_lambda` (the list defined just above);
# the previous name `xgb_lambda` was never defined and raised a NameError.
hyperparameters = dict(
    max_depth=xgb_max_depth,
    learning_rate=xgb_learning_rate,
    n_estimators=xgb_n_estimators,
    booster=xgb_booster,
    reg_lambda=xgb_reg_lambda)

# Randomized search: 50 sampled configurations, 5-fold cross-validation,
# fixed random_state so the sampled configurations are reproducible.
rscv = RandomizedSearchCV(xgb, hyperparameters, random_state=1, n_iter=50, cv=5, verbose=10, n_jobs=-1)

# Fit the randomized search.
# NOTE(review): xtrain_part / ytrain_part are not defined in the cells shown
# here — they must exist in the kernel before this cell is run.
best_model = rscv.fit(xtrain_part, ytrain_part)

# View hyperparameter values of the best model (parameters fetched once and reused)
best_params = best_model.best_estimator_.get_params()
print('Best max_depth:', best_params['max_depth'])
print('Best learning_rate:', best_params['learning_rate'])
print('Best n_estimators:', best_params['n_estimators'])

## Save a trained model
Credit: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

In [None]:
import pickle

# LogisticRegression is imported here because no other cell shown in the
# notebook brings it into scope.
from sklearn.linear_model import LogisticRegression

# Train a simple model to demonstrate saving and reloading.
# NOTE(review): X_train / Y_train / X_test / Y_test are not defined in the
# cells shown here — they must exist in the kernel before this cell is run.
model = LogisticRegression()
model.fit(X_train, Y_train)

# Save the model to disk; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
lrmodel_filename = 'saved_lr.sav'
with open(lrmodel_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk.
# Security note: pickle.load can execute arbitrary code, so only unpickle
# files that you created yourself or otherwise trust.
with open(lrmodel_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the reloaded model on the held-out data
result = loaded_model.score(X_test, Y_test)

# Challenges

## Making a submission

In [4]:
def make_submission(test_data, algorithm, filename='submission.csv'):
    """Create a CSV file for a challenge submission.

    Runs ``algorithm.predict`` on ``test_data`` and writes the predictions to
    ``filename`` with ``np.savetxt`` using the integer format ``'%1.0d'``.

    Parameters
    ----------
    test_data
        Features to predict on; passed unchanged to ``algorithm.predict``.
    algorithm
        Object exposing a ``predict(test_data)`` method (e.g. a fitted model).
    filename : str, optional
        Path of the output file. Defaults to ``'submission.csv'``.

    Returns
    -------
    None
        The function only writes the file.
    """
    # Local import: no cell shown in the notebook brings `np` into scope,
    # so the function would otherwise fail on a fresh kernel.
    import numpy as np

    ytest = algorithm.predict(test_data)
    # Write to the `filename` argument; the previous code referenced an
    # undefined name (`fichier`) and raised a NameError.
    np.savetxt(filename, ytest, fmt='%1.0d', delimiter=',')