<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Exploration" data-toc-modified-id="Data-Exploration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Exploration</a></span><ul class="toc-item"><li><span><a href="#Load-and-check" data-toc-modified-id="Load-and-check-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load and check</a></span></li><li><span><a href="#Correlation-Matrix" data-toc-modified-id="Correlation-Matrix-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Correlation Matrix</a></span></li><li><span><a href="#Plotting-data-distribution" data-toc-modified-id="Plotting-data-distribution-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Plotting data distribution</a></span></li></ul></li><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Engineering</a></span><ul class="toc-item"><li><span><a href="#Plot-feature-importance" data-toc-modified-id="Plot-feature-importance-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Plot feature importance</a></span></li><li><span><a href="#Creating-new-columns" data-toc-modified-id="Creating-new-columns-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Creating new columns</a></span></li></ul></li><li><span><a href="#Algorithms" data-toc-modified-id="Algorithms-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Algorithms</a></span><ul class="toc-item"><li><span><a href="#RandSearchCV-XGBoostClassifier" data-toc-modified-id="RandSearchCV-XGBoostClassifier-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>RandSearchCV XGBoostClassifier</a></span></li><li><span><a href="#RandSearchCV-Light-Gradient-Boosting-Machine" data-toc-modified-id="RandSearchCV-Light-Gradient-Boosting-Machine-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>RandSearchCV Light Gradient Boosting Machine</a></span></li><li><span><a href="#Save-a-trained-model" data-toc-modified-id="Save-a-trained-model-3.3"><span 
class="toc-item-num">3.3&nbsp;&nbsp;</span>Save a trained model</a></span></li></ul></li><li><span><a href="#Challenges" data-toc-modified-id="Challenges-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Challenges</a></span><ul class="toc-item"><li><span><a href="#Making-a-submission" data-toc-modified-id="Making-a-submission-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Making a submission</a></span></li></ul></li><li><span><a href="#Metrics-and-Data-manipulation" data-toc-modified-id="Metrics-and-Data-manipulation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Metrics and Data manipulation</a></span><ul class="toc-item"><li><span><a href="#Submissions-table" data-toc-modified-id="Submissions-table-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Submissions table</a></span></li><li><span><a href="#Dataset-reduction-/-optimization" data-toc-modified-id="Dataset-reduction-/-optimization-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Dataset reduction / optimization</a></span></li></ul></li></ul></div>

In [2]:
import numpy as np
import pandas as pd

# Data Exploration

A gallery of 50 Matplotlib visualisations, with code:
https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

## Load and check

In [None]:
# Read the test features from disk.
xtest = pd.read_csv('file.csv')

# Total number of missing cells: per-column NA counts first, then their sum.
na_per_column = xtest.isna().sum()
print(na_per_column.sum())

## Correlation Matrix

In [None]:
# Pairwise correlation between the columns of the training frame.
correlations = train.corr()

# Draw the matrix as an image on a single axes, with the colour scale pinned
# to the full correlation range [-1, 1].
fig, ax = plt.subplots()
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
plt.show()

## Plotting data distribution

In [None]:
# Figure number 4 holds a 4x4 grid of subplots; only the first 14 columns of
# xtrain are drawn, one histogram per grid cell.
plt.figure(4, figsize=(15, 9))

for n, col in enumerate(xtrain.columns[0:14], start=1):
    # Subplot positions are 1-based, hence the enumeration starting at 1.
    plt.subplot(4, 4, n)
    plt.xlabel(col)
    plt.hist(xtrain[col], label=('x', 'y'))
    plt.draw()

# Feature Engineering

## Plot feature importance

In [None]:
import os

from sklearn.ensemble import RandomForestRegressor

# Number of top-ranked features shown in the chart.
TOP_N_FEATURES = 40

# The forest is fitted only so that its feature_importances_ can be read.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

plt.figure(figsize=(10,10))

name = "Random Forest"

# Column positions sorted by decreasing importance, truncated to the top N.
# The slice is applied once here, so the selections below need no re-slicing.
indices = np.argsort(rf.feature_importances_)[::-1][:TOP_N_FEATURES]
g = sns.barplot(
    y=X_train.columns[indices],
    x=rf.feature_importances_[indices],
    orient='h')
g.set_xlabel("Relative importance",fontsize=12)
g.set_ylabel("Features",fontsize=12)

g.tick_params(labelsize=9)
g.set_title("Feature importance")

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('images', exist_ok=True)
plt.savefig('images/importance.png')

plt.show()

## Creating new columns

In [None]:
# Square every original column in one vectorised pass and name each result
# '<column>_sq'.
xtrain_sq = xtrain.pow(2).add_suffix('_sq')

# Original columns first, squared columns after. A single concat replaces the
# column-by-column inserts, which fragment the frame when it has many columns.
xtrain_fe = pd.concat([xtrain, xtrain_sq], axis=1)

# Algorithms

## RandSearchCV XGBoostClassifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Classifier
# NOTE(review): `silent` is documented as deprecated in favour of `verbosity`
# in recent XGBoost releases — confirm against the installed version.
xgb = XGBClassifier(
    n_jobs=-1,
    silent=False)

# Create hyperparameter options
xgb_max_depth=[3, 5, 7, 10]             # Usual values between 3-10
xgb_learning_rate=[0.1, 0.5, 1, 1.2]    # Makes the model more robust by shrinking the weights on each step
xgb_n_estimators=[100, 200, 500, 1000, 1200]
xgb_booster=['gbtree']                  #, 'gblinear', 'dart']
xgb_reg_lambda=[1, 2]                   # L2 used to reduce overfitting

# Search space sampled by the randomized search below.
hyperparameters = dict(
    max_depth = xgb_max_depth,
    learning_rate = xgb_learning_rate,
    n_estimators = xgb_n_estimators,
    booster=xgb_booster,
    reg_lambda=xgb_reg_lambda)          # must match the list defined above

# Create randomized grid search: 50 sampled configurations, 5-fold CV each.
rscv = RandomizedSearchCV(xgb, hyperparameters, random_state=1, n_iter=50, cv=5, verbose=10, n_jobs=-1)
# Fit randomized search
best_model = rscv.fit(xtrain_part, ytrain_part)

# View Hyperparameter Values Of Best Model
# Read the winning estimator's parameters once instead of once per print.
best_params = best_model.best_estimator_.get_params()
print('Best max_depth:', best_params['max_depth'])
print('Best learning_rate:', best_params['learning_rate'])
print('Best n_estimators:', best_params['n_estimators'])

## RandSearchCV Light Gradient Boosting Machine

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

# The search runs on the first rows only; a single constant keeps the feature
# and label subsets the same length.
LGBM_SAMPLE_SIZE = 100000
xtrain_lgbm = xtrain.head(n=LGBM_SAMPLE_SIZE)
ytrain_lgbm = ytrain.head(n=LGBM_SAMPLE_SIZE)

# Classifier
lgb_estimator = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=100,
    learning_rate=0.1,
    metric='binary_logloss',
    n_jobs=-1)

# Create hyperparameter options
# NOTE(review): LightGBM lists `reg_alpha` as an alias of `lambda_l1`, so
# sampling both may be redundant or conflicting — confirm which one takes effect.
hyperparameters = dict(
    num_leaves = [31, 60, 128, 160, 250],
    reg_alpha = [0.1, 0.5],
    min_data_in_leaf = [30, 50, 100, 300, 400],
    n_estimators = [100, 200, 500, 1000, 2000, 5000],
    lambda_l1 = [0, 1, 1.5],
    lambda_l2 = [0, 1],
    max_depth = [7],
    )

# Create randomized grid search: 100 sampled configurations, 5-fold CV each.
# random_state makes the sampled configurations repeatable between runs.
rscv = RandomizedSearchCV(
    lgb_estimator, hyperparameters,
    random_state=1,
    n_iter=100, cv=5,
    verbose=20, n_jobs=-1)

# Fit randomized search (labels flattened to 1-D for the estimator).
best_model = rscv.fit(xtrain_lgbm, np.ravel(ytrain_lgbm))

# View Hyperparameter Values Of Best Model
# Read the winning estimator's parameters once instead of once per print.
best_params = best_model.best_estimator_.get_params()
print('Best num_leaves:', best_params['num_leaves'])
print('Best reg_alpha:', best_params['reg_alpha'])
print('Best min_data_in_leaf:', best_params['min_data_in_leaf'])
print('Best lambda_l1:', best_params['lambda_l1'])
print('Best lambda_l2:', best_params['lambda_l2'])

## Save a trained model
Credit: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

In [None]:
import pickle

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, Y_train)

# Save the model to disk
# The context manager closes the handle even if pickling raises.
filename = 'saved_lr.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model from disk
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, Y_test)

# Challenges

## Making a submission

In [4]:
def make_submission(test_data, algorithm, filename='submission.csv'):
    """Write a model's predictions for `test_data` to a submission file.

    test_data: Samples to predict on; passed unchanged to `algorithm.predict`.
    algorithm: Object exposing a `predict` method (presumably an already
        fitted model — confirm at the call site).
    filename: Path of the file to write (default 'submission.csv').

    The predictions are written with `np.savetxt`, one row per line, using
    integer formatting. Nothing is returned.
    """
    ytest = algorithm.predict(test_data)
    # Write to the caller-supplied path given by the `filename` parameter.
    np.savetxt(filename, ytest, fmt = '%1.0d', delimiter=',')

# Metrics and Data manipulation

## Submissions table

In [3]:
# Recorded LightGBM submissions, keyed by submission id. Each value holds the
# settings and the resulting score, in the column order given below.
lgbm_submissions = {
    '1': [500, 0.1, -1, 0.98465],
    '2': [3000, 0.08, -1, 0.985015830111],
    '3': [5000, 0.01, -1, 0.987662524559],
}

submission_columns = ['n_estimators', 'learning_rate', 'num_leaves', 'score']

# One row per submission id.
lgbm_subs_df = pd.DataFrame.from_dict(
    lgbm_submissions, orient='index', columns=submission_columns)

lgbm_subs_df

Unnamed: 0,n_estimators,learning_rate,num_leaves,score
1,500,0.1,-1,0.98465
2,3000,0.08,-1,0.985016
3,5000,0.01,-1,0.987663


## Dataset reduction / optimization

Fit a `KNeighborsClassifier` on the training set, then look up the nearest training neighbors of every test sample and keep only those training rows. This filters the training set down to the samples that most resemble the test set.

In [4]:
from sklearn.neighbors import KNeighborsClassifier

def optimize_dataset(Xtrain, Ytrain, Xtest, n_neighbors=3, save_csv=False):
    """Keep only the training rows that are nearest neighbours of a test sample.

    Xtrain: Training features; rows are selected positionally with `.iloc`,
        so a pandas DataFrame is expected.
    Ytrain: Training labels aligned row-by-row with Xtrain (pandas Series or
        DataFrame).
    Xtest: Samples whose nearest training neighbours are looked up.
    n_neighbors: Number of training neighbours retrieved per test sample.
    save_csv: When True, also write the result to 'x_optim.csv' and
        'y_optim.csv' in the current directory.

    Returns the tuple (xtrain_opt, ytrain_opt) holding the retained rows.
    """
    knn = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        n_jobs=-1)

    print('Fitting KNNClassifier')
    knn.fit(Xtrain, np.ravel(Ytrain))

    # Find indices of similar points.
    # kneighbors returns (distances, indices); the indices are row positions in
    # Xtrain. They are flattened and de-duplicated because one training row can
    # be a neighbour of several test samples.
    neighbors_idx_list = np.unique(np.ravel(knn.kneighbors(Xtest)[1]))

    # Filter out with indices
    # Row-only .iloc on the labels works for both Series and DataFrame.
    xtrain_opt = Xtrain.iloc[neighbors_idx_list, :]
    ytrain_opt = Ytrain.iloc[neighbors_idx_list]

    print("Now we have ", xtrain_opt.shape[0], " lines for our training set.")

    if save_csv:
        print("Writing CSV files : x_optim.csv / y_optim.csv")
        xtrain_opt.to_csv('x_optim.csv')
        ytrain_opt.to_csv('y_optim.csv')

    return xtrain_opt, ytrain_opt