In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import sys,random
#def main(argv):

#Great snippet from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
        #    df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


alldata = reduce_mem_usage(pd.read_csv('train.csv',sep=';'))
alldata['is_test'] = np.random.choice([True,False],size=len(alldata))

columns = ['cash_in_out','display_type','scanner_code_reader','atm_id']
alldata.drop(columns, inplace=True, axis=1)


categorical_cols = [col for col in alldata.select_dtypes(include=["object"]).columns]

#Mean encoding of categorical variables
for col in categorical_cols:
    means = alldata.loc[~alldata.is_test, :].groupby(col)["target"].mean()
    alldata.loc[:, "%s_MEAN" % col] = alldata.loc[:, col].map(means)
    
    #Missing values is filled with global mean
    alldata.loc[:, "%s_MEAN" % col] = alldata.loc[:, "%s_MEAN" % col].fillna(means.mean())
    
    
    
alldata.loc[:, categorical_cols] = alldata.loc[:, categorical_cols].apply(lambda x: LabelEncoder().fit_transform(x.astype(str))) 

Memory usage of dataframe is 43.54 MB
Memory usage after optimization is: 10.65 MB
Decreased by 75.6%


In [11]:
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

cols_to_drop = ["target", "is_test"]
X_train = alldata.loc[~alldata.is_test, :].drop(cols_to_drop, axis=1)
y_train = alldata.loc[~alldata.is_test, "target"]

X_test = alldata.loc[alldata.is_test, :].drop(cols_to_drop, axis=1)

n_splits = 3
cv = StratifiedKFold(n_splits=n_splits, random_state=42)

oof_preds = np.zeros(X_train.shape[0])

sub = pd.read_csv("sample_submission.csv")
sub["target"] = 0
feature_importances = pd.DataFrame()

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    
    X_fit = X_train.iloc[fit_idx]
    y_fit = y_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]
    
    model = LGBMClassifier(
        max_depth=3,
        num_leaves=5 ** 2 - 1,
        learning_rate=0.003,
        n_estimators=3000,
        min_child_samples=80,
        subsample=0.8,
        colsample_bytree=1,
        reg_alpha=0,
        reg_lambda=0
        #,random_state=np.random.randint(10e6)
    )

    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        eval_names=('fit', 'val'),
        eval_metric='binary_logloss',
        early_stopping_rounds=150,
        verbose=False
    )
      
    oof_preds[val_idx] = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
    #sub['target'] += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:,1]
    
    fi = pd.DataFrame()
    fi["feature"] = X_train.columns
    fi["importance"] = model.feature_importances_
    fi["fold"] = (i+1)
    
    feature_importances = pd.concat([feature_importances, fi], axis=0)
    
    y_pred=oof_preds[val_idx]
    y_pred=y = [1 if i >=np.mean(y_pred) else 0 for i in y_pred]
    
    print("Fold {} F1: {:.8f}".format(i+1, f1_score(y_val, y_pred)))

y_pred=oof_preds
y_pred=y = [1 if i >=np.mean(y_pred) else 0 for i in y_pred]
print('F1 score %.8f' % f1_score(y_train, y_pred))   
    
#sub["TARGET"] /= n_splits
#sub.to_csv("lgbm.csv", index=None, float_format="%.8f")

Fold 1 F1: 0.60823654
Fold 2 F1: 0.64033264
Fold 3 F1: 0.61169102
F1 score 0.60473208
