In [1]:
#adapted from https://www.kaggle.com/myltykritik/simple-lgbm-image-features
#Reference: https://www.kaggle.com/kgeorge/yolo-v3-object-detection-for-petfinder/

import json

import scipy as sp
import pandas as pd
import numpy as np

from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from collections import Counter

import lightgbm as lgb
np.random.seed(369)

from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm, tqdm_notebook

In [2]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']
    
def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

### Image Features ###

In [3]:
import cv2
import pandas as pd
import numpy as np
import os
from tqdm import tqdm, tqdm_notebook
from keras.applications.densenet import preprocess_input, DenseNet121
import tensorflow as tf

train_df = pd.read_csv('input/train/train.csv')
img_size = 256
batch_size = 16

train_detect = pd.read_csv('input/train/train_detections.csv')

train_detect = train_detect.set_index("name", drop = False)
print(train_detect.loc["0008c5398-1", "x1"])

train_detect.loc[:, 'name'].replace(regex=True, to_replace="-1", value="")

#pet_ids = train_detect['name'].values

Using TensorFlow backend.


24


name
0008c5398-1    0008c5398
000a290e4-1    000a290e4
000fb9572-1    000fb9572
0011d7c25-1    0011d7c25
00156db4a-1    00156db4a
001a1aaad-1    001a1aaad
001b1507c-1    001b1507c
002230dea-1    002230dea
002278114-1    002278114
0025a8313-1    0025a8313
0038234c6-1    0038234c6
0038c9343-1    0038c9343
003dd2e26-1    003dd2e26
0045ed62a-1    0045ed62a
004709939-1    004709939
004a26127-1    004a26127
004c2f355-1    004c2f355
0052dcf47-1    0052dcf47
00553ae55-1    00553ae55
0058586f1-1    0058586f1
005afe792-1    005afe792
005bb92d8-1    005bb92d8
0063bd7e0-1    0063bd7e0
0063f83c9-1    0063f83c9
00648f96f-1    00648f96f
006610fe3-1    006610fe3
006d301e9-1    006d301e9
006ffebaf-1    006ffebaf
00709d75b-1    00709d75b
0073c33d0-1    0073c33d0
                 ...    
ff8e7c016-1    ff8e7c016
ff96988fc-1    ff96988fc
ff9ce365b-1    ff9ce365b
ff9d8cb25-1    ff9d8cb25
ff9f62e79-1    ff9f62e79
ffa5c6c35-1    ffa5c6c35
ffaa73202-1    ffaa73202
ffab93d18-1    ffab93d18
ffb315803-1    ffb31

In [4]:
def crop1(path, graypath, graypathcropped, valid_types):
    import sys, os
    i = 0
    img_preview = -1
    for f in tqdm_notebook(os.listdir(path), desc='cropping progress'):
        i += 1
        ext = os.path.splitext(f)[1]
        
        if ext.lower() not in valid_types:
            return
        
        n = f.split("-")[0]
        #print(n)
        if not os.path.isfile(graypath + n + "/" + f):
            image = cv2.imread(path+"/"+f) #pass in 0 as second parameter to automatically convert to grayscale
            cropped = image
            
            if image is not None:
                #try:
                    x1 = train_detect.loc[ n + "-1", "x1" ]
                    x2 = train_detect.loc[ n + "-1", "x2" ]
                    y1 = train_detect.loc[ n + "-1", "y1" ]
                    y2 = train_detect.loc[ n + "-1", "y2" ]
                    #print(y1, y2, x1,x2)
                    cropped = image[y1:y2, x1:x2]
                
                    if i <= img_preview:
                        cv2.imshow('cropped_image',cropped) 
                        cv2.waitKey(0)                 # Waits forever for user to press any key
                        cv2.destroyAllWindows()        # Closes displayed windows

                    if not os.path.isfile(graypath + "/" + f):
                        cv2.imwrite(graypath + "/" + f, cropped)
                        #print("\rCreated File: " + graypath + "/" + f, end='')
                #except:
                 #   pass
    return

def crop2(path, graypath, graypathcropped, valid_types):
    import sys, os
    i = 0
    img_preview = -1
    for f in tqdm_notebook(os.listdir(path), desc='grayscale cropping progress'):
        i += 1
        ext = os.path.splitext(f)[1]
        
        if ext.lower() not in valid_types:
            return
        
        n = f.split("-")[0]
        
        if not os.path.isfile(graypathcropped + n + "/" + f):
            image = cv2.imread(path+"/"+f, 0) #pass in 0 as second parameter to automatically convert to grayscale
            cropped = image
            
            if image is not None:
                try:
                    x1 = train_detect.loc[n + "-1", "x1"]
                    x2 = train_detect.loc[n + "-1", "x2"]
                    y1 = train_detect.loc[n + "-1", "y1"]
                    y2 = train_detect.loc[n + "-1", "y2"]
                    
                    cropped = image[y1:y2, x1:x2]
                
                    if i <= img_preview:
                        cv2.imshow('cropped_image',cropped) 
                        cv2.waitKey(0)                 # Waits forever for user to press any key
                        cv2.destroyAllWindows()        # Closes displayed windows

                    if not os.path.isfile(graypathcropped + "/" + f):
                        cv2.imwrite(graypathcropped + "/" + f, cropped)
                        #print("\rCreated File: " + graypathcropped + "/" + f, end='')
                except:
                    pass
    return

def crop():
    import sys, os
    
    path = "input/train/train_images/"
    graypath = "input/train/cropped_images/"
    graypathcropped = "input/train/grayscale_cropped_images/"
    
    if not os.path.exists("input"):
        os.mkdir("input")
    if not os.path.exists(path):
        os.mkdir(path)
    if not os.path.exists(graypath):
        os.mkdir(graypath)
    if not os.path.exists(graypathcropped):
        os.mkdir(graypathcropped)
    
    valid_types = [".jpg",".gif",".png",".tga"]
    #for f in tqdm_notebook(os.listdir(path), desc='Directory Labelling Progress'):
    #    ext = os.path.splitext(f)[1]
    #    if ext.lower() not in valid_types:
    #        return
    #    n = f.split("-")[0]
    #    if not os.path.exists(graypath + n + "/"):
    #        os.mkdir(graypath + n + "/")
    #        print("\rCreated Directory: " + graypath + n + "/", end='')
    useTF = False
    if useTF:
        import tensorflow as tf
        from keras import backend as K

        #https://regressionsessionsblog.wordpress.com/2018/06/11/stuck-on-an-issue-making-keras-predictions-in-parallel/
        #https://github.com/keras-team/keras/issues/4740
        jobs = 6
        config = tf.ConfigProto(intra_op_parallelism_threads=jobs, \
                            inter_op_parallelism_threads=jobs, \
                            allow_soft_placement=True, \
                            device_count = {'CPU': jobs})
        sess = tf.Session(config=config)
        K.set_session(sess)
    
        with tf.Session() as sess:
            sess.run(tf.no_op())
            crop1(path, graypath, graypathcropped, valid_types)
            crop2(path, graypath, graypathcropped, valid_types)
            sess.close()
    else:
        crop1(path, graypath, graypathcropped, valid_types)
        crop2(path, graypath, graypathcropped, valid_types)
    return


In [5]:
print("skip cropping")
#crop()
print("done")

skip cropping
done


In [6]:
pet_ids = train_df['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

In [7]:
def resize_to_square(im):
    old_size = im.shape[:2] # old_size is in (height, width) format
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    # new_size should be in (width, height) format
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

In [8]:
features = {}

def train_keras():
    from keras.models import Model
    from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
    from keras import backend as K
    import tensorflow as tf

    #https://regressionsessionsblog.wordpress.com/2018/06/11/stuck-on-an-issue-making-keras-predictions-in-parallel/
    #https://github.com/keras-team/keras/issues/4740
    jobs = 6
    config = tf.ConfigProto(intra_op_parallelism_threads=jobs, \
                            inter_op_parallelism_threads=jobs, \
                            allow_soft_placement=True, \
                            device_count = {'CPU': jobs})
    session = tf.Session(config=config)
    K.set_session(session)

    print("start")
    inp = Input((img_size,img_size,3))
    backbone = DenseNet121(input_tensor = inp, 
                            weights="input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                            include_top = False)
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
    x = AveragePooling1D(4)(x)
    out = Lambda(lambda x: x[:,:,0])(x)

    m = Model(inp,out)
    print("done")

    

    print("start keras prediction")

    for b in tqdm_notebook(range(n_batches)):
        start = b*batch_size
        end = (b+1)*batch_size
        batch_pets = pet_ids[start:end]
        batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
        for i,pet_id in enumerate(batch_pets):
            try:
                batch_images[i] = load_image("input/train/cropped_images/", pet_id)
            except:
                pass
        batch_preds = m.predict(batch_images)
        for i,pet_id in enumerate(batch_pets):
            features[pet_id] = batch_preds[i]
    print("done keras prediction")
    return m

In [9]:
from keras.models import Model
#m = train_keras("input/train/cropped_images/")
m = train_keras()

start
done
start keras prediction


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))


done keras prediction


In [10]:
def save_json(features, filename="train_features"):
    with open(filename + ".json", 'w') as file:
        json.dump(features,file)
        #json.dump(dictionary, file, sort_keys=True, indent=4)
    return
def load_json(filename="file", dictionary=features):
    with open(filename + ".json", 'r') as file:
        loaded_file = json.load(file)
    return loaded_file

class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
            np.int16, np.int32, np.int64, np.uint8,
            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, 
            np.float64)):
            return float(obj)
        elif isinstance(obj,(np.ndarray,)): #### This is the fix
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

dumped = json.dumps(features, cls=NumpyEncoder)

with open("train_features.json", 'w') as f:
    json.dump(dumped, f)
    print("Saved train features json to disk")

Saved train features json to disk


In [11]:

#print(m.summary())

m.save_weights("keras_model_weights.h5")
print("Saved model weights to disk")
#with open("train_features.json", 'r') as f:
#    train_loaded_features = json.load(f)
#    train_loaded_features = json.loads(train_loaded_features)
#    print("loaded train features")
    

Saved model weights to disk


In [13]:
train_feats = pd.DataFrame.from_dict(features, orient='index')
train_feats.columns = ['pic_'+str(i) for i in range(train_feats.shape[1])]
test_df = pd.read_csv('input/test/test.csv')

pet_ids = test_df['PetID'].values
n_batches = len(pet_ids) // batch_size + 1
test_features = {}

def train_test_keras(m, n_batches):
    from keras.models import Model
    from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
    from keras import backend as K
    import tensorflow as tf

    #https://regressionsessionsblog.wordpress.com/2018/06/11/stuck-on-an-issue-making-keras-predictions-in-parallel/
    #https://github.com/keras-team/keras/issues/4740
    jobs = 6
    config = tf.ConfigProto(intra_op_parallelism_threads=jobs, \
                            inter_op_parallelism_threads=jobs, \
                            allow_soft_placement=True, \
                            device_count = {'CPU': jobs})
    session = tf.Session(config=config)
    K.set_session(session)

    print("start create model")
    inp = Input((img_size,img_size,3))
    backbone = DenseNet121(input_tensor = inp, 
                            weights="input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                            include_top = False)
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
    x = AveragePooling1D(4)(x)
    out = Lambda(lambda x: x[:,:,0])(x)

    m = Model(inp,out)
    print("done")
    
    print("start keras test prediction")

    for b in tqdm_notebook(range(n_batches)):
        start = b*batch_size
        end = (b+1)*batch_size
        batch_pets = pet_ids[start:end]
        batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
        for i,pet_id in enumerate(batch_pets):
            try:
                batch_images[i] = load_image("input/test/test_images/", pet_id)
            except:
                pass
        batch_preds = m.predict(batch_images)
        for i,pet_id in enumerate(batch_pets):
            test_features[pet_id] = batch_preds[i]
    print("done keras test prediction")
    return

train_test_keras(m, n_batches)

dumped = json.dumps(test_features, cls=NumpyEncoder)

with open("test_features.json", 'w') as f:
    json.dump(dumped, f)
    print("Saved test features json to disk")

start create model
done
start keras test prediction


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))


done keras test prediction
Saved test features json to disk


In [14]:
test_feats = pd.DataFrame.from_dict(features, orient='index')
test_feats.columns = ['pic_'+str(i) for i in range(test_feats.shape[1])]
test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

test_feats.head()

test_feats.to_csv(r'csv_out/img_test_feats_prediction_values.csv')
print("saved image test features prediction values to CSV")

saved image test features prediction values to CSV


### Start Here after image prediction files have been made ### 

this uses train and test.csv files along with the test_features and train_features json files, from which you will be able to run the remainder of the notebook

In [15]:
train = pd.read_csv("input/train/train.csv")
test = pd.read_csv("input/test/test.csv")

target = train['AdoptionSpeed']
train_id = train['PetID']
test_id = test['PetID']

loaded_train_feats = {}
loaded_test_feats = {}
with open("train_features.json", 'r') as f:
    loaded_train_feats = json.load(f)
    loaded_train_feats = json.loads(loaded_train_feats)
    print("loaded train features")

with open("test_features.json", 'r') as f:
    loaded_test_feats = json.load(f)
    loaded_test_feats = json.loads(loaded_test_feats)
    print("loaded test features") 

print("creating dataframes from feature dictionaries")
train_feats = pd.DataFrame.from_dict(loaded_train_feats, orient='index')
train_feats.columns = ['pic_'+str(i) for i in range(train_feats.shape[1])]

test_feats = pd.DataFrame.from_dict(loaded_test_feats, orient='index')
test_feats.columns = ['pic_'+str(i) for i in range(test_feats.shape[1])]

train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)
test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

print("joining dataframes")
train = train.join(train_feats.set_index('PetID'),on='PetID')
test = test.join(test_feats.set_index('PetID'),on='PetID')
#train = pd.merge(train, train_feats, on = ['PetID'], how = 'left')
#test = pd.merge(test, test_feats, left_on = ['PetID'], right_on = ['PetID'], how = 'outer')

train.drop(['AdoptionSpeed', 'PetID'], axis=1, inplace=True)
test.drop(['PetID'], axis=1, inplace=True)

#prepared = train
#prepared.drop(['RescuerID', 'Name', 'Description', 'Vaccinated', 
#               'Color3', 'Dewormed', 'Health', 'VideoAmt', 'Type'], axis=1, inplace=True)

#train.drop(['RescuerID', 'Name', 'Description', 'Vaccinated', 
#               'Color3', 'Dewormed', 'Health', 'VideoAmt', 'Type'], axis=1, inplace=True)
#test.drop(['RescuerID', 'Name', 'Description', 'Vaccinated', 
#               'Color3', 'Dewormed', 'Health', 'VideoAmt', 'Type'], axis=1, inplace=True)

#prepared.to_csv(r'csv_out/cleaned_img_merge.csv')
#print("saved prepared csv")
train.to_csv(r'csv_out/train_csv_img_merge.csv')
print("saved train csv image merge")
test.to_csv(r'csv_out/test_csv_img_merge.csv')
print("saved test csv image merge")
print("done")

loaded train features
loaded test features
creating dataframes from feature dictionaries
joining dataframes
saved train csv image merge
saved test csv image merge
done


In [16]:
doc_sent_mag = []
doc_sent_score = []
nf_count = 0
print("loading sentiment train")
for petid in train_id:
    try:
        with open('input/train/train_sentiment/' + petid + '.json', 'r') as f:
            sentiment = json.load(f)
        doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
        doc_sent_score.append(sentiment['documentSentiment']['score'])
    except  Exception:
        nf_count += 1
        doc_sent_mag.append(-1)
        doc_sent_score.append(-1)
        pass

train.loc[:, 'doc_sent_mag'] = doc_sent_mag
train.loc[:, 'doc_sent_score'] = doc_sent_score

doc_sent_mag = []
doc_sent_score = []
nf_count = 0
print("loading sentiment test")
for petid in test_id:
    try:
        with open('input/test/test_sentiment/' + petid + '.json', 'r') as f:
            sentiment = json.load(f)
        doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
        doc_sent_score.append(sentiment['documentSentiment']['score'])
    except Exception:
        nf_count += 1
        doc_sent_mag.append(-1)
        doc_sent_score.append(-1)
        pass

test.loc[:, 'doc_sent_mag'] = doc_sent_mag
test.loc[:, 'doc_sent_score'] = doc_sent_score
print("done")

loading sentiment train
loading sentiment test
done


In [17]:
print("scikit training sentiment descriptions")
## WITHOUT ERROR FIXED
train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
    
# Fit TFIDF
tfv.fit(list(train_desc))
X =  tfv.transform(train_desc)
X_test = tfv.transform(test_desc)
print("X (tfidf):", X.shape)

svd = TruncatedSVD(n_components=200)
svd.fit(X)
# print(svd.explained_variance_ratio_.sum())
# print(svd.explained_variance_ratio_)
X = svd.transform(X)
print("X (svd):", X.shape)

X = pd.DataFrame(X, columns=['svd_{}'.format(i) for i in range(200)])
train = pd.concat((train, X), axis=1)
X_test = svd.transform(X_test)
X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(200)])
test = pd.concat((test, X_test), axis=1)

print("train:", train.shape)
print("done")

scikit training sentiment descriptions
X (tfidf): (14720, 10000)
X (svd): (14720, 200)
train: (14720, 480)
done


In [18]:
vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
print("loading metadata train")
for petid in train_id:
    try:
        with open('input/train/train_metadata/' + petid + '-1.json', 'r') as f:
            data = json.load(f)
            vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
            vertex_xs.append(vertex_x)
            vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
            vertex_ys.append(vertex_y)
            bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
            bounding_confidences.append(bounding_confidence)
            bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
            bounding_importance_fracs.append(bounding_importance_frac)
            dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
            dominant_blues.append(dominant_blue)
            dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
            dominant_greens.append(dominant_green)
            dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
            dominant_reds.append(dominant_red)
            dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
            dominant_pixel_fracs.append(dominant_pixel_frac)
            dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
            dominant_scores.append(dominant_score)
            if data.get('labelAnnotations'):
                label_description = data['labelAnnotations'][0]['description']
                label_descriptions.append(label_description)
                label_score = data['labelAnnotations'][0]['score']
                label_scores.append(label_score)
            else:
                nl_count += 1
                label_descriptions.append('nothing')
                label_scores.append(-1)
    except Exception:
        nf_count += 1
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)
        pass

print(nf_count)
print(nl_count)
train.loc[:, 'vertex_x'] = vertex_xs
train.loc[:, 'vertex_y'] = vertex_ys
train.loc[:, 'bounding_confidence'] = bounding_confidences
train.loc[:, 'bounding_importance'] = bounding_importance_fracs
train.loc[:, 'dominant_blue'] = dominant_blues
train.loc[:, 'dominant_green'] = dominant_greens
train.loc[:, 'dominant_red'] = dominant_reds
train.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
train.loc[:, 'dominant_score'] = dominant_scores
train.loc[:, 'label_description'] = label_descriptions
train.loc[:, 'label_score'] = label_scores


vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
print("loading metadata test")
for petid in test_id:
    try:
        with open('input/test/test_metadata/' + petid + '-1.json', 'r') as f:
            data = json.load(f)
            vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
            vertex_xs.append(vertex_x)
            vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
            vertex_ys.append(vertex_y)
            bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
            bounding_confidences.append(bounding_confidence)
            bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
            bounding_importance_fracs.append(bounding_importance_frac)
            dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
            dominant_blues.append(dominant_blue)
            dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
            dominant_greens.append(dominant_green)
            dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
            dominant_reds.append(dominant_red)
            dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
            dominant_pixel_fracs.append(dominant_pixel_frac)
            dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
            dominant_scores.append(dominant_score)
            if data.get('labelAnnotations'):
                label_description = data['labelAnnotations'][0]['description']
                label_descriptions.append(label_description)
                label_score = data['labelAnnotations'][0]['score']
                label_scores.append(label_score)
            else:
                nl_count += 1
                label_descriptions.append('nothing')
                label_scores.append(-1)
    except Exception:
        nf_count += 1
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)
        pass

print(nf_count)
test.loc[:, 'vertex_x'] = vertex_xs
test.loc[:, 'vertex_y'] = vertex_ys
test.loc[:, 'bounding_confidence'] = bounding_confidences
test.loc[:, 'bounding_importance'] = bounding_importance_fracs
test.loc[:, 'dominant_blue'] = dominant_blues
test.loc[:, 'dominant_green'] = dominant_greens
test.loc[:, 'dominant_red'] = dominant_reds
test.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
test.loc[:, 'dominant_score'] = dominant_scores
test.loc[:, 'label_description'] = label_descriptions
test.loc[:, 'label_score'] = label_scores
print("done")

loading metadata train
363
2
loading metadata test
137
done


In [19]:
train.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)
test.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)

train.drop(['dominant_green', 'doc_sent_score', 'FurLength', 'Vaccinated', 
               'Color3', 'Dewormed', 'Health', 'VideoAmt', 'Type',
           'bounding_importance', 'bounding_confidence', 'pic_79',
           'pic_252', 'pic_109', 'pic_197', 'pic_17', 'pic_104', 'pic_59', 'label_description'], axis=1, inplace=True)
test.drop(['dominant_green', 'doc_sent_score', 'FurLength', 'Vaccinated', 
               'Color3', 'Dewormed', 'Health', 'VideoAmt', 'Type',
           'bounding_importance', 'bounding_confidence', 'pic_79',
           'pic_252', 'pic_109', 'pic_197', 'pic_17', 'pic_104', 'pic_59', 'label_description'], axis=1, inplace=True)

In [20]:
numeric_cols = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'AdoptionSpeed', 
                'doc_sent_mag', 'doc_sent_score', 'dominant_score', 'dominant_pixel_frac', 
                'dominant_red', 'dominant_green', 'dominant_blue', 'bounding_importance', 
                'bounding_confidence', 'vertex_x', 'vertex_y', 'label_score'] +\
               [col for col in train.columns if col.startswith('pic') or col.startswith('svd')]
cat_cols = list(set(train.columns) - set(numeric_cols))
train.loc[:, cat_cols] = train[cat_cols].astype('category')
test.loc[:, cat_cols] = test[cat_cols].astype('category')
print(train.shape)
print(test.shape)

#print(test.head())
drop = "_drop100"
train.to_csv(r'csv_out/train_ims_merged' + drop + '.csv')
print("saved train csv image, metadata, sentiment merged")
test.to_csv(r'csv_out/test_ims_merged' + drop +'.csv')
print("saved test csv image, metadata, sentiment merged")
print("getting categorical features")
# get the categorical features
foo = train.dtypes
cat_feature_names = foo[foo == "category"]
cat_features = [train.columns.get_loc(c) for c in train.columns if c in cat_feature_names]
print("done")

(14720, 469)
(3948, 469)
saved train csv image, metadata, sentiment merged
saved test csv image, metadata, sentiment merged
getting categorical features
done


In [21]:
N_SPLITS = 5
def run_cv_model(train, test, target, model_fn, params={}, eval_fn=None, label='model'):
    kf = StratifiedKFold(n_splits=N_SPLITS, random_state=42, shuffle=True)
    fold_splits = kf.split(train, target)
    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0], N_SPLITS))
    all_coefficients = np.zeros((N_SPLITS, 4))
    feature_importance_df = pd.DataFrame()
    i = 1
    for dev_index, val_index in fold_splits:
        print('Started ' + label + ' fold ' + str(i) + '/' + str(N_SPLITS))
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(dev_X, dev_y, val_X, val_y, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        all_coefficients[i-1, :] = coefficients
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            qwk_scores.append(qwk)
            print(label + ' cv score {}: RMSE {} QWK {}'.format(i, cv_score, qwk))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = train.columns.values
        fold_importance_df['importance'] = importances
        fold_importance_df['fold'] = i
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)        
        i += 1
    print('{} cv RMSE scores : {}'.format(label, cv_scores))
    print('{} cv mean RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv QWK scores : {}'.format(label, qwk_scores))
    print('{} cv mean QWK score : {}'.format(label, np.mean(qwk_scores)))
    print('{} cv std QWK score : {}'.format(label, np.std(qwk_scores)))
    pred_full_test = pred_full_test / float(N_SPLITS)
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
                'cv': cv_scores, 'qwk': qwk_scores,
               'importance': feature_importance_df,
               'coefficients': all_coefficients}
    return results

params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 70,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.85,
          'feature_fraction': 0.8,
          'min_split_gain': 0.02,
          'min_child_samples': 150,
          'min_child_weight': 0.02,
          'lambda_l2': 0.0475,
          'verbosity': -1,
          'data_random_seed': 17,
          'early_stop': 600,
          'verbose_eval': 100,
          'num_rounds': 10000}

def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    print('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    print('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    early_stop = None
    if params.get('early_stop'):
        early_stop = params.pop('early_stop')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      categorical_feature=list(cat_features),
                      early_stopping_rounds=early_stop)
    
    print('Predict 1/2')
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    optR = OptimizedRounder()
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients)
    print("Valid Counts = ", Counter(test_y))
    print("Predicted Counts = ", Counter(pred_test_y_k))
    print("Coefficients = ", coefficients)
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
    print("QWK = ", qwk)
    print('Predict 2/2')
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance(), coefficients, qwk

print("Run LightGBM Train Model")
results = run_cv_model(train, test, target, runLGB, params, rmse, 'lgb')
print("done")

Run LightGBM Train Model
Started lgb fold 1/5
Prep LGB
Train LGB


New categorical_feature is [1, 2, 3, 4, 5, 6, 7, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 600 rounds.
[100]	training's rmse: 1.04084	valid_1's rmse: 1.09391
[200]	training's rmse: 0.965753	valid_1's rmse: 1.06837
[300]	training's rmse: 0.914011	valid_1's rmse: 1.05779
[400]	training's rmse: 0.87058	valid_1's rmse: 1.05167
[500]	training's rmse: 0.836749	valid_1's rmse: 1.04782
[600]	training's rmse: 0.807803	valid_1's rmse: 1.04604
[700]	training's rmse: 0.777991	valid_1's rmse: 1.04441
[800]	training's rmse: 0.749481	valid_1's rmse: 1.04385
[900]	training's rmse: 0.723822	valid_1's rmse: 1.04309
[1000]	training's rmse: 0.69892	valid_1's rmse: 1.04303
[1100]	training's rmse: 0.676189	valid_1's rmse: 1.04316
[1200]	training's rmse: 0.654646	valid_1's rmse: 1.04298
[1300]	training's rmse: 0.63375	valid_1's rmse: 1.04288
[1400]	training's rmse: 0.614807	valid_1's rmse: 1.04289
[1500]	training's rmse: 0.597068	valid_1's rmse: 1.04306
[1600]	training's rmse: 0.579241	valid_1's rmse: 1.0433
[1700]	training's rmse: 0.56177	valid_1

New categorical_feature is [1, 2, 3, 4, 5, 6, 7, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 600 rounds.
[100]	training's rmse: 1.03943	valid_1's rmse: 1.09915
[200]	training's rmse: 0.964991	valid_1's rmse: 1.07676
[300]	training's rmse: 0.912249	valid_1's rmse: 1.067
[400]	training's rmse: 0.869413	valid_1's rmse: 1.06285
[500]	training's rmse: 0.833839	valid_1's rmse: 1.06029
[600]	training's rmse: 0.803624	valid_1's rmse: 1.05955
[700]	training's rmse: 0.774878	valid_1's rmse: 1.05926
[800]	training's rmse: 0.747557	valid_1's rmse: 1.05874
[900]	training's rmse: 0.72294	valid_1's rmse: 1.05861
[1000]	training's rmse: 0.699911	valid_1's rmse: 1.05876
[1100]	training's rmse: 0.678402	valid_1's rmse: 1.05853
[1200]	training's rmse: 0.657311	valid_1's rmse: 1.05851
[1300]	training's rmse: 0.63607	valid_1's rmse: 1.05849
[1400]	training's rmse: 0.616601	valid_1's rmse: 1.05843
[1500]	training's rmse: 0.597772	valid_1's rmse: 1.05911
[1600]	training's rmse: 0.577968	valid_1's rmse: 1.0593
[1700]	training's rmse: 0.559231	valid_1

New categorical_feature is [1, 2, 3, 4, 5, 6, 7, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 600 rounds.
[100]	training's rmse: 1.04006	valid_1's rmse: 1.09996
[200]	training's rmse: 0.964893	valid_1's rmse: 1.07498
[300]	training's rmse: 0.910559	valid_1's rmse: 1.06355
[400]	training's rmse: 0.867815	valid_1's rmse: 1.05838
[500]	training's rmse: 0.830995	valid_1's rmse: 1.05519
[600]	training's rmse: 0.799863	valid_1's rmse: 1.05257
[700]	training's rmse: 0.770883	valid_1's rmse: 1.05121
[800]	training's rmse: 0.743874	valid_1's rmse: 1.04991
[900]	training's rmse: 0.71792	valid_1's rmse: 1.04935
[1000]	training's rmse: 0.692925	valid_1's rmse: 1.04861
[1100]	training's rmse: 0.669265	valid_1's rmse: 1.04832
[1200]	training's rmse: 0.647364	valid_1's rmse: 1.04837
[1300]	training's rmse: 0.625744	valid_1's rmse: 1.04842
[1400]	training's rmse: 0.604596	valid_1's rmse: 1.04849
[1500]	training's rmse: 0.585075	valid_1's rmse: 1.04859
[1600]	training's rmse: 0.565442	valid_1's rmse: 1.04881
[1700]	training's rmse: 0.546799	val

New categorical_feature is [1, 2, 3, 4, 5, 6, 7, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 600 rounds.
[100]	training's rmse: 1.03962	valid_1's rmse: 1.09666
[200]	training's rmse: 0.965072	valid_1's rmse: 1.06827
[300]	training's rmse: 0.911996	valid_1's rmse: 1.05432
[400]	training's rmse: 0.869446	valid_1's rmse: 1.04679
[500]	training's rmse: 0.834647	valid_1's rmse: 1.04306
[600]	training's rmse: 0.803773	valid_1's rmse: 1.04085
[700]	training's rmse: 0.776717	valid_1's rmse: 1.03956
[800]	training's rmse: 0.750434	valid_1's rmse: 1.0386
[900]	training's rmse: 0.72572	valid_1's rmse: 1.03804
[1000]	training's rmse: 0.703729	valid_1's rmse: 1.03752
[1100]	training's rmse: 0.682189	valid_1's rmse: 1.03704
[1200]	training's rmse: 0.660931	valid_1's rmse: 1.03639
[1300]	training's rmse: 0.641308	valid_1's rmse: 1.03692
[1400]	training's rmse: 0.621574	valid_1's rmse: 1.0366
[1500]	training's rmse: 0.602505	valid_1's rmse: 1.03659
[1600]	training's rmse: 0.584552	valid_1's rmse: 1.03628
[1700]	training's rmse: 0.567102	valid

New categorical_feature is [1, 2, 3, 4, 5, 6, 7, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 600 rounds.
[100]	training's rmse: 1.04108	valid_1's rmse: 1.0906
[200]	training's rmse: 0.965861	valid_1's rmse: 1.06338
[300]	training's rmse: 0.914826	valid_1's rmse: 1.05328
[400]	training's rmse: 0.872472	valid_1's rmse: 1.04747
[500]	training's rmse: 0.837458	valid_1's rmse: 1.04408
[600]	training's rmse: 0.80573	valid_1's rmse: 1.04234
[700]	training's rmse: 0.776965	valid_1's rmse: 1.04097
[800]	training's rmse: 0.748866	valid_1's rmse: 1.04028
[900]	training's rmse: 0.721613	valid_1's rmse: 1.03991
[1000]	training's rmse: 0.697298	valid_1's rmse: 1.03939
[1100]	training's rmse: 0.6749	valid_1's rmse: 1.03906
[1200]	training's rmse: 0.652837	valid_1's rmse: 1.03882
[1300]	training's rmse: 0.632131	valid_1's rmse: 1.03836
[1400]	training's rmse: 0.611143	valid_1's rmse: 1.03846
[1500]	training's rmse: 0.592863	valid_1's rmse: 1.0385
[1600]	training's rmse: 0.574033	valid_1's rmse: 1.03857
[1700]	training's rmse: 0.555655	valid_1

In [22]:
imports = results['importance'].groupby('feature')['feature', 'importance'].mean().reset_index()
imports.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1,Breed1,1266.8
0,Age,741.0
17,label_score,502.0
2,Breed2,436.0
468,vertex_y,415.6
10,State,392.2
9,Quantity,330.6
408,svd_46,274.6
306,svd_133,233.6
467,vertex_x,217.8


In [23]:
print("saving out feature importance list to CSV")
imports.to_csv("csv_out/features_importance.csv")
print("done")

saving out feature importance list to CSV
done


In [24]:
optR = OptimizedRounder()
coefficients_ = np.mean(results['coefficients'], axis=0)
print(coefficients_)
# manually adjust coefs
coefficients_[0] = 1.645
coefficients_[1] = 2.115
coefficients_[3] = 2.84
train_predictions = [r[0] for r in results['train']]
train_predictions = optR.predict(train_predictions, coefficients_).astype(int)
Counter(train_predictions)

[0.51074191 1.79537768 2.51294698 2.86880325]


Counter({3: 3089, 2: 4259, 4: 3954, 1: 2924, 0: 494})

In [25]:
optR = OptimizedRounder()
coefficients_ = np.mean(results['coefficients'], axis=0)
print(coefficients_)
# manually adjust coefs
coefficients_[0] = 1.645
coefficients_[1] = 2.115
coefficients_[3] = 2.84
test_predictions = [r[0] for r in results['test']]
test_predictions = optR.predict(test_predictions, coefficients_).astype(int)
Counter(test_predictions)

[0.51074191 1.79537768 2.51294698 2.86880325]


Counter({3: 897, 4: 1136, 2: 1125, 1: 724, 0: 66})

In [26]:
print("True Distribution:")
print(pd.value_counts(target, normalize=True).sort_index())
print("Test Predicted Distribution:")
print(pd.value_counts(test_predictions, normalize=True).sort_index())
print("Train Predicted Distribution:")
print(pd.value_counts(train_predictions, normalize=True).sort_index())

True Distribution:
0    0.026970
1    0.206386
2    0.268682
3    0.217527
4    0.280435
Name: AdoptionSpeed, dtype: float64
Test Predicted Distribution:
0    0.016717
1    0.183384
2    0.284954
3    0.227204
4    0.287741
dtype: float64
Train Predicted Distribution:
0    0.033560
1    0.198641
2    0.289334
3    0.209851
4    0.268614
dtype: float64


In [27]:
pd.DataFrame(sk_cmatrix(target, train_predictions), index=list(range(5)), columns=list(range(5)))

Unnamed: 0,0,1,2,3,4
0,58,114,110,64,51
1,229,1064,1006,463,276
2,132,957,1425,801,640
3,62,556,1039,832,713
4,13,233,679,929,2274


In [28]:
quadratic_weighted_kappa(target, train_predictions)
rmse(target, [r[0] for r in results['train']])
submission = pd.DataFrame({'PetID': test_id, 'AdoptionSpeed': test_predictions})
submission.head()

Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,3
1,73c10e136,4
2,72000c4c5,4
3,e147a4b9f,3
4,43fbba852,4


In [29]:
submission.to_csv('submissions/LightBGM with image features/submission.csv', index=False)