In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from math import factorial
import gc
import sys

### Data Loading

In [2]:
# Importing the dataset
train_df = pd.read_csv('train.csv', index_col='row_id')
test_df = pd.read_csv('test.csv', index_col='row_id')

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
print("Train Data: ")
train_df = reduce_mem_usage(train_df)
print("Test Data: ")
test_df = reduce_mem_usage(test_df)

Train Data: 
Mem. usage decreased to 221.25 Mb (49.7% reduction)
Test Data: 
Mem. usage decreased to 109.86 Mb (49.8% reduction)


In [5]:
# Label encoding of categorical features
label = LabelEncoder()
y = train_df['target']
y_label = label.fit_transform(y)

features = train_df.columns.drop('target')

#### GCD and Interger implement

In [7]:
def bias_of(s):
    w = int(s[1:s.index('T')])
    x = int(s[s.index('T')+1:s.index('G')])
    y = int(s[s.index('G')+1:s.index('C')])
    z = int(s[s.index('C')+1:])
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)

train_int = pd.DataFrame({col: ((train_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in features})
test_int = pd.DataFrame({col: ((test_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in features})

In [76]:
train_int

Unnamed: 0_level_0,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,A0T0G9C1,...,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target,res
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,10000
1,0,0,0,1000,0,1000,0,0,0,0,...,0,1000,1000,0,0,0,0,0,6,1000
2,0,8,50,243,468,510,443,239,44,3,...,91,167,192,115,20,18,29,2,6,1
3,1,4,55,359,692,762,596,311,40,3,...,143,266,288,196,31,25,56,0,6,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,10000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,10,40,290,550,530,400,320,20,0,...,100,190,230,70,10,70,30,0,6,10
199996,0,0,0,0,0,0,0,0,0,0,...,0,1000,1000,0,0,0,0,0,9,1000
199997,1,11,38,83,181,203,163,99,38,2,...,174,196,299,137,26,45,31,1,8,1
199998,0,0,0,0,0,0,0,0,0,0,...,0,2000,0,0,0,0,0,0,7,1000


In [53]:
def gcd_of_all(df_i, elements):
    gcd = df_i[elements[0]]
    for col in elements[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd

train_int['target'] = y_label
train_int['res'] = gcd_of_all(train_int, features)
test_int['res'] = gcd_of_all(test_int, features)
# convert float labels to int to make things easier
test_int = test_int.astype({'target': 'int32'})

In [65]:
res = [1, 10, 1000, 10000]
nFolds = 5

etc_params = {
        'n_estimators': 300,
        'n_jobs': -1,
        'bootstrap': False,
        'verbose': 0
        }

#sca = StandardScaler()

y_pred = pd.DataFrame()
acc_avg = 0
n_valid = 0
for res_i in res:
    print(f"\nResolution = {res_i}")
    X_train = train_int.loc[train_int['res'] == res_i][features]
    y_train = train_int.loc[train_int['res'] == res_i]['target']
    X_test = test_int.loc[test_int['res'] == res_i][features]

    y_preds = []
    y_probs = []
    perf = []
    cv = KFold(n_splits=nFolds, shuffle=True, random_state=42)
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_train)):         
        X_train_cv = X_train.iloc[train_idx] 
        y_train_cv = y_train.iloc[train_idx]  
        X_valid = X_train.iloc[valid_idx]
        y_valid = y_train.iloc[valid_idx]
        
        # train
        clf = ExtraTreesClassifier(**etc_params)    
        clf.fit(X_train_cv, y_train_cv.values.ravel())

        # predict
        y_pred_val = clf.predict(X_valid)
        acc = accuracy_score(y_valid.values.ravel(),  y_pred_val)
        perf.append(acc)
        
        y_preds.append(clf.predict(X_test))
        y_probs.append(clf.predict_proba(X_test))
        print(f"CV - FOLD {fold+1} | Samples train: {len(train_idx)} | Samples validation: {len(valid_idx)} | acc = {acc:.4f}")

            
    print(f">>> Average across folds for res = {res_i} : acc = {np.mean(perf):.2f}")
    acc_avg += np.mean(perf)*len(valid_idx)
    n_valid += len(valid_idx)
    
    # Majority vote
    y_pred_res = mode(y_preds).mode[0]
    test_ind_res = test_int[test_int['res'] == res_i].index
    y_pred = y_pred.append(pd.DataFrame(label.inverse_transform(y_pred_res), index=test_ind_res, columns=['target']))
    test_int.loc[test_ind_res, 'target'] = y_pred_res.astype(int)
    y_pred.loc[test_ind_res]['res'] = res_i
print(f"\n>>> Weighted avg across folds and resolutions: {acc_avg/n_valid:.4f}")
sub = y_pred.copy()


Resolution = 1
CV - FOLD 1 | Samples train: 39975 | Samples validation: 9994 | acc = 1.0000
CV - FOLD 2 | Samples train: 39975 | Samples validation: 9994 | acc = 1.0000
CV - FOLD 3 | Samples train: 39975 | Samples validation: 9994 | acc = 1.0000
CV - FOLD 4 | Samples train: 39975 | Samples validation: 9994 | acc = 1.0000
CV - FOLD 5 | Samples train: 39976 | Samples validation: 9993 | acc = 1.0000
>>> Average across folds for res = 1 : acc = 1.00

Resolution = 10
CV - FOLD 1 | Samples train: 40001 | Samples validation: 10001 | acc = 1.0000
CV - FOLD 2 | Samples train: 40001 | Samples validation: 10001 | acc = 1.0000
CV - FOLD 3 | Samples train: 40002 | Samples validation: 10000 | acc = 1.0000
CV - FOLD 4 | Samples train: 40002 | Samples validation: 10000 | acc = 1.0000
CV - FOLD 5 | Samples train: 40002 | Samples validation: 10000 | acc = 1.0000
>>> Average across folds for res = 10 : acc = 1.00

Resolution = 1000
CV - FOLD 1 | Samples train: 40046 | Samples validation: 10012 | acc = 0

In [54]:
test_high = test_int[(test_int['res']==1) | (test_int['res']==10)]
test_low = test_int[(test_int['res']==1000) | (test_int['res']==10000)]

In [55]:
print(f"Total number of duplicated samples in high res test: {test_high.duplicated().sum()} out of {test_high.shape[0]} ({test_high.duplicated().sum()/test_high.shape[0]*100:.2f}%)")
print(f"Total number of duplicated samples in low res test: {test_low.duplicated().sum()} out of {test_low.shape[0]} ({test_low.duplicated().sum()/test_low.shape[0]*100:.2f}%)")

Total number of duplicated samples in high res test: 1533 out of 50159 (3.06%)
Total number of duplicated samples in low res test: 25246 out of 49841 (50.65%)


In [58]:
train_ext = pd.concat([train_int, test_high.drop_duplicates()])
features = list(features)
features.append('res')
print(f"Nb samples in train set extended: {len(train_ext)} ({(len(train_ext)-len(train_int))/len(train_ext)*100:.2f}% increase)")

Nb samples in train set extended: 248626 (19.56% increase)


In [60]:
X_train = train_ext[features].copy()
y_train = train_ext['target'].copy()
X_test = test_low[features].copy()
index_test_low = test_low.index.copy()

X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Mem. usage decreased to 201.07 Mb (26.6% reduction)
Mem. usage decreased to 40.16 Mb (26.9% reduction)


In [61]:
#credits: https://www.kaggle.com/alexandreayari/tps-02-22-extratrees-gcd-memory-opti
nFolds = 10 

SEED = 2022  
N_ESTIMATORS = 2000
MAX_DEPTH = 3691
MIN_SAMPLES_SPLIT = 3
MIN_SAMPLES_LEAF = 1
CRITERION  = 'gini'
VERBOSE = 0

#sca = StandardScaler()

y_preds_ext = []
y_probs_ext = []
perf = []
acc_avg = 0
n_valid = 0
cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=2022)
print("Starting training...")
for fold, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_train)):         
    X_train_cv = X_train.iloc[train_idx] 
    y_train_cv = y_train.iloc[train_idx]  
    X_valid = X_train.iloc[valid_idx]
    y_valid = y_train.iloc[valid_idx]
        
    #X_train_cv = sca.fit_transform(X_train_cv)
    #X_valid = sca.transform(X_valid)

    etc_params = {
        'n_estimators': N_ESTIMATORS,
        'max_depth': MAX_DEPTH,
        'min_samples_split': MIN_SAMPLES_SPLIT,
        'min_samples_leaf': MIN_SAMPLES_LEAF,
        'criterion': CRITERION,
        'bootstrap': False,
        'n_jobs': -1,
        'random_state': SEED + fold + 1,
        'verbose': VERBOSE,
    }

    
    # train
    clf = ExtraTreesClassifier(**etc_params)    
    clf.fit(X_train_cv, y_train_cv.values.ravel())

    # predict
    y_pred_val = clf.predict(X_valid)
    acc = accuracy_score(y_valid.values.ravel(),  y_pred_val)
    perf.append(acc)
        
    y_preds_ext.append(clf.predict(X_test))
    y_probs_ext.append(clf.predict_proba(X_test))
    print(f"CV - FOLD {fold} | acc = {acc:.4f}")

print(f">>> Average across folds: {np.mean(perf):.4f}")

Starting training...
CV - FOLD 0 | acc = 0.9965
CV - FOLD 1 | acc = 0.9978
CV - FOLD 2 | acc = 0.9972
CV - FOLD 3 | acc = 0.9969
CV - FOLD 4 | acc = 0.9969
CV - FOLD 5 | acc = 0.9972
CV - FOLD 6 | acc = 0.9972
CV - FOLD 7 | acc = 0.9975
CV - FOLD 8 | acc = 0.9971
CV - FOLD 9 | acc = 0.9970
>>> Average across folds: 0.9971


In [75]:
y_preds_ext

[array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0]),
 array([6, 7, 6, ..., 0, 0, 0])]

In [68]:
# credits: https://www.kaggle.com/max1mum/extra-trees-cv-voting
mean_prob = sum(y_probs_ext) / len(y_probs_ext)

# The distribution of bacteria types
target_dist = pd.Series(y_train).value_counts().sort_index() / len(y_train) * 100

# Finds the difference in percent between the normal and tuned target distributions
def get_diff(deltas, distribution):
    tuned_predictions = pd.Series(np.argmax(mean_prob + deltas, axis=1))
    return distribution - tuned_predictions.value_counts().sort_index() / len(X_test) * 100

# The list of probability deltas to match distributions
deltas = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

diff = get_diff(deltas, target_dist)
print("Mean difference before tuning:", diff.abs().mean(), "%")

# Finding the optimal probability deltas
for i in range(1000):
    diff_max_id = np.argmax(diff.abs())

    if diff[diff_max_id] > 0.1:
        deltas[diff_max_id] += 0.001
    elif diff[diff_max_id] < -0.1:
        deltas[diff_max_id] -= 0.001
    else:
        break
    diff = get_diff(deltas, target_dist)

print("Mean difference after tuning:", diff.abs().mean(), "%")
mean_prob += deltas

Mean difference before tuning: 10.007627152663481 %
Mean difference after tuning: 10.007627152663481 %


In [69]:
y_pred_tuned = label.inverse_transform(np.argmax(mean_prob, axis=1))

In [70]:
len(y_pred_tuned)

49841

In [71]:
sub.loc[index_test_low, 'target'] = y_pred_tuned
sub.sort_index(inplace=True)

In [72]:
assert(sub.index.duplicated(keep='first').any() == False)

In [74]:
sub.to_csv('submission.csv')