In [1]:
import numpy as numpy
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
import joblib
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer

In [2]:
combined_train = pd.read_csv("combined_train.csv")
combined_test = pd.read_csv("combined_test.csv")
train_metadata = pd.read_csv("train/train_metadata.csv")
test_metadata = pd.read_csv("test/test_metadata.csv")

In [3]:
pixels = pd.read_csv("./image_flat/flattened_images.csv")
digits = pd.read_csv("./dig_feature/ocr_digit_detection.csv")
cnn = pd.read_csv("./CNN_feature/cnn_features.csv")

In [4]:
pixels_test = pd.read_csv("./image_flat/flattened_images_test.csv")
digits_test = pd.read_csv("./dig_feature/ocr_digit_detection_test.csv")
cnn_test = pd.read_csv("./CNN_feature/cnn_features_test.csv")

In [5]:
class_ids = train_metadata["ClassId"]
test_ids = test_metadata['id']

In [6]:
train_df = pd.concat([combined_train, pixels, digits], axis=1)
train_df = train_df.drop("image_path", axis=1)
test_df = pd.concat([combined_test, pixels_test, digits_test], axis=1)
test_df = test_df.drop("image_path", axis=1)

In [41]:
train_df

Unnamed: 0,hog_pca_0,hog_pca_1,hog_pca_2,hog_pca_3,hog_pca_4,hog_pca_5,hog_pca_6,hog_pca_7,hog_pca_8,hog_pca_9,...,0,1,2,3,4,5,6,7,8,9
0,-0.763458,0.927880,0.264329,-0.638673,0.831455,-0.527469,-0.959387,-0.538355,1.262615,-0.245512,...,0,0,0,0,0,0,0,0,0,0
1,1.049284,3.608200,-1.817190,0.904380,-0.282195,0.623239,-0.510430,-0.388628,-0.457727,-0.033302,...,0,0,0,0,0,0,0,0,0,0
2,-1.552440,-0.432374,-0.318422,0.671877,-0.381869,-0.224374,0.565182,0.267907,0.144193,-0.150904,...,0,0,0,0,0,0,0,0,0,0
3,-1.556871,0.214406,0.973758,0.613876,-0.657341,0.007032,0.140899,-0.159095,-0.187679,-0.142929,...,1,0,0,0,0,0,0,1,0,0
4,-0.944294,-0.334833,0.415215,-0.607014,-1.004900,0.172754,-0.692467,-0.193404,1.385177,-0.230809,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5483,-0.562495,0.643567,0.254784,-1.583481,0.202211,1.504130,0.263140,0.253128,0.480586,-0.309651,...,0,0,0,0,0,0,0,0,0,0
5484,-1.370223,-0.398313,0.378050,0.168035,-0.212531,-0.341123,-0.318070,-0.342226,0.765626,-0.362583,...,0,0,0,0,0,0,0,0,0,0
5485,-1.951729,-1.117028,-1.187795,0.350751,-0.016377,-0.663806,0.582249,-0.877313,-0.562128,-0.194185,...,1,0,0,0,0,1,0,0,0,0
5486,-2.264590,-1.266415,-0.831067,0.579172,-0.497001,-0.237512,-0.715835,0.255480,-0.007029,-0.222388,...,1,0,0,0,1,0,1,0,0,0


In [7]:
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma=0.008, C=8.6))
rf = RandomForestClassifier(n_estimators=1000, max_depth=100, min_samples_split=5, min_samples_leaf=2)
lr = make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', solver='saga', C=1))
estimators = [('svm', svm), ('rf', rf), ('lr', lr)]

In [25]:
stack = StackingClassifier(estimators, n_jobs=4)
stack.fit(train_df, class_ids)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
output = pd.DataFrame()
output["id"] = test_ids
output["ClassId"] = stack.predict(test_df)
output.to_csv("output.csv", index=False)

We attempt again, now with MPL and also just the CNN features

In [8]:
mpl = MLPClassifier(hidden_layer_sizes=(1000,), max_iter=300)
estimators.append(('mpl', mpl))

In [None]:
cnn = cnn.drop("image_path", axis=1)
cnn_test = cnn_test.drop("image_path", axis=1)

In [16]:
X = cnn.to_numpy(copy=True)
X.setflags(write=True)

y = class_ids.to_numpy(copy=True)
y.setflags(write=True)

In [None]:
cnnStack = StackingClassifier(estimators, LogisticRegression(max_iter=2000, solver='sag'), n_jobs=4)
cnnStack.fit(X, y)



In [21]:
output = pd.DataFrame()
output["id"] = test_ids
output["ClassId"] = cnnStack.predict(cnn_test)
output.to_csv("output.csv", index=False)



Frankly, the pixel data doesnt seem so useful for the non-NN models, so we decided to just split it and the do the stacking manually

In [10]:
def rename_columns(df, prefix):
    df = df.copy()
    new_columns = []
    count = 0
    for col in df.columns:
        if col == "image_path":
            new_columns.append(col)
        else:
            new_columns.append(f"{prefix}_{count}")
            count += 1
    df.columns = new_columns
    return df

In [11]:
tempBase = pd.concat([combined_train, digits, cnn], axis=1)
tempBase = rename_columns(tempBase, "subset")
subset = tempBase.drop("image_path", axis=1).columns


tempExtra = rename_columns(pixels, "extra")
extra = tempExtra.drop('image_path', axis=1).columns

all_train = pd.concat([tempBase, tempExtra], axis=1)
all_train = all_train.drop("image_path", axis=1)

# For test ###############
tempBase = pd.concat([combined_test, digits_test, cnn_test], axis=1)
tempBase = rename_columns(tempBase, "subset")

tempExtra = rename_columns(pixels_test, "extra")

all_test = pd.concat([tempBase, tempExtra], axis=1)
all_test = all_test.drop("image_path", axis=1)

New pipelines to choose the right columns

In [12]:
subset_transformer = ColumnTransformer([('subset', 'passthrough', subset)])
full_transformer = ColumnTransformer([('full', 'passthrough', list(subset)+list(extra))])

In [13]:
svm2 = make_pipeline(subset_transformer, StandardScaler(), SVC(kernel='rbf', gamma=0.001009, C=2.136833, verbose=1))
rf2 = make_pipeline(subset_transformer, RandomForestClassifier(n_estimators=1000, max_depth=100, min_samples_split=5, min_samples_leaf=2, verbose=1))
lr2 = make_pipeline(subset_transformer, StandardScaler(), LogisticRegression(max_iter=1000, penalty='l1', solver='saga', C=1, verbose=1))
mlp2 = make_pipeline(full_transformer, MLPClassifier(hidden_layer_sizes=(1000,), max_iter=300, verbose=1))
estimators2 = [("svm2", svm2), ("rf2", rf2), ("lr2", lr2), ("mlp2", mlp2)]

In [95]:
stackFinal = StackingClassifier(estimators2, final_estimator=LogisticRegression(max_iter=4000, solver='sag'), verbose=1, n_jobs=4)
stackFinal.fit(all_train, class_ids)

In [96]:
joblib.dump(stackFinal, "finalStack.pkl")

['finalStack.pkl']

In [98]:
output = pd.DataFrame()
output["id"] = test_ids
output["ClassId"] = stackFinal.predict(all_test)
output.to_csv("output.csv", index=False)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    1.7s


Im of the belief that the random forest is overfit

In [99]:
estimators3 = [("svm2", svm2), ("lr2", lr2), ("mlp2", mlp2)]

In [100]:
stackFinal = StackingClassifier(estimators3, final_estimator=LogisticRegression(max_iter=4000, solver='sag'), verbose=1, n_jobs=4)
stackFinal.fit(all_train, class_ids)
# why did I forget to change the variable name

In [101]:
output = pd.DataFrame()
output["id"] = test_ids
output["ClassId"] = stackFinal.predict(all_test)
output.to_csv("output.csv", index=False)

None of these stacks are better than pure MLP. Need to consider different final estimators, so I will generate some permanent prediction vectors. And this time im gonna use predict_prob

In [15]:
svm3 = make_pipeline(subset_transformer, StandardScaler(), SVC(probability=True, kernel='rbf', gamma=0.001009, C=2.136833, verbose=1))

In [21]:
estimatorsBlank = [('svm3',svm3)] + estimators2[1:]

In [14]:
lr3 = make_pipeline(subset_transformer, StandardScaler(), LogisticRegression(max_iter=10000, penalty='l1', solver='saga', C=1, verbose=1))

In [23]:
estimatorsBlank[2] = ("lr3", lr3)

In [27]:
from sklearn.base import clone
rowLen = train_df.shape[0]
noClasses = len(set(class_ids))
meta_features = numpy.zeros((rowLen, len(estimatorsBlank)*noClasses))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=999)

In [28]:
for i, (name, model) in enumerate(estimatorsBlank):
    prob_predictions = numpy.zeros((rowLen, len(set(class_ids))))

    for trainIdx, valIdx in skf.split(all_train, class_ids):
        tempModel = clone(model)
        tempModel.fit(all_train.iloc[trainIdx], class_ids[trainIdx])
        prob_predictions[valIdx] = tempModel.predict_proba(all_train.iloc[valIdx])
    meta_features[:, i*noClasses : (i+1)*noClasses] = prob_predictions

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    7.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   32.2s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  2.7min finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    7.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   32.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  2.7min finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:

convergence after 2747 epochs took 2281 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 38.0min finished


convergence after 2664 epochs took 2190 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 36.5min finished


convergence after 2469 epochs took 2033 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 33.9min finished


convergence after 2597 epochs took 2137 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 35.6min finished


convergence after 2333 epochs took 1956 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 32.6min finished


Iteration 1, loss = 28.35508578
Iteration 2, loss = 16.81313620
Iteration 3, loss = 10.27228150
Iteration 4, loss = 6.20931369
Iteration 5, loss = 3.97882894
Iteration 6, loss = 2.67184736
Iteration 7, loss = 2.08897809
Iteration 8, loss = 1.62396504
Iteration 9, loss = 1.31549762
Iteration 10, loss = 1.08112499
Iteration 11, loss = 0.91762111
Iteration 12, loss = 0.78904402
Iteration 13, loss = 0.66614201
Iteration 14, loss = 0.54531142
Iteration 15, loss = 0.50756594
Iteration 16, loss = 0.46250615
Iteration 17, loss = 0.40967795
Iteration 18, loss = 0.36464355
Iteration 19, loss = 0.38290196
Iteration 20, loss = 0.34704664
Iteration 21, loss = 0.35869344
Iteration 22, loss = 0.30878868
Iteration 23, loss = 0.29533051
Iteration 24, loss = 0.23020727
Iteration 25, loss = 0.18956340
Iteration 26, loss = 0.18757211
Iteration 27, loss = 0.17501346
Iteration 28, loss = 0.19007657
Iteration 29, loss = 0.20813116
Iteration 30, loss = 0.18712469
Iteration 31, loss = 0.13737415
Iteration 32, 

In [29]:
pd.DataFrame(meta_features).to_csv("ala.csv")

In [None]:
test_meta_features = numpy.zeros((test_df.shape[0], len(estimatorsBlank) * noClasses))

for i, (name, model) in enumerate(estimatorsBlank):
    model_clone = clone(model)
    model_clone.fit(all_train, class_ids)  # Train on full data now
    test_meta_features[:, i * noClasses : (i + 1) * noClasses] = model_clone.predict_proba(all_test)

pd.DataFrame(test_meta_features).to_csv("baseModelPredictions_test.csv")


[LibSVM]

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   10.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   41.6s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  3.5min finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.6s finished
