In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score
from hyperopt import hp, fmin, tpe, STATUS_OK, space_eval

import xgboost as xgb
import lightgbm as lgb
import catboost as ctb


from PIL import Image
from tqdm import tqdm
import pytesseract
tesseract_cmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

import langdetect

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

import re
import json
 
nltk.download('stopwords')

pd.set_option('display.max_columns', 200)

from language_tool_python import LanguageTool
pl_lang_tool = LanguageTool("pl")
en_lang_tool = LanguageTool("en")  # TODO: Check en-US

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\barto\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


# Consts

In [3]:
TEST_PATH = os.path.join("data", "datasets", "test_set")
TRAIN_PATH = os.path.join("data", "datasets", "train_set")
HACKATHON_PATH = os.path.join("data", "hackathon")

DEVICE = "cpu"


# Dataset preparation

In [4]:
with open("data\hackathon\id2label_final.json") as file:
    id2label = json.load(file)

with open("data\hackathon\label2id_final.json") as file:
    label2id = json.load(file)

In [None]:
# Read the JSON file
with open(os.path.join(HACKATHON_PATH, "train_set_ocr.json")) as json_file:
    data = json.load(json_file)
len(data)

with open(os.path.join(HACKATHON_PATH, "train_set_our_ocr.json")) as json_file:
    data_our = json.load(json_file)
len(data_our)

In [16]:
def ocr(img_path):
    img = Image.open(img_path)
    return pytesseract.image_to_string(img)

In [15]:
def parse_ocr_data(image_dir, class_name=None):
    parsed_data = {}
    for file in tqdm(sorted(os.listdir(image_dir))):
        filepath = os.path.join(image_dir, file)
        try:
            parsed_text = ocr(filepath)
        except:
            print(f"Cannot parse {file} file")
            continue
        key = file
        if class_name is not None:
            key = f"{class_name}/{file}"
        parsed_data.update({key: parsed_text})
    return parsed_data   

## Get text using Tesseract

In [None]:
# parsed_data = {}

# for class_name in sorted(os.listdir(TRAIN_PATH)):
#     print(f"Processing class: {class_name}")
#     class_dir = os.path.join(TRAIN_PATH, class_name)

#     data = parse_ocr_data(class_dir)
#     parsed_data.update(data)

In [None]:
train_set_our_ocr_path = os.path.join(HACKATHON_PATH, "train_set_our_ocr.json")

# with open(train_set_our_ocr_path, "w") as json_file:
#     json.dump(parsed_data, json_file)

with open(train_set_our_ocr_path, "r") as json_file:
    parsed_data = json.load(json_file)

In [17]:
def tokenize_text_data(text: str, remove_numbers=True, autocorrection=False, stem=False) -> tuple:
    text = text.strip()
    
    if not text:
        return [], "unknown"
    # language detection
    text = re.sub(r"\n", " ", text)

    lang = "unknown"
    try:
        lang = langdetect.detect(text)
    except:
        pass
    
    # Misspellings
    if autocorrection:
        lang_tool = en_lang_tool
        if lang == "pl":
            lang_tool = pl_lang_tool
        text = lang_tool.correct(text)

     # Lowercase
    text = text.lower()

    # Special characters
    text = re.sub(r"[^a-zA-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ\s]", "", text)

    # Numbers
    if remove_numbers:
        text = re.sub("\d+", " ", text)


    # Split text by space
    words = text.split()

    # Stopwords
    words = remove_stopwords(words, lang)

    # Stemming
    if stem and lang != "pl":
        words = [ps.stem(w) for w in words]
    
    return words, lang

def remove_stopwords(words, lang):
    parsed_words = []    
    stop_words = stopwords.words("english")
    if lang == "pl":
        stop_words = stopwords.words("polish")

    for word in words:
        if word not in stop_words:
            parsed_words.append(word)

    return parsed_words

## Datasets merge

In [18]:
def compose_dataframe(ocr_data, data_source, data_prefix):
    df = pd.DataFrame.from_dict(ocr_data, orient='index')
    df = df.reset_index().rename(columns={'index': 'file_path', 0: f'text_{data_source}_ocr'})
    df['file_path'] = df['file_path'].replace(data_prefix, '', regex=True).str.replace(r'^[./\\]+', '', regex=True).str.replace('\\', '/', regex=True)

    df[f'words_{data_source}_ocr'] = df[f'text_{data_source}_ocr'].apply(lambda x: tokenize_text_data(x, autocorrection=True, stem=True)[0])
    return df

In [None]:
df_train_default_ocr = compose_dataframe(data, "default", data_prefix="test_hashed")

In [None]:
df_train_our_ocr = compose_dataframe(data_our, "our", data_prefix="test_hashed")

In [None]:
df_merged = pd.merge(df_train_default_ocr, df_train_our_ocr, on='file_path', how='left')

In [19]:
def choose_ocr(df, ocr1_colname, ocr2_colname):
    df['word_count_text_default_ocr'] = df[ocr1_colname].apply(lambda x: len(x))
    df['word_count_text_our_ocr'] = df[ocr2_colname].apply(lambda x: len(x) if isinstance(x, list) else -1)
    df['words_better_ocr'] = np.where((df['word_count_text_default_ocr'] > 2 * df['word_count_text_our_ocr']) | (df['word_count_text_our_ocr'].isnull()), df[ocr1_colname], df[ocr2_colname])
    return df 

In [None]:
df = choose_ocr(df_merged, "words_default_ocr", "words_our_ocr")

In [None]:
# Add features
df['image_present'] = np.where(df['word_count_text_our_ocr'] == -1, False, True)

df['eng_language'] = df["text_default_ocr"].apply(lambda x: tokenize_text_data(x)[1])

df['english'] = np.where(df["eng_language"] != "pl", True, False)

In [20]:
def count_words(words):
    word_counts = {}
    for word in words:
        word = word.lower()
        if word not in word_counts.keys():
            word_counts.update({word: 0})
        word_counts[word] = word_counts[word] + 1
    return word_counts

In [21]:
def create_final_df(df, train=False):
    df_final = pd.DataFrame()
    df_final["file_path"] = df["file_path"]
    df_final["words"] = df["words_better_ocr"]
    df_final["english"] = df["english"]
    df_final["photo"] = df["image_present"]

    df_final['word_count'] = df_final["words"].apply(lambda x: len(x))

    df_final["words_freq"] = df_final["words"].apply(lambda x: count_words(x))

    df_final['word_threshold'] = df_final["words"].apply(lambda x: len(x)>25)

    if train:
        df_final["class_name"] = df_final["file_path"].apply(lambda x: x.split("/")[0])

        df_final = df_final[df_final["class_name"] != "ipynb_checkpoints"]
        df_final["class_id"] = df_final["class_name"].apply(lambda x: id2label[x])
    return df_final


In [None]:
df_final = create_final_df(df, train=True)

## Count words and get keywords

In [None]:
def count_words_sum(group):
    total_words = group['words'].sum()

    # Return the desired result as a DataFrame
    result = pd.DataFrame({'Total words': [total_words]})
    return result


grouped_result = df_final.groupby(["class_id"]).apply(count_words_sum).reset_index(level=1, drop=True).reset_index()

In [None]:
grouped_result["class_words_freq"] = grouped_result["Total words"].apply(lambda x: count_words(x))

In [None]:
class_words_freqs = {}

for class_name, class_id in id2label.items():
    words_count = grouped_result.loc[grouped_result["class_id"] == class_id].class_words_freq.values[0]
    sorted_freqs = sorted(words_count.items(), key=lambda x: x[1], reverse=True)
    class_words_freqs.update({class_id: sorted_freqs})

In [None]:
total_freqs = []
for class_id, freqs in class_words_freqs.items():
    freqs = [freq[0] for freq in freqs if len(freq[0]) > 1]
    print(freqs)
    top = [freq for freq in freqs[:10]]
    total_freqs.extend(top)
    

In [None]:
keywords = list(set(total_freqs))
print(keywords)
with open("keywords.json", "w") as file:
    json.dump(keywords, file)

## Add keywords to each sample

In [34]:
def add_keywords_vector(df, keywords):
    for keyword in tqdm(keywords):
        df[keyword] = df["words"].apply(lambda x: x.count(keyword))
    return df

In [None]:
df_final = add_keywords_vector(df_final, keywords)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
temp =df_final.groupby(['class_id','english']).size().unstack()
temp.rename(columns={0:'No', 1:'Yes'}, inplace=True)
colors  = ['#ec838a','#9b9c9a']
ax = (temp.T*100.0 / temp.T.sum()).T.plot(kind='bar',width = 0.3,stacked = True,rot = 0,figsize = (12,7),color = colors)
plt.title('English documents per class \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
plt.legend(fontsize = "medium")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy() 
    ax.text(x+width/2, 
            y+height/2, 
            '{:.1f}%'.format(height), 
            horizontalalignment='center', 
            verticalalignment='center')
ax.autoscale(enable=False, axis='both', tight=False)

In [None]:
df_final.to_csv("train_dataset.csv", index=False)

# Training preparation

In [5]:
df_final = pd.read_csv("train_dataset.csv")

## Dataset split

In [6]:
X = df_final.drop(["file_path", "class_name", "class_id", "words_freq", "words"], axis =1)
y = df_final["class_id"]

## Training function

In [7]:
def train_model(name, model, metric, X, y, k=1):
    if metric not in ["f1_score", "topk"]:
        raise Exception(f"Metric {metric} is not supported")
    topk = k
    # Number of folds
    k_num_folds  = 5

    # Create a StratifiedKFold object
    skf = StratifiedKFold(n_splits=k_num_folds, shuffle=True, random_state=42)

    f1_scores = []
    top_k_scores = [] 
    # Perform stratified k-fold cross-validation
    for idx, (train_index, test_index) in enumerate(skf.split(X, y)):
        fold = idx + 1
        print(f"Fold {fold}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]


        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        f1_score_ = f1_score(y_test,y_pred,average='macro')
        f1_scores.append(f1_score_)
        print(f"f1_score_: {f1_score_}")

        # Predict on the test set
        y_pred_proba = model.predict_proba(X_test)

        # Calculate top-k accuracy
        y_pred_topk = np.argsort(y_pred_proba, axis=1)[:, -topk:]
        y_test_topk = np.expand_dims(y_test, axis=1)
        topk_accuracy = np.mean(np.any(y_pred_topk == y_test_topk, axis=1))

        top_k_scores.append(topk_accuracy)
        print(f"Top-K accuracy: {topk_accuracy}")

    if metric == "f1_score":
        return round(np.mean(f1_scores)*100, 2)
    elif metric == "topk":
        return round(np.mean(topk_accuracy)*100, 2)

# Execute XGBoost

## Searching for optimal hyperparameters

In [None]:
def obj_func_xgboost(params):
    print(params)
    
    k = 1
    metric = "f1_score"

    f1score = train_model("XGBoost", xgb.XGBClassifier(**params), metric, X, y, k)
    
    return{'loss': -f1score, 'status': STATUS_OK}


xgb_reg_params = {
    'learning_rate':     hp.choice('learning_rate',          np.arange(0.005,0.1,0.01)),
    'max_depth':         hp.choice("max_depth",              np.arange(3,20,1,dtype = int)),
    'objective' : 'multi:softmax',
    'n_estimators': 100,
    'verbosity': 0,
    'seed': 0,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'num_class': len(y.unique()),
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # Fraction of samples to use for training each tree
    'gamma': hp.uniform('gamma', 0, 1),  # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),  # Minimum sum of instance weight needed in a child
    'alpha': hp.loguniform('alpha', -5, 2),  # L1 regularization term on weights
    'lambda': hp.loguniform('lambda', -5, 2),  # L2 regularization term on weights
}
best = fmin(obj_func_xgboost, xgb_reg_params, algo=tpe.suggest, max_evals=100)

In [None]:
space_eval(xgb_reg_params, best)

## Training XGBoost using optimal parameters

In [8]:
optim_xgb_params = {
 'alpha': 0.3506917294663747,
 'gamma': 0.02976364339501225,
 'lambda': 0.014637524757541995,
 'learning_rate': 0.08499999999999999,
 'max_depth': 8,
 'min_child_weight': 0.597532087144379,
 'n_estimators': 100,
 'num_class': 21,
 'objective': 'multi:softmax',
 'seed': 0,
 'subsample': 0.6526200874262882,
 'verbosity': 0
}

if DEVICE == "gpu":
    optim_xgb_params.update(
        {
            'predictor': 'gpu_predictor',
            'tree_method': 'gpu_hist',
        }
    )

In [9]:
models = {}

model_type = "XGBoost"
xgb_model = xgb.XGBClassifier(**optim_xgb_params)

train_model(model_type, xgb_model, "f1_score", X, y)

models.update({model_type: xgb_model})

Fold 1
f1_score_: 0.7041065515046359
Top-K accuracy: 0.6757005052824988
Fold 2
f1_score_: 0.6926767684100302
Top-K accuracy: 0.6628387689480937
Fold 3
f1_score_: 0.6949563551540014
Top-K accuracy: 0.6655948553054662
Fold 4
f1_score_: 0.700644468576837
Top-K accuracy: 0.6700367647058824
Fold 5
f1_score_: 0.7172179554571263
Top-K accuracy: 0.6902573529411765


# CatBoost

## Searching for optimal hyperparameters

In [None]:
def obj_func_ctb(params):
    print(params)
    
    k = 1
    metric = "topk"

    f1score = train_model("CatBoost", ctb.CatBoostClassifier(**params), metric, X, y, k)
    
    return{'loss': -f1score, 'status': STATUS_OK}

ctb_reg_params = {
    'learning_rate': hp.choice('learning_rate', np.arange(0.005,0.1,0.01)),
    'depth': hp.choice('depth', np.arange(3,16,1,dtype = int)),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'border_count': hp.choice('border_count', [32, 64, 128]),
    'iterations': 100,
    'verbose': 0,
    'random_state': 42,
    'loss_function':'MultiClass'
}
if DEVICE == "GPU":
    ctb_reg_params.update({
        'task_type': 'GPU',
        'devices':'0:1',
    })
best = fmin(obj_func_ctb, ctb_reg_params, algo=tpe.suggest, max_evals=20)
space_eval(ctb_reg_params, best)

## Training using optimal hyperparameters

In [10]:
optim_ctb_params = {
    'border_count': 64,
    'depth': 15,
    'iterations': 100,
    'l2_leaf_reg': 2.6105749653144654,
    'learning_rate': 0.09499999999999999,
    'loss_function': 'MultiClass',
    'random_state': 42,
    'verbose': 0,
}
  

if DEVICE == "gpu":
    optim_ctb_params.update(
        {
           'task_type': 'GPU',
            'devices':'0:1',
        }
    )

In [11]:
model_type = "CatBoost"
ctb_model = ctb.CatBoostClassifier(**optim_ctb_params)

train_model(model_type, ctb_model, "f1_score", X, y)

models.update({model_type: ctb_model})

Fold 1
f1_score_: 0.6932493828129292
Top-K accuracy: 0.6655948553054662
Fold 2
f1_score_: 0.6878646137567289
Top-K accuracy: 0.6577859439595773
Fold 3
f1_score_: 0.6920886371214214
Top-K accuracy: 0.6646761598530088
Fold 4
f1_score_: 0.6916682474105695
Top-K accuracy: 0.6608455882352942
Fold 5
f1_score_: 0.706657903448634
Top-K accuracy: 0.6806066176470589


# LightGBM

## Searching for optimal hyperparameters

In [None]:
def obj_func_lgb(params):
    print(params)
    
    k = 1
    metric = "topk"

    metric_value = train_model("LightGBM", lgb.LGBMClassifier(**params), metric, X, y, k)
    
    return{'loss': -metric_value, 'status': STATUS_OK}


lgb_reg_params = {
    'learning_rate':     hp.choice('learning_rate',          np.arange(0.005,0.1,0.01)),
    'max_depth':         hp.choice("max_depth",              np.arange(3,20,1,dtype = int)),
    'objective' : 'multiclass',
    'metric': 'multi_logloss',
    'n_estimators': 100,
    'verbosity': -1,
    'verbose': -1,
    'seed': 0,
    'num_class': len(y.unique()),
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # Fraction of samples to use for training each tree
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),  # Minimum sum of instance weight needed in a child
    'boosting_type': 'gbdt',
    'device': 'cpu',
    'verbose_eval': -1,
}
best = fmin(obj_func_lgb, lgb_reg_params, algo=tpe.suggest, max_evals=100)
space_eval(lgb_reg_params, best)

## Training using optimal hyperparameters

In [12]:
lgb_params = {
    'boosting_type': 'gbdt',
    'device': 'cpu',
    'learning_rate': 0.08499999999999999,
    'max_depth': 6,
    'metric': 'multi_logloss',
    'min_child_weight': 1.0028092542451534,
    'n_estimators': 100,
    'num_class': 21,
    'objective': 'multiclass',
    'reg_alpha': 0.2329278851928178,
    'reg_lambda': 0.6137201004742544,
    'seed': 0,
    'subsample': 0.6819274563761952,
    'verbose': -1,
    'verbose_eval': -1,
    'verbosity': -1,
}

In [13]:
model_type = "LightGBM"

lgb_model = lgb.LGBMClassifier(**lgb_params)
train_model(model_type, lgb_model, "f1_score", X, y)

models.update({model_type: lgb_model})

Fold 1
f1_score_: 0.7008467953980282
Top-K accuracy: 0.6711070280202113
Fold 2
f1_score_: 0.6995095791445325
Top-K accuracy: 0.669728984841525
Fold 3
f1_score_: 0.7019620194527837
Top-K accuracy: 0.6738631143775838
Fold 4
f1_score_: 0.6967772805101413
Top-K accuracy: 0.6654411764705882
Fold 5
f1_score_: 0.7205475512400131
Top-K accuracy: 0.6943933823529411


# Predictions

In [22]:
with open(os.path.join(HACKATHON_PATH, "test_ocr_clean.json")) as json_file:
    test_ocr_data = json.load(json_file)

test_our_ocr_data = parse_ocr_data(TEST_PATH)

 87%|████████▋ | 3047/3492 [17:12<02:00,  3.69it/s]

Cannot parse de5e15cc-6a3b-4659-bb5e-2456ef87ea05.tiff file


100%|██████████| 3492/3492 [19:40<00:00,  2.96it/s]


In [23]:
df_test_our_ocr = compose_dataframe(test_our_ocr_data, "our", data_prefix="test_set")

In [24]:
df_test_default_ocr = compose_dataframe(test_ocr_data, "default", data_prefix="test_set")

In [25]:
df_test_default_ocr

Unnamed: 0,file_path,text_default_ocr,words_default_ocr
0,e9fdc30e-99bd-4df8-ac68-adab008dfc88.jpg,PITaxpl POLALSHPein 9220067289 8383511524 18 1...,"[pitaxpl, polalshpein, ofrin, pit, zeznanie, w..."
1,1131d492-9670-4cf3-876b-d7d63fdb93cf.jpg,Kooaek DalkoNana LiteRAV CZARNYM LLB NeBiesAV ...,"[kooaek, dalkonana, literav, czarnym, nebiesav..."
2,a10b2b27-b827-417c-8158-87c607516275.jpg,Wypelnv Joonn Beyffenl Vazar FLENY Plsfl Delko...,"[wypełni, joann, beyffenl, bazar, flety, plsfl..."
3,f72c1791-2966-4751-885a-f8575b529811.jpg,PlTaxpl POLA JASNE WYPELN4 PoDATnIK POLA CieMN...,"[pltaxpl, pola, jasne, wypeln, podatnik, pola,..."
4,3acc02fa-164e-43d2-899a-82f16ef56dd8.jpg,PlTaxpl POL A JASNE WypelNIA PodaTNix CienncKy...,"[pltaxpl, pół, jasne, wypelnia, podatnix, cien..."
...,...,...,...
3495,eb2705bc-3267-4e07-a596-ed92f48aa680.tiff,"Nancy Coleman 6831 Ridge Blvd. 414 BrockIyn, N...","[nanci, coleman, ridg, blvd, brockiyn, ny, dec..."
3496,2eb16f23-d508-4d2f-94cb-096707d314ec.tiff,20733788/7 DEC_15.19393 40.22pM1 No.689 P.1/ T...,"[dec, pm, p, totin, mcwasbi, jame, waik, aoger..."
3497,2ec55c3b-9e9e-4bef-b1e0-e49e568bdb70.tiff,AXXALS OF THE *EIv YORR ACADEMY OF sciecs VOLU...,"[axxal, eiv, yorr, academi, sciec, voluvk, mar..."
3498,33973f8c-7cb7-4495-91d9-65f98b4a1f11.tiff,SENT BY :Mezz ina/Br 0wn Inc_ 3-92 4:51PM 21...,"[sent, mezz, inabr, wn, inc, pm, tmme, date, a..."


In [26]:
df_test_our_ocr

Unnamed: 0,file_path,text_our_ocr,words_our_ocr
0,0009d485-11a3-4299-b19d-1cc37bc0f7fb.tiff,07-May-91\n\nFILE:BHFLASH\nDISTRIBUTION: BERMU...,"[may, filebhflash, distribut, bermuda, hundr, ..."
1,000ea643-eda4-4ab6-aa39-65f3787d522d.tiff,PRESS RELEASE\n\nDevoted to\n‘drug abuse preve...,"[press, releas, devot, drug, abus, prevent, ad..."
2,00246a52-7855-4888-bc8a-45353c0a5888.tiff,LoursviLue\n\nEINISHED EXLTER ROD DESCRIPTIVE\...,"[loursvilu, einish, exlter, rod, descript, gen..."
3,002d3899-1822-4a81-9387-86dfe0387feb.tiff,FF Principal Investigator/Program Director (La...,"[ff, princip, investigatorprogram, director, l..."
4,002d3ab1-166e-4ce7-9c8e-e9961cecb736.tiff,res DAILY VOLUME REPORT FOR 03/22/1996\n* CONF...,"[re, daili, volum, report, confidenti, eteuuee..."
...,...,...,...
3486,ffac2177-922b-40be-ab0b-ca3ebbabf962.tiff,CONTINUATION |\nON MICROFICHE: ¢\nEeuo\n\nYS\n\n,"[continu, microfich, eeuo, ys]"
3487,ffcbeb47-d740-404a-9902-483f8f1ca9e4.tiff,ns-087957302-24403.000\n\nsonavont esmcarion\n...,"[ns, sonavont, esmcarion, copyright, unit, sta..."
3488,ffcbfc30-e6be-4ccd-aa64-414191cc00fd.jpg,UMOWA 0 DZIELO\n\n. 2022-11-\n\n1. Zamawiajgcy...,"[umowa, dzieli, zamawiający, powierza, wykonan..."
3489,ffd18b70-6f6b-4ca3-a100-bd3d07bcd30b.tiff,"University of California, San Francisco\nCURRI...","[univers, california, san, francisco, curricul..."


In [27]:
df_test_merged = pd.merge(df_test_default_ocr, df_test_our_ocr, on='file_path', how='left')

In [28]:
df_test = choose_ocr(df_test_merged, "words_default_ocr", "words_our_ocr")

In [29]:
# Add features
df_test['image_present'] = np.where(df_test['word_count_text_our_ocr'] == -1, False, True)

df_test['eng_language'] = df_test["text_default_ocr"].apply(lambda x: tokenize_text_data(x)[1])

df_test['english'] = np.where(df_test["eng_language"] != "pl", True, False)

In [30]:
df_test_final = create_final_df(df_test)

In [31]:
with open("keywords.json") as file:
    keywords = json.load(file)

In [35]:
df_test_final = add_keywords_vector(df_test_final, keywords)

  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[keyword] = df["words"].apply(lambda x: x.count(keyword))
  df[key

In [36]:
df_test_final.to_csv("df_test.csv", index=False)
df_test_final = pd.read_csv("df_test.csv")

In [37]:
X_test = df_test_final.drop(["file_path", "words_freq", "words"], axis =1)

In [45]:
X_test_result = df_test_final["file_path"]

In [46]:
predictions = {}
for model_type, model in models.items():
    preds = model.predict(X_test)

    X_test_result = pd.concat([X_test_result, pd.DataFrame(preds, columns=[f"{model_type}_preds"]).astype("int32")], axis=1)


In [47]:
X_test_result.sample(10)

Unnamed: 0,file_path,XGBoost_preds,CatBoost_preds,LightGBM_preds
1229,9fbf9de4-8b21-47a5-9362-36ed5eba1b2d.tiff,3,3,3
643,a89f0533-796e-4a0c-bd92-83d559f75c0b.tiff,12,12,12
1385,b8cbe49d-6f3c-4b23-86b8-9deec10d03c2.tiff,5,6,6
1408,7298c868-5a28-4e26-bd60-83108cb41c8f.tiff,6,6,6
2400,5d39cf12-9dd2-4347-8671-bf6ae288a178.tiff,6,12,12
1878,d447cf98-6b2f-4c2e-8a62-28925ed7af7e.tiff,12,12,12
445,b985b203-15ca-43e6-bc80-caab8b31c074.jpg,20,20,20
2109,1327b852-01d8-4136-af44-af5375b83492.tiff,8,8,8
2066,7c9c77e6-e072-4a7a-8440-646d3e8ddc2e.tiff,3,3,3
177,62df80e5-1efc-493e-993f-704196624b0e.jpg,11,11,11


In [41]:
X_test_result.to_csv("submission.csv", index=False)

In [65]:
# def ensemble_pred(result):
def get_pred(x):
    print(type(x))


def get_final_preds(X):
    preds_list = X[["XGBoost_preds", "CatBoost_preds", "LightGBM_preds"]].values.tolist()

    final_preds = []
    for preds in preds_list:
        pred_count = {}
        for pred in preds:
            if pred not in pred_count:
                pred_count.update({pred: 0})
            pred_count[pred] = pred_count[pred] + 1
        sorted_pred_count = sorted(pred_count.items(), key=lambda x:x[1])
        if len(sorted_pred_count) != 3:
            final_pred = sorted_pred_count[0][0]
            
        if len(sorted_pred_count) == 3:
            final_pred = preds[2]
        final_preds.append(final_pred)
    return final_preds

{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{10: 3}
Final pred
10
{10: 3}
{1

In [None]:
preds = result["XGBoost_preds", "CatBoost_preds", "LightGBM_preds"].tolist()