In [1]:
import pandas as pd
import catboost
from catboost import CatBoostClassifier

In [2]:
df = pd.read_excel('./data/Task-2/train_processed.xlsx')

X= df[['text']]
y = df.label

In [3]:
X

Unnamed: 0,text
0,fiskar strong portfolio international brand in...
1,metalszinc surge pct glencore cut output fue...
2,accord scanfil demand telecommunication networ...
3,dbs launch new banking api developer platform ...
4,theodosopoulos say tellab could value nokia si...
...,...
4333,airvana umts home base station femto cell use ...
4334,malton net profit jump four time gain revoke...
4335,accord ceo kai telanne company newspaper achie...
4336,addition cramo peab sign exclusive fiveyear re...


In [4]:
def fit_model(train_pool, test_pool, **kwargs):
    model = CatBoostClassifier(
        task_type='CPU',
        iterations=5000,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=500,
        **kwargs
    )
    return model.fit(
            train_pool,
            eval_set=test_pool,
            verbose=100,
            plot=True,
            use_best_model=True)

In [5]:
import numpy as np
from sklearn.model_selection import KFold

# Set up the KFold object with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize a list to store the accuracies for each fold
accuracies = []

# Iterate over each fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")

    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Create the CatBoost training and testing pools
    train_pool = catboost.Pool(X_train, y_train, text_features=['text'])
    test_pool = catboost.Pool(X_test, y_test, text_features=['text'])

    # Fit the model on the training data and evaluate on the testing data
    model = fit_model(train_pool, test_pool,
                      tokenizers=[
                          {
                              'tokenizer_id': 'Sense',
                              'separator_type': 'BySense',
                              'lowercasing': 'True',
                              'token_types': ['Word', 'Number', 'SentenceBreak'],
                              'sub_tokens_policy':'SeveralTokens'
                          }
                      ],
                      dictionaries=[
                          {
                              'dictionary_id': 'Word',
                              'max_dictionary_size': '50000'
                          }
                      ],
                      feature_calcers=[
                          'BoW:top_tokens_count=10000'
                      ])
    accuracy = model.score(test_pool)
    accuracies.append(accuracy)

# Print the average accuracy across all folds
print(f"Average accuracy: {np.mean(accuracies)}")


Fold 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021395
0:	learn: 0.6873199	test: 0.6751152	best: 0.6751152 (0)	total: 64.6ms	remaining: 5m 22s
100:	learn: 0.7285303	test: 0.7142857	best: 0.7165899 (77)	total: 1.95s	remaining: 1m 34s
200:	learn: 0.7544669	test: 0.7407834	best: 0.7407834 (198)	total: 3.68s	remaining: 1m 27s
300:	learn: 0.7806916	test: 0.7534562	best: 0.7534562 (298)	total: 5.41s	remaining: 1m 24s
400:	learn: 0.8043228	test: 0.7753456	best: 0.7753456 (361)	total: 7.15s	remaining: 1m 21s
500:	learn: 0.8204611	test: 0.7949309	best: 0.7949309 (500)	total: 8.87s	remaining: 1m 19s
600:	learn: 0.8368876	test: 0.7995392	best: 0.7995392 (556)	total: 10.6s	remaining: 1m 17s
700:	learn: 0.8484150	test: 0.8087558	best: 0.8099078 (681)	total: 12.3s	remaining: 1m 15s
800:	learn: 0.8582133	test: 0.8156682	best: 0.8156682 (793)	total: 14s	remaining: 1m 13s
900:	learn: 0.8662824	test: 0.8214286	best: 0.8214286 (899)	total: 15.8s	remaining: 1m 11s
1000:	learn: 0.8723343	test: 0.8202765	best: 0.8225806 (907)	total

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021395
0:	learn: 0.7063401	test: 0.6843318	best: 0.6843318 (0)	total: 20.8ms	remaining: 1m 43s
100:	learn: 0.7345821	test: 0.7073733	best: 0.7096774 (64)	total: 1.68s	remaining: 1m 21s
200:	learn: 0.7587896	test: 0.7269585	best: 0.7281106 (170)	total: 3.37s	remaining: 1m 20s
300:	learn: 0.7873199	test: 0.7430876	best: 0.7442396 (282)	total: 5.04s	remaining: 1m 18s
400:	learn: 0.8072046	test: 0.7557604	best: 0.7557604 (376)	total: 6.69s	remaining: 1m 16s
500:	learn: 0.8213256	test: 0.7707373	best: 0.7718894 (479)	total: 8.35s	remaining: 1m 14s
600:	learn: 0.8322767	test: 0.7799539	best: 0.7822581 (578)	total: 10s	remaining: 1m 13s
700:	learn: 0.8443804	test: 0.7868664	best: 0.7868664 (697)	total: 11.7s	remaining: 1m 11s
800:	learn: 0.8538905	test: 0.7926267	best: 0.7926267 (799)	total: 13.4s	remaining: 1m 10s
900:	learn: 0.8610951	test: 0.7995392	best: 0.7995392 (865)	total: 15.1s	remaining: 1m 8s
1000:	learn: 0.8694524	test: 0.7949309	best: 0.7995392 (865)	total:

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021395
0:	learn: 0.7057637	test: 0.6682028	best: 0.6682028 (0)	total: 26.7ms	remaining: 2m 13s
100:	learn: 0.7435159	test: 0.7027650	best: 0.7027650 (96)	total: 1.71s	remaining: 1m 22s
200:	learn: 0.7602305	test: 0.7119816	best: 0.7119816 (179)	total: 3.4s	remaining: 1m 21s
300:	learn: 0.7902017	test: 0.7373272	best: 0.7373272 (298)	total: 5.07s	remaining: 1m 19s
400:	learn: 0.8132565	test: 0.7442396	best: 0.7442396 (391)	total: 6.77s	remaining: 1m 17s
500:	learn: 0.8244957	test: 0.7500000	best: 0.7511521 (481)	total: 8.46s	remaining: 1m 16s
600:	learn: 0.8391931	test: 0.7580645	best: 0.7603687 (586)	total: 10.2s	remaining: 1m 14s
700:	learn: 0.8492795	test: 0.7592166	best: 0.7615207 (610)	total: 11.9s	remaining: 1m 13s
800:	learn: 0.8599424	test: 0.7626728	best: 0.7638249 (789)	total: 13.6s	remaining: 1m 11s
900:	learn: 0.8674352	test: 0.7615207	best: 0.7649770 (810)	total: 15.3s	remaining: 1m 9s
1000:	learn: 0.8752161	test: 0.7684332	best: 0.7684332 (997)	total

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021397
0:	learn: 0.6885624	test: 0.7185698	best: 0.7185698 (0)	total: 17.2ms	remaining: 1m 26s
100:	learn: 0.7159320	test: 0.7439446	best: 0.7485582 (75)	total: 1.69s	remaining: 1m 21s
200:	learn: 0.7479113	test: 0.7670127	best: 0.7670127 (187)	total: 3.38s	remaining: 1m 20s
300:	learn: 0.7732642	test: 0.7900807	best: 0.7900807 (295)	total: 5.04s	remaining: 1m 18s
400:	learn: 0.8020743	test: 0.8154556	best: 0.8154556 (400)	total: 6.74s	remaining: 1m 17s
500:	learn: 0.8161913	test: 0.8269896	best: 0.8292964 (484)	total: 8.42s	remaining: 1m 15s
600:	learn: 0.8308845	test: 0.8258362	best: 0.8316032 (539)	total: 10.1s	remaining: 1m 14s
700:	learn: 0.8467300	test: 0.8304498	best: 0.8316032 (539)	total: 11.8s	remaining: 1m 12s
800:	learn: 0.8585422	test: 0.8327566	best: 0.8339100 (793)	total: 13.5s	remaining: 1m 10s
900:	learn: 0.8689139	test: 0.8350634	best: 0.8350634 (816)	total: 15.2s	remaining: 1m 9s
1000:	learn: 0.8772688	test: 0.8373702	best: 0.8385236 (955)	tota

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021397
0:	learn: 0.7038317	test: 0.7070358	best: 0.7070358 (0)	total: 18.3ms	remaining: 1m 31s
100:	learn: 0.7306252	test: 0.7220300	best: 0.7220300 (95)	total: 1.83s	remaining: 1m 28s
200:	learn: 0.7600115	test: 0.7404844	best: 0.7404844 (185)	total: 3.67s	remaining: 1m 27s
300:	learn: 0.7816191	test: 0.7566321	best: 0.7589389 (285)	total: 5.5s	remaining: 1m 25s
400:	learn: 0.8014981	test: 0.7704729	best: 0.7704729 (400)	total: 7.31s	remaining: 1m 23s
500:	learn: 0.8219533	test: 0.7831603	best: 0.7831603 (498)	total: 9.11s	remaining: 1m 21s
600:	learn: 0.8326131	test: 0.7900807	best: 0.7900807 (590)	total: 10.9s	remaining: 1m 20s
700:	learn: 0.8467300	test: 0.7970012	best: 0.8004614 (652)	total: 12.8s	remaining: 1m 18s
800:	learn: 0.8605589	test: 0.7981546	best: 0.8016148 (770)	total: 14.6s	remaining: 1m 16s
900:	learn: 0.8706425	test: 0.8039216	best: 0.8039216 (894)	total: 16.4s	remaining: 1m 14s
1000:	learn: 0.8769807	test: 0.8039216	best: 0.8062284 (901)	tota