In [1]:
import pandas as pd
import catboost
from catboost import CatBoostClassifier

In [2]:
df = pd.read_excel('./data/Task-2/train.xlsx')

X= df[['text']]
y = df.label

In [3]:
X

Unnamed: 0,text
0,Fiskars has a strong portfolio of internationa...
1,METALS-Zinc surges 12 pct after Glencore cuts ...
2,"According to Scanfil , demand for telecommunic..."
3,dbs launches new banking api developer platfor...
4,Theodosopoulos said Tellabs could be of value ...
...,...
4333,Airvana 's UMTS Home Base Station femto cell u...
4334,malton 1q net profit jumps four times on gain ...
4335,"According to CEO Kai Telanne , the company 's ..."
4336,"In addition , Cramo and Peab have signed exclu..."


In [4]:
def fit_model(train_pool, test_pool, **kwargs):
    model = CatBoostClassifier(
        task_type='CPU',
        iterations=5000,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=500,
        **kwargs
    )
    return model.fit(
            train_pool,
            eval_set=test_pool,
            verbose=100,
            plot=True,
            use_best_model=True)

In [5]:
import numpy as np
from sklearn.model_selection import KFold

# Set up the KFold object with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize a list to store the accuracies for each fold
accuracies = []

# Iterate over each fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")

    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Create the CatBoost training and testing pools
    train_pool = catboost.Pool(X_train, y_train, text_features=['text'])
    test_pool = catboost.Pool(X_test, y_test, text_features=['text'])

    # Fit the model on the training data and evaluate on the testing data
    model = fit_model(train_pool, test_pool,
                      tokenizers=[
                          {
                              'tokenizer_id': 'Sense',
                              'separator_type': 'BySense',
                              'lowercasing': 'True',
                              'token_types': ['Word', 'Number', 'SentenceBreak'],
                              'sub_tokens_policy':'SeveralTokens'
                          }
                      ],
                      dictionaries=[
                          {
                              'dictionary_id': 'Word',
                              'max_dictionary_size': '50000'
                          }
                      ],
                      feature_calcers=[
                          'BoW:top_tokens_count=10000'
                      ])
    accuracy = model.score(test_pool)
    accuracies.append(accuracy)

# Print the average accuracy across all folds
print(f"Average accuracy: {np.mean(accuracies)}")


Fold 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021395
0:	learn: 0.6691643	test: 0.6624424	best: 0.6624424 (0)	total: 79.5ms	remaining: 6m 37s
100:	learn: 0.7164265	test: 0.7039171	best: 0.7039171 (100)	total: 2.38s	remaining: 1m 55s
200:	learn: 0.7371758	test: 0.7200461	best: 0.7211982 (151)	total: 4.61s	remaining: 1m 50s
300:	learn: 0.7648415	test: 0.7338710	best: 0.7350230 (295)	total: 6.85s	remaining: 1m 46s
400:	learn: 0.7870317	test: 0.7592166	best: 0.7592166 (400)	total: 9.09s	remaining: 1m 44s
500:	learn: 0.8123919	test: 0.7695853	best: 0.7695853 (499)	total: 11.3s	remaining: 1m 41s
600:	learn: 0.8302594	test: 0.7811060	best: 0.7811060 (586)	total: 13.6s	remaining: 1m 39s
700:	learn: 0.8435159	test: 0.7822581	best: 0.7880184 (670)	total: 15.8s	remaining: 1m 36s
800:	learn: 0.8541787	test: 0.7880184	best: 0.7880184 (670)	total: 17.9s	remaining: 1m 34s
900:	learn: 0.8665706	test: 0.7960829	best: 0.7972350 (873)	total: 20.2s	remaining: 1m 31s
1000:	learn: 0.8769452	test: 0.7983871	best: 0.7983871 (907)	to

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021395
0:	learn: 0.6553314	test: 0.6382488	best: 0.6382488 (0)	total: 29.9ms	remaining: 2m 29s
100:	learn: 0.7126801	test: 0.6947005	best: 0.6981567 (95)	total: 2.25s	remaining: 1m 49s
200:	learn: 0.7469741	test: 0.7223502	best: 0.7235023 (186)	total: 4.43s	remaining: 1m 45s
300:	learn: 0.7671470	test: 0.7315668	best: 0.7327189 (236)	total: 6.66s	remaining: 1m 43s
400:	learn: 0.7881844	test: 0.7373272	best: 0.7396313 (389)	total: 8.79s	remaining: 1m 40s
500:	learn: 0.8095101	test: 0.7488479	best: 0.7488479 (465)	total: 11s	remaining: 1m 38s
600:	learn: 0.8279539	test: 0.7615207	best: 0.7615207 (595)	total: 13.2s	remaining: 1m 36s
700:	learn: 0.8391931	test: 0.7695853	best: 0.7707373 (688)	total: 15.4s	remaining: 1m 34s
800:	learn: 0.8527378	test: 0.7730415	best: 0.7741935 (774)	total: 17.6s	remaining: 1m 32s
900:	learn: 0.8631124	test: 0.7764977	best: 0.7776498 (880)	total: 19.8s	remaining: 1m 30s
1000:	learn: 0.8731988	test: 0.7764977	best: 0.7776498 (880)	total

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021395
0:	learn: 0.6801153	test: 0.6474654	best: 0.6474654 (0)	total: 21.8ms	remaining: 1m 49s
100:	learn: 0.7233429	test: 0.6843318	best: 0.6866359 (93)	total: 2.28s	remaining: 1m 50s
200:	learn: 0.7443804	test: 0.6970046	best: 0.6981567 (179)	total: 4.44s	remaining: 1m 46s
300:	learn: 0.7691643	test: 0.7062212	best: 0.7062212 (299)	total: 6.55s	remaining: 1m 42s
400:	learn: 0.7881844	test: 0.7165899	best: 0.7188940 (384)	total: 8.71s	remaining: 1m 39s
500:	learn: 0.8097983	test: 0.7338710	best: 0.7338710 (489)	total: 10.9s	remaining: 1m 37s
600:	learn: 0.8299712	test: 0.7442396	best: 0.7453917 (588)	total: 13s	remaining: 1m 35s
700:	learn: 0.8438040	test: 0.7453917	best: 0.7476959 (649)	total: 15.2s	remaining: 1m 33s
800:	learn: 0.8538905	test: 0.7603687	best: 0.7615207 (777)	total: 17.4s	remaining: 1m 31s
900:	learn: 0.8613833	test: 0.7626728	best: 0.7638249 (820)	total: 19.6s	remaining: 1m 29s
1000:	learn: 0.8711816	test: 0.7695853	best: 0.7695853 (994)	total

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021397
0:	learn: 0.6551426	test: 0.7047290	best: 0.7047290 (0)	total: 23.2ms	remaining: 1m 55s
100:	learn: 0.7179487	test: 0.7450980	best: 0.7450980 (87)	total: 2.24s	remaining: 1m 48s
200:	learn: 0.7378277	test: 0.7508651	best: 0.7520185 (183)	total: 4.41s	remaining: 1m 45s
300:	learn: 0.7579948	test: 0.7612457	best: 0.7623991 (294)	total: 6.58s	remaining: 1m 42s
400:	learn: 0.7767214	test: 0.7670127	best: 0.7670127 (381)	total: 8.81s	remaining: 1m 41s
500:	learn: 0.8075483	test: 0.7831603	best: 0.7831603 (499)	total: 11s	remaining: 1m 38s
600:	learn: 0.8265630	test: 0.7946943	best: 0.7970012 (575)	total: 13.2s	remaining: 1m 36s
700:	learn: 0.8406799	test: 0.8050750	best: 0.8062284 (691)	total: 15.4s	remaining: 1m 34s
800:	learn: 0.8501873	test: 0.8119954	best: 0.8131488 (791)	total: 17.6s	remaining: 1m 32s
900:	learn: 0.8588303	test: 0.8177624	best: 0.8177624 (894)	total: 19.8s	remaining: 1m 30s
1000:	learn: 0.8706425	test: 0.8200692	best: 0.8212226 (958)	total

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.021397
0:	learn: 0.6617689	test: 0.6678201	best: 0.6678201 (0)	total: 21.1ms	remaining: 1m 45s
100:	learn: 0.7110343	test: 0.7070358	best: 0.7093426 (95)	total: 2.19s	remaining: 1m 46s
200:	learn: 0.7395563	test: 0.7358708	best: 0.7358708 (187)	total: 4.4s	remaining: 1m 44s
300:	learn: 0.7695189	test: 0.7474048	best: 0.7474048 (299)	total: 6.55s	remaining: 1m 42s
400:	learn: 0.7905503	test: 0.7600923	best: 0.7612457 (392)	total: 8.78s	remaining: 1m 40s
500:	learn: 0.8095650	test: 0.7681661	best: 0.7693195 (483)	total: 11s	remaining: 1m 38s
600:	learn: 0.8248343	test: 0.7716263	best: 0.7762399 (565)	total: 13.2s	remaining: 1m 36s
700:	learn: 0.8372227	test: 0.7785467	best: 0.7820069 (670)	total: 15.4s	remaining: 1m 34s
800:	learn: 0.8498992	test: 0.7773933	best: 0.7820069 (670)	total: 17.7s	remaining: 1m 32s
900:	learn: 0.8619994	test: 0.7843137	best: 0.7843137 (883)	total: 20.1s	remaining: 1m 31s
1000:	learn: 0.8723711	test: 0.7854671	best: 0.7854671 (976)	total: