## Gradient boosting

In [9]:
# Read the raw datathon CSV into `df`.
import pandas as pd

df = pd.read_csv("datathon_data.csv")

# Display every column of the row at integer position 12939.
df.iloc[12939, :]

BELNR            12939
WAERS               C1
BUKRS              C20
KTOSL               C1
PRCTR              C18
BSCHL               A1
HKONT               B1
DMBTR    910658.284578
WRBTR      54449.83882
label           anomal
Name: 12939, dtype: object

In [13]:
# Display the full rows at these four integer positions. The previous trailing
# `['']` selected a column named '' (empty string), which is not one of the
# columns shown in the outputs and therefore raises KeyError.
df.iloc[[506926, 336799, 93972, 341778]]

Unnamed: 0,BELNR,WAERS,BUKRS,KTOSL,PRCTR,BSCHL,HKONT,DMBTR,WRBTR,label
506926,506926,U72,D58,E35,Y05,D53,F61,92445510.0,59585050.0,anomal
336799,336799,N15,G51,G09,S21,E99,W62,92445530.0,59585050.0,anomal
93972,93972,L82,G45,G19,W37,A49,U45,92445500.0,59585040.0,anomal
341778,341778,B39,H97,S57,N58,F04,Q93,92445520.0,59585050.0,anomal


Unnamed: 0,BELNR,WAERS,BUKRS,KTOSL,PRCTR,BSCHL,HKONT,DMBTR,WRBTR,label
12939,12939,C1,C20,C1,C18,A1,B1,9.106583e+05,5.444984e+04,anomal
32317,32317,C1,C11,C1,C53,A1,B1,9.106530e+05,5.443921e+04,anomal
33365,33365,W59,C13,P83,I73,I05,Q99,9.244552e+07,5.958504e+07,anomal
34058,34058,C1,C11,C1,C91,A1,B1,9.106689e+05,5.444086e+04,anomal
34059,34059,C1,C11,C1,C64,A1,B1,9.106316e+05,5.444328e+04,anomal
...,...,...,...,...,...,...,...,...,...,...
506926,506926,U72,D58,E35,Y05,D53,F61,9.244551e+07,5.958505e+07,anomal
507636,507636,C1,C11,C1,C70,A1,B1,9.106766e+05,5.444176e+04,anomal
528449,528449,C89,S43,E40,Y34,L29,N28,9.244553e+07,5.958504e+07,anomal
528759,528759,P36,V48,Z17,J68,Q50,J62,9.244554e+07,5.958504e+07,anomal


In [2]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler

# Read the CSV into `df`.
df = pd.read_csv('datathon_data.csv')

# Binary target: 1 where label is "anomal", 0 for every other value.
df["label_encoded"] = df["label"].eq("anomal").astype("int64")

# Fit a StandardScaler on the two amount columns and overwrite them in `df`
# with the scaled values.
amount_columns = ['DMBTR', 'WRBTR']
scaler = StandardScaler()
X_normalized = scaler.fit_transform(df[amount_columns].values)
df[amount_columns] = X_normalized

df


Unnamed: 0,BELNR,WAERS,BUKRS,KTOSL,PRCTR,BSCHL,HKONT,DMBTR,WRBTR,label,label_encoded
0,0,C3,C31,C9,C92,A3,B1,-0.298142,-0.076700,regular,0
1,1,C1,C18,C7,C76,A1,B2,-0.368357,0.223083,regular,0
2,2,C1,C19,C2,C20,A1,B3,0.016167,3.845589,regular,0
3,3,C4,C48,C9,C95,A2,B1,0.817289,-0.041248,regular,0
4,4,C5,C58,C1,C19,A3,B1,-0.005647,-0.076274,regular,0
...,...,...,...,...,...,...,...,...,...,...,...
533004,533004,C1,C18,C3,C32,A1,B2,0.733601,-0.076700,regular,0
533005,533005,C8,C80,C1,C11,A1,B1,-0.247454,-0.061837,regular,0
533006,533006,C1,C10,C1,C19,A1,B1,-0.339416,0.325925,regular,0
533007,533007,C1,C14,C4,C40,A1,B3,-0.024220,-0.076700,regular,0


In [6]:
# Keep only the rows whose label is "anomal".
is_anomal = df['label'].eq('anomal')
df.loc[is_anomal]

Unnamed: 0,BELNR,WAERS,BUKRS,KTOSL,PRCTR,BSCHL,HKONT,DMBTR,WRBTR,label,label_encoded
12939,12939,C1,C20,C1,C18,A1,B1,-0.005580,-0.009622,anomal,1
32317,32317,C1,C11,C1,C53,A1,B1,-0.005583,-0.009635,anomal,1
33365,33365,W59,C13,P83,I73,I05,Q99,42.523496,73.328327,anomal,1
34058,34058,C1,C11,C1,C91,A1,B1,-0.005575,-0.009633,anomal,1
34059,34059,C1,C11,C1,C64,A1,B1,-0.005592,-0.009630,anomal,1
...,...,...,...,...,...,...,...,...,...,...,...
506926,506926,U72,D58,E35,Y05,D53,F61,42.523494,73.328346,anomal,1
507636,507636,C1,C11,C1,C70,A1,B1,-0.005572,-0.009632,anomal,1
528449,528449,C89,S43,E40,Y34,L29,N28,42.523501,73.328334,anomal,1
528759,528759,P36,V48,Z17,J68,Q50,J62,42.523506,73.328332,anomal,1


In [8]:
# Display every column of the row at integer position 12939.
df.iloc[12939, :]

BELNR               12939
WAERS                  C1
BUKRS                 C20
KTOSL                  C1
PRCTR                 C18
BSCHL                  A1
HKONT                  B1
DMBTR            -0.00558
WRBTR           -0.009622
label              anomal
label_encoded           1
Name: 12939, dtype: object

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed NumPy's global RNG; the split and the model are also seeded explicitly
# below via random_state / random_seed.
np.random.seed(42)

# Categorical feature columns, listed by name so the selection does not
# silently shift if the column layout of `df` changes.
cat_feature_names = ['WAERS', 'BUKRS', 'KTOSL', 'PRCTR', 'BSCHL', 'HKONT']

# Build features and target. BOTH target columns must be removed from the
# features: `label_encoded` is derived directly from `label`, so leaving the
# raw string `label` in X leaks the target. It would also sit outside the
# declared categorical columns and be treated as numeric despite holding
# strings.
# NOTE(review): BELNR equals the row index in every displayed row, so it looks
# like a per-row identifier; consider excluding it from the features - confirm.
X = df.drop(columns=['label', 'label_encoded'])
y = df['label_encoded']

# Split dataset into train and test, stratified on the target so both splits
# keep the same proportion of anomalies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Positional indices of the categorical columns, derived from their names.
cat_features_indices = [X.columns.get_loc(name) for name in cat_feature_names]

# Train CatBoost classifier (no need for one-hot encoding)
catboost_clf = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    cat_features=cat_features_indices,
    verbose=100,
    random_seed=42
)

catboost_clf.fit(X_train, y_train)

# Predictions
y_pred = catboost_clf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)


ModuleNotFoundError: No module named 'catboost'