In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

# Загрузка очищенных данных

In [2]:
df = pd.read_csv("normal.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53079 entries, 0 to 53078
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       53079 non-null  int64  
 1   transaction_id   53079 non-null  int64  
 2   ip               53079 non-null  int64  
 3   device_id        53079 non-null  float64
 4   device_type      53079 non-null  object 
 5   tran_code        53079 non-null  int64  
 6   mcc              53079 non-null  int64  
 7   client_id        53079 non-null  int64  
 8   card_type        53079 non-null  object 
 9   pin_inc_count    53079 non-null  int64  
 10  card_status      53079 non-null  int64  
 11  expiration_date  53079 non-null  object 
 12  datetime         53079 non-null  object 
 13  sum              53079 non-null  float64
 14  oper_type        53079 non-null  object 
 15  balance          53079 non-null  float64
dtypes: float64(3), int64(8), object(5)
memory usage: 6.5+ MB


# Предобработка данных

In [3]:
to_drop = ['Unnamed: 0', 'expiration_date', 'datetime']
need = []

for col in df.columns:
    if col not in to_drop:
        need.append(col)

df = df[need]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53079 entries, 0 to 53078
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  53079 non-null  int64  
 1   ip              53079 non-null  int64  
 2   device_id       53079 non-null  float64
 3   device_type     53079 non-null  object 
 4   tran_code       53079 non-null  int64  
 5   mcc             53079 non-null  int64  
 6   client_id       53079 non-null  int64  
 7   card_type       53079 non-null  object 
 8   pin_inc_count   53079 non-null  int64  
 9   card_status     53079 non-null  int64  
 10  sum             53079 non-null  float64
 11  oper_type       53079 non-null  object 
 12  balance         53079 non-null  float64
dtypes: float64(3), int64(7), object(3)
memory usage: 5.3+ MB


In [4]:
from sklearn.preprocessing import LabelEncoder

df_label = df.copy()

rare_threshold = 2500

for col in df.select_dtypes(include='object').columns:
    print(col)
    
    value_counts = df_label[col].value_counts()
    rare_values = value_counts[value_counts < rare_threshold].index
    df_label[col] = df_label[col].apply(lambda x: 'SMALL' if x in rare_values else x)
    
    label_encoder = LabelEncoder()
    df_label[col] = label_encoder.fit_transform(df_label[col])

device_type
card_type
oper_type


# Поиск аномалий

In [5]:
model_if = IsolationForest(**{'n_estimators': 50, 'max_samples': 0.1, 'contamination': 0.02, 'max_features': 1.0, 'bootstrap': False, 'n_jobs': -1, 'random_state': 100, 'warm_start': False})
model_if.fit(df_label)

In [6]:
clf = LocalOutlierFactor(n_neighbors=200, contamination=0.00013)

In [7]:
cov = EllipticEnvelope(random_state=100, contamination=0.00013).fit(df_label)



In [8]:
df_stat = df_label.copy()

df_stat['scores_if'] = model_if.decision_function(df_label)
df_stat['scores_cov'] = cov.decision_function(df_label)
df_stat['anomaly_if'] = model_if.predict(df_label)
df_stat['anomaly_lof'] = clf.fit_predict(df_label)
df_stat['anomaly_cov'] = cov.predict(df_label)

In [9]:
anomaly = df_stat.loc[((df_stat['anomaly_if'] == -1) & (df_stat['scores_if'] < -0.1)) | (df_stat['anomaly_lof'] == -1) |
((df_stat['anomaly_cov'] == -1) & (df_stat['scores_cov'] <= -20))]

anomaly_index = list(anomaly.index)

In [10]:
res = []
for i in df.index:
    if i in anomaly_index:
        res.append(True)
    else:
        res.append(False)

# Обучение модели CatBoostClassifier

In [11]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report



X_train, X_test, y_train, y_test = train_test_split(df_label, res, test_size=0.2, random_state=100)

# Применение SMOTE для увеличения примеров для редкого класса
smote = SMOTE(random_state=100)
X_train, y_train = smote.fit_resample(X_train, y_train)
X_test, y_test = smote.fit_resample(X_test, y_test)


model = CatBoostClassifier(iterations=5,
                           depth=5,
                           learning_rate=0.1,
                           loss_function='Logloss',
                           verbose=True,
                           subsample=0.8,
                           colsample_bylevel=0.8,
                           l2_leaf_reg=20,
                           random_strength=0.6,
                           )


model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=True, early_stopping_rounds=10)

print(classification_report(y_test, model.predict(X_test)))

0:	learn: 0.5990894	test: 0.5998234	best: 0.5998234 (0)	total: 64.8ms	remaining: 259ms
1:	learn: 0.5225436	test: 0.5279402	best: 0.5279402 (1)	total: 68.6ms	remaining: 103ms
2:	learn: 0.4597587	test: 0.4690563	best: 0.4690563 (2)	total: 71.8ms	remaining: 47.9ms
3:	learn: 0.4061525	test: 0.4181278	best: 0.4181278 (3)	total: 74.5ms	remaining: 18.6ms
4:	learn: 0.3598136	test: 0.3742210	best: 0.3742210 (4)	total: 77.2ms	remaining: 0us

bestTest = 0.3742209916
bestIteration = 4

              precision    recall  f1-score   support

       False       0.97      1.00      0.98     10595
        True       1.00      0.96      0.98     10595

    accuracy                           0.98     21190
   macro avg       0.98      0.98      0.98     21190
weighted avg       0.98      0.98      0.98     21190



## Сохранение модели в "catboost_model.bin"

In [12]:
model.save_model('catboost_model.bin')

In [13]:
def use_model(path_to_data: str):
    df = pd.read_csv(path_to_data)
    df_label = df.copy()
    for col in df.select_dtypes(include='object').columns:
        value_counts = df_label[col].value_counts()
        rare_values = value_counts[value_counts < rare_threshold].index
        df_label[col] = df_label[col].apply(lambda x: 'SMALL' if x in rare_values else x)

        label_encoder = LabelEncoder()
        df_label[col] = label_encoder.fit_transform(df_label[col])
        
    return model.predict(df_label)

# Сохранение результата "preds.csv"

In [17]:
df['res'] = use_model("normal.csv")

anomaly = df[df['res'] == True]
anomaly_index = list(anomaly.index)

anomaly.to_csv("anomaly.csv")

res = []
for i in df.index:
    if i in anomaly_index:
        res.append('True')
    else:
        res.append('False')

with open("preds.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(res))
    
print("Результат сохранен в -> preds.csv")

Результат сохранен в -> preds.csv
