### 1. 데이터 불러오기 및 전처리

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the raw transaction CSV into a DataFrame.
# NOTE(review): hard-coded absolute path (presumably a Colab runtime location) —
# adjust it when running in another environment.
df = pd.read_csv('/content/sample_data/bank_transactions_data_2.csv')

In [3]:
# Print the schema summary (column names, non-null counts, dtypes) ...
df.info()
# ... then display the first five rows as the cell's output.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            2512 non-null   object 
 1   AccountID                2512 non-null   object 
 2   TransactionAmount        2512 non-null   float64
 3   TransactionDate          2512 non-null   object 
 4   TransactionType          2512 non-null   object 
 5   Location                 2512 non-null   object 
 6   DeviceID                 2512 non-null   object 
 7   IP Address               2512 non-null   object 
 8   MerchantID               2512 non-null   object 
 9   Channel                  2512 non-null   object 
 10  CustomerAge              2512 non-null   int64  
 11  CustomerOccupation       2512 non-null   object 
 12  TransactionDuration      2512 non-null   int64  
 13  LoginAttempts            2512 non-null   int64  
 14  AccountBalance          

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04 08:06:39


In [4]:
# Replace spaces in column names with underscores (e.g. "IP Address" -> "IP_Address").
df.columns = df.columns.str.replace(" ", "_")

# Parse both timestamp columns to datetimes; errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
df["TransactionDate"] = pd.to_datetime(df["TransactionDate"], errors="coerce")
df["PreviousTransactionDate"] = pd.to_datetime(df["PreviousTransactionDate"], errors="coerce")

In [5]:
# Numeric feature columns that are standardized here.
numeric_cols = [
    "TransactionAmount",
    "CustomerAge",
    "TransactionDuration",
    "LoginAttempts",
    "AccountBalance",
]

# Fit the scaler on the numeric features and wrap the resulting array in a
# DataFrame; TransactionID is attached positionally so each scaled row can be
# traced back to its transaction.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
).assign(TransactionID=df["TransactionID"].values)

In [6]:
# Preview the first five rows of the scaled features with their TransactionID.
df_scaled.head()

Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,TransactionID
0,-0.971275,1.423718,-0.552443,-0.206794,-0.000537,TX000001
1,0.26944,1.311287,0.305314,-0.206794,2.216472,TX000002
2,-0.586882,-1.443277,-0.909842,-0.206794,-1.023534,TX000003
3,-0.387456,-1.049768,-1.353017,-0.206794,0.885797,TX000004
4,-0.973468,-1.049768,1.120184,-0.206794,0.593589,TX000005


### 2. 모델 구성 및 예측

In [7]:
from sklearn.ensemble import IsolationForest

# Build the feature matrix from everything except the ID column. The "anomaly"
# column is excluded as well (if it already exists) so that re-running this cell
# does not feed the previous run's labels back into the model as a feature.
X = df_scaled.drop("TransactionID", axis=1).drop(columns="anomaly", errors="ignore")

# contamination=0.1 asks the forest to flag roughly 10% of the rows;
# random_state is fixed so the result is reproducible.
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
model.fit(X)

# predict() returns 1 for inliers and -1 for outliers; recode so anomalies are 1.
df_scaled["anomaly"] = model.predict(X)
df_scaled["anomaly"] = df_scaled["anomaly"].map({1: 0, -1: 1})  # 1 = anomaly, 0 = normal

# Show how many rows fall into each class.
df_scaled["anomaly"].value_counts()

Unnamed: 0_level_0,count
anomaly,Unnamed: 1_level_1
0,2260
1,252


### 3. 결과 저장 및 필터링

In [8]:
# Scaled rows that the model flagged as anomalous.
anomaly_df = df_scaled.loc[df_scaled["anomaly"] == 1]

# Left-join the flag onto the original (unscaled) transactions. Rows without a
# match come back as NaN, which is turned into 0 before casting to int.
df_merged = df.merge(
    anomaly_df[["TransactionID", "anomaly"]],
    how="left",
    on="TransactionID",
).assign(anomaly=lambda merged: merged["anomaly"].fillna(0).astype(int))

# Export the anomalies-only subset first ...
anomaly_df_final = df_merged.loc[df_merged["anomaly"] == 1]
anomaly_df_final.to_csv("anomaly_transactions.csv", index=False)

# ... then the complete dataset carrying the 0/1 flag.
df_merged.to_csv("transactions_with_anomaly_flag.csv", index=False)

### 4. 자동 실행 구조화

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

def load_and_preprocess(path, numeric_cols=None):
    """Load the transaction CSV and build a standardized feature frame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    numeric_cols : iterable of str, optional
        Names of the columns to standardize, spelled as they appear *after*
        spaces in the header have been replaced by underscores. Defaults to
        ``TransactionAmount``, ``CustomerAge``, ``TransactionDuration``,
        ``LoginAttempts`` and ``AccountBalance``.

    Returns
    -------
    df : pandas.DataFrame
        The loaded data with underscores instead of spaces in column names and
        ``TransactionDate`` / ``PreviousTransactionDate`` parsed to datetimes
        (values that cannot be parsed become ``NaT``).
    df_scaled : pandas.DataFrame
        ``StandardScaler`` output for ``numeric_cols`` plus a ``TransactionID``
        column copied positionally from ``df``.

    Raises
    ------
    KeyError
        If a date column, ``TransactionID`` or one of ``numeric_cols`` is
        missing from the file.
    """
    if numeric_cols is None:
        numeric_cols = [
            "TransactionAmount", "CustomerAge", "TransactionDuration",
            "LoginAttempts", "AccountBalance"
        ]
    else:
        # Accept any iterable (tuple, Index, ...) and avoid aliasing the caller's object.
        numeric_cols = list(numeric_cols)

    df = pd.read_csv(path)
    df.columns = df.columns.str.replace(" ", "_")

    # errors="coerce" converts unparsable timestamps to NaT instead of raising.
    for date_col in ("TransactionDate", "PreviousTransactionDate"):
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df[numeric_cols]), columns=numeric_cols)
    # .values assigns by position, so the IDs line up with the scaled rows
    # regardless of the index labels.
    df_scaled["TransactionID"] = df["TransactionID"].values
    return df, df_scaled

def detect_anomalies(df_scaled, contamination=0.1, n_estimators=100, random_state=42):
    """Flag anomalous rows with an Isolation Forest.

    Parameters
    ----------
    df_scaled : pandas.DataFrame
        Feature frame containing a ``TransactionID`` column; every other column
        (except an existing ``anomaly`` column) is used as a model feature.
    contamination : float, default 0.1
        Passed to ``IsolationForest`` as ``contamination``.
    n_estimators : int, default 100
        Passed to ``IsolationForest`` as ``n_estimators``.
    random_state : int, default 42
        Passed to ``IsolationForest`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        The same ``df_scaled`` object, with an ``anomaly`` column written in
        place (1 = anomaly, 0 = normal).

    Raises
    ------
    KeyError
        If ``df_scaled`` has no ``TransactionID`` column.
    """
    # Drop the ID, and also a previously written "anomaly" column if present, so
    # calling this function twice on the same frame does not train on the
    # labels produced by the first call.
    X = df_scaled.drop("TransactionID", axis=1).drop(columns="anomaly", errors="ignore")

    model = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=random_state,
    )
    model.fit(X)

    # predict() returns 1 for inliers and -1 for outliers; recode so anomalies are 1.
    df_scaled["anomaly"] = model.predict(X)
    df_scaled["anomaly"] = df_scaled["anomaly"].map({1: 0, -1: 1})
    return df_scaled

def save_results(
    df,
    df_scaled,
    anomaly_path="anomaly_transactions.csv",
    flagged_path="transactions_with_anomaly_flag.csv",
):
    """Attach the anomaly flag to the original transactions and write two CSVs.

    Parameters
    ----------
    df : pandas.DataFrame
        Original transactions; must contain a ``TransactionID`` column.
    df_scaled : pandas.DataFrame
        Frame holding ``TransactionID`` and ``anomaly`` columns.
    anomaly_path : str or path-like, default "anomaly_transactions.csv"
        Destination for the rows whose flag is 1.
    flagged_path : str or path-like, default "transactions_with_anomaly_flag.csv"
        Destination for all rows including the ``anomaly`` column.

    Returns
    -------
    None

    Raises
    ------
    pandas.errors.MergeError
        If ``TransactionID`` is not unique in ``df_scaled``; without this check
        the join would silently duplicate transactions in the output.
    KeyError
        If a required column is missing.
    """
    flags = df_scaled[["TransactionID", "anomaly"]]

    # Left join keeps every original transaction; validate= guards against a
    # non-unique key on the flag side multiplying rows.
    df_merged = pd.merge(df, flags, on="TransactionID", how="left", validate="many_to_one")

    # Transactions with no matching flag come back as NaN -> treat them as normal (0).
    df_merged["anomaly"] = df_merged["anomaly"].fillna(0).astype(int)

    anomaly_df = df_merged[df_merged["anomaly"] == 1]
    anomaly_df.to_csv(anomaly_path, index=False)
    df_merged.to_csv(flagged_path, index=False)

# Entry point: run the whole pipeline end to end.
# NOTE: inside a notebook kernel __name__ is "__main__", so this block executes
# whenever the cell is run.
if __name__ == "__main__":
    # Load the CSV and build the scaled feature frame.
    df, df_scaled = load_and_preprocess('/content/sample_data/bank_transactions_data_2.csv')
    # Add the 0/1 "anomaly" column to the scaled frame.
    df_scaled = detect_anomalies(df_scaled)
    # Merge the flag onto the original rows and write the CSV outputs.
    save_results(df, df_scaled)