In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import networkx as nx


In [2]:
# 1. Veri Yükleme ve Genel Keşif
file_path = 'Fraud.csv'
data = pd.read_csv(file_path, usecols=['nameOrig', 'nameDest', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'type'])

# Gerekli sütunları kontrol et ve veri seti boyutunu yazdır
print("Veri seti boyutu:", data.shape)
print(data.head())

# Eksik değerleri kontrol et
print("Eksik değerler:\n", data.isnull().sum())



Veri seti boyutu: (6362620, 9)
       type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2  TRANSFER    181.00  C1305486145          181.0            0.00   
3  CASH_OUT    181.00   C840083671          181.0            0.00   
4   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  
0  M1979787155             0.0             0.0        0  
1  M2044282225             0.0             0.0        0  
2   C553264065             0.0             0.0        1  
3    C38997010         21182.0             0.0        1  
4  M1230701703             0.0             0.0        0  
Eksik değerler:
 type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: in

In [3]:
# 2. Veri Setini Eğitim ve Test Olarak Böl
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [4]:
# 3. Grafik Tabanlı Analiz (Bellek Tüketimini Azaltarak)
# Graf oluşturma ve merkezlilik değerlerini parçalara bölerek hesaplama
def calculate_malatya_centrality(graph):
    centrality = {}
    for node in graph.nodes():
        neighbors = list(graph.neighbors(node))
        if len(neighbors) == 0:
            centrality[node] = 0
        else:
            degree = graph.degree(node)
            centrality[node] = sum(degree / graph.degree(neighbor) for neighbor in neighbors)
    return centrality

def build_graph_in_chunks(data_chunk):
    graph = nx.DiGraph()
    for _, row in data_chunk.iterrows():
        if row['amount'] > 0:
            graph.add_edge(row['nameOrig'], row['nameDest'], amount=row['amount'])
    return graph



In [5]:
# Eğitim verisini parçalara ayırarak grafa dönüştürme
chunk_size = 10000  # Parça boyutunu belirle
graph = nx.DiGraph()

for i in range(0, len(train_data), chunk_size):
    chunk = train_data.iloc[i:i + chunk_size]
    graph_chunk = build_graph_in_chunks(chunk)
    graph.add_edges_from(graph_chunk.edges(data=True))

# Malatya merkezlilik hesaplaması
malatya_centrality = calculate_malatya_centrality(graph)

# Eğitim verisine Malatya merkezlilik değerlerini ekleme
train_data['malatya_centrality'] = train_data['nameOrig'].map(malatya_centrality).fillna(0)



In [6]:
# 4. Model Eğitimi ve Performans Değerlendirme
features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'malatya_centrality']
X_train = train_data[features]
y_train = train_data['isFraud']

# Test verisi hazırlanırken bellek tüketimini azaltmak için 'malatya_centrality' değerleri sıfır kabul ediliyor
test_data['malatya_centrality'] = 0
X_test = test_data[features]
y_test = test_data['isFraud']

# Model eğitimi
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Test seti üzerinde tahmin yapma
y_pred = model.predict(X_test)


In [7]:
# Performans metriklerini hesaplama
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Performans Metrikleri:")
print(f"Doğruluk: {accuracy:.2f}")
print(f"Kesinlik: {precision:.2f}")
print(f"Duyarlılık: {recall:.2f}")
print(f"F1 Skoru: {f1:.2f}")
print("\nSınıflandırma Raporu:\n", classification_report(y_test, y_pred))


Performans Metrikleri:
Doğruluk: 0.43
Kesinlik: 0.00
Duyarlılık: 0.98
F1 Skoru: 0.00

Sınıflandırma Raporu:
               precision    recall  f1-score   support

           0       1.00      0.43      0.60   1270904
           1       0.00      0.98      0.00      1620

    accuracy                           0.43   1272524
   macro avg       0.50      0.70      0.30   1272524
weighted avg       1.00      0.43      0.60   1272524

