In [3]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [4]:
# Load the two raw tables; the relative paths mean both CSV files must sit in
# the notebook's working directory.
df_identities = pd.read_csv('train_identity.csv')
df_transaction = pd.read_csv('train_transaction.csv')
# Preview the first rows of the transaction table as the cell output.
df_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Left join on TransactionID: every transaction row is kept, and identity
# columns are NaN for transactions that have no matching identity record.
# NOTE(review): assumes TransactionID is unique in df_identities; duplicates
# there would silently multiply transaction rows -- confirm.
data = pd.merge(df_transaction, df_identities, on='TransactionID', how='left')

In [6]:
def preprocessing_data(data, test_size=0.2, random_state=42):
    """Split, clean, encode and rebalance the merged transaction data.

    Pipeline:
      1. Separate the ``isFraud`` target and make a stratified train/test split.
      2. Numerical columns (``float64``/``int64``): keep those with at least
         80% non-missing values, impute the mean, standardize.
      3. Categorical columns (``object``): keep those with at least 50%
         non-missing values, impute the most frequent value, one-hot encode.
      4. Rebalance the training split only: SMOTE up to a 0.3 minority/majority
         ratio, followed by random under-sampling of the majority class.

    The split happens *before* any statistic is computed: column selection,
    imputers, scaler and encoder are all fitted on the training rows and only
    applied to the test rows, so no test-set information leaks into training.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table that contains the binary ``isFraud`` column.
    test_size : float, default 0.2
        Fraction of rows held out as the test split.
    random_state : int or None, default 42
        Seed for the train/test split, so repeated runs give the same split.

    Returns
    -------
    X_resampled, y_resampled, X_test, y_test
        Resampled training features/labels and the untouched (not resampled)
        test features/labels. Feature columns are the retained numerical
        column names followed by ``"0"``, ``"1"``, ... for the one-hot columns.
    """
    is_fraud = data['isFraud']
    features = data.drop(columns=['isFraud'])

    # Split first; everything below is fitted on raw_train only.
    raw_train, raw_test, y_train, y_test = train_test_split(
        features, is_fraud,
        test_size=test_size, stratify=is_fraud, random_state=random_state,
    )

    # --- Numerical attributes -------------------------------------------
    numerical_train = raw_train.select_dtypes(include=['float64', 'int64'])
    # Drop attributes with more than 20% missing values (dropna expects an
    # integer count of required non-missing entries).
    min_numeric_count = int(np.ceil(len(numerical_train) * 0.8))
    numerical_train = numerical_train.dropna(thresh=min_numeric_count, axis=1)
    numerical_columns = numerical_train.columns
    numeric_imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    numerical_train = scaler.fit_transform(numeric_imputer.fit_transform(numerical_train))
    numerical_test = scaler.transform(numeric_imputer.transform(raw_test[numerical_columns]))

    # --- Categorical attributes -----------------------------------------
    categorical_train = raw_train.select_dtypes(include=['object'])
    # Drop attributes with more than 50% missing values.
    min_categorical_count = int(np.ceil(len(categorical_train) * 0.5))
    categorical_train = categorical_train.dropna(thresh=min_categorical_count, axis=1)
    categorical_columns = categorical_train.columns
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    # Categories that only occur in the test split are encoded as all zeros
    # instead of raising, because the encoder never saw them during fit.
    encoder = OneHotEncoder(handle_unknown='ignore')
    categorical_train = encoder.fit_transform(
        categorical_imputer.fit_transform(categorical_train)
    ).toarray()
    categorical_test = encoder.transform(
        categorical_imputer.transform(raw_test[categorical_columns])
    ).toarray()

    def _assemble(numerical, categorical, index):
        # Re-attach the original row index so features stay aligned with labels.
        frame = pd.concat(
            [
                pd.DataFrame(numerical, columns=numerical_columns, index=index),
                pd.DataFrame(categorical, index=index),
            ],
            axis=1,
        )
        # Mixed str/int labels are normalized to str.
        frame.columns = frame.columns.astype(str)
        return frame

    X_train = _assemble(numerical_train, categorical_train, raw_train.index)
    X_test = _assemble(numerical_test, categorical_test, raw_test.index)

    # Rebalance the training split only; the test split keeps its natural
    # class distribution.
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42, sampling_strategy=0.3)),
        ('under', RandomUnderSampler(random_state=42))
    ])
    X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)
    return X_resampled, y_resampled, X_test, y_test

In [7]:
# Note the return order: (train features, train labels, test features, test labels).
# Only the training split has been resampled by SMOTE / under-sampling.
X_train, y_train, X_test, y_test = preprocessing_data(data)

In [None]:
class AntColonyFeatureSelection:
    """Ant-colony-inspired forward feature selection scored by a random forest.

    Each ant starts from a random feature and keeps adding one feature at a
    time until the F1 score of the selected subset reaches ``target_f1`` or no
    unused feature is left. The next feature is drawn with probability
    proportional to ``pheromones[last_feature, candidate] ** alpha``. After
    every step the pheromone on the edges leading into the newly added feature
    is multiplied by ``beta`` if the F1 score improved and divided by ``beta``
    otherwise; after every ant the whole matrix is multiplied by ``decay``.

    NOTE(review): subsets are scored on ``X_test``/``y_test``. If that is the
    same data later used to report final metrics, those metrics are
    optimistically biased -- consider passing a separate validation split.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; columns are addressed by position (``.iloc``).
    y_train, y_test : array-like
        Binary labels matching the feature tables.
    n_ants : int
        Number of solutions to construct.
    decay : float
        Factor applied to every pheromone value after each ant.
    alpha : float, default 1
        Exponent applied to pheromone values when sampling the next feature.
    beta : float, default 10
        Reward/penalty factor applied to pheromone after each step.
    target_f1 : float, default 0.6358123994555129
        An ant stops as soon as its subset reaches this F1 score.
    random_state : int or None, default None
        Seed for feature sampling and for the random forests.
    verbose : bool, default True
        Print intermediate scores and subsets while searching.
    """

    def __init__(self, X_train, X_test, y_train, y_test, n_ants, decay, alpha=1, beta=10,
                 target_f1=0.6358123994555129, random_state=None, verbose=True):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.n_ants = n_ants
        self.decay = decay
        self.alpha = alpha
        self.beta = beta
        self.target_f1 = target_f1
        self.random_state = random_state
        self.verbose = verbose
        self.n_features = X_train.shape[1]
        # Uniform initial pheromone on every (from_feature, to_feature) edge.
        self.pheromones = np.ones((self.n_features, self.n_features)) / self.n_features
        self._rng = np.random.default_rng(random_state)

    def run(self):
        """Let every ant build a subset; return ``(best_features, best_f1)``.

        ``best_features`` is a list of column positions that can be used
        directly with ``X.iloc[:, best_features]``. If no ant scores above 0,
        an empty list and 0 are returned.
        """
        best_features = []
        best_f1 = 0
        for _ in range(self.n_ants):
            selected_features, ant_f1 = self._construct_solutions()
            if ant_f1 > best_f1:
                best_f1 = ant_f1
                best_features = selected_features

            # Evaporation: older reinforcement fades after every ant.
            self.pheromones *= self.decay

        return best_features, best_f1

    def _construct_solutions(self):
        """Grow one feature subset; return ``(features, f1_of_that_subset)``."""
        features = []
        current_node = int(self._rng.integers(self.n_features))
        previous_f1 = 0
        while True:
            features.append(current_node)
            f1 = self._evaluate_features(features)
            if self.verbose:
                print(f1)
            self._update_pheromone(f1, previous_f1, current_node, features)
            if f1 >= self.target_f1:
                if self.verbose:
                    print(features)
                break
            if len(features) >= self.n_features:
                # Every feature is already selected; stop instead of sampling
                # from an empty candidate list.
                break
            current_node = self._select_next(features)
            previous_f1 = f1

        # Return the score that was actually computed for this subset rather
        # than re-fitting a second (differently randomized) forest.
        return features, f1

    def _update_pheromone(self, f1, f1_early, current_node, features):
        """Reward or penalize the edges leading into ``current_node``.

        ``f1_early`` is the score before ``current_node`` was added. Edges from
        every selected feature (``features`` already includes ``current_node``)
        are scaled by ``beta`` on improvement and by ``1 / beta`` otherwise.
        """
        if f1_early >= f1:
            if self.verbose:
                print(current_node)
            for t in features:
                self.pheromones[t][current_node] /= self.beta
        else:
            for t in features:
                self.pheromones[t][current_node] *= self.beta

    def _select_next(self, features):
        """Sample an unused feature, weighted by pheromone from the last one.

        Raises
        ------
        ValueError
            If every feature is already in ``features``.
        """
        visited = set(features)
        choices = [i for i in range(self.n_features) if i not in visited]
        if not choices:
            raise ValueError("no unused feature left to select")
        weights = self.pheromones[features[-1], choices] ** self.alpha
        total = weights.sum()
        if np.isfinite(total) and total > 0:
            probabilities = weights / total
        else:
            # Degenerate weights (all zero / overflow): fall back to uniform.
            probabilities = None
        return int(self._rng.choice(choices, p=probabilities))

    def _evaluate_features(self, features):
        """Fit a random forest on the given columns and return its F1 score.

        The forest is trained on ``X_train`` restricted to ``features`` and
        scored on the same columns of ``X_test``, so the score belongs to
        exactly the subset that ``run`` returns.
        """
        columns = list(features)
        rf = RandomForestClassifier(random_state=self.random_state)
        rf.fit(self.X_train.iloc[:, columns].values, self.y_train)
        y_pred = rf.predict(self.X_test.iloc[:, columns].values)
        return f1_score(self.y_test, y_pred)


# Run the ant-colony search with 20 ants; pheromone is halved after each ant.
# NOTE(review): the search scores candidate subsets on X_test/y_test, the same
# split used for the metrics reported further down.
aco_fs = AntColonyFeatureSelection(X_train, X_test, y_train, y_test, n_ants=20, decay=0.5)
best_features, best_f1_score = aco_fs.run()


print("Best Features:", best_features)
print("Best F1 Score:", best_f1_score)

In [13]:
def print_clf_metrics(y_actual, y_pred):
    """Print accuracy, precision, recall and F1 score for binary predictions.

    All four metrics are computed before anything is printed, so a failure in
    any metric produces no partial output.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    """
    metric_table = (
        ("Accuracy:", sklearn.metrics.accuracy_score),
        ("Precision:", sklearn.metrics.precision_score),
        ("Recall:", sklearn.metrics.recall_score),
        ("F1 Score:", sklearn.metrics.f1_score),
    )
    results = [(label, metric(y_actual, y_pred)) for label, metric in metric_table]
    for label, value in results:
        print(label, value)


# Baseline: a single decision tree trained on all (resampled) features.
# random_state makes the fitted tree, and therefore the printed metrics,
# reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print_clf_metrics(y_test, y_pred)

In [None]:
# ROC curve for the decision tree, using the predicted probability of the
# positive class (column 1 of predict_proba).
# NOTE: plot_roc_auc_curve is defined in a later cell; on a fresh kernel that
# cell has to be executed before this one.
y_roc = clf.predict_proba(X_test)[:, 1]
plot_roc_auc_curve(y_test, y_roc)

In [12]:
def plot_roc_auc_curve(y_actual, y_pred_prob):
    """Draw the ROC curve for binary scores, with the AUC in the legend.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth binary labels.
    y_pred_prob : array-like
        Scores or probabilities for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(y_actual, y_pred_prob)
    area = sklearn.metrics.auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label='ROC curve (area = %0.2f)' % area)
    # Diagonal = performance of a random classifier.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    plt.show()

In [82]:
# Default random forest restricted to the columns chosen by the ant-colony
# search. random_state makes the forest, and the printed metrics, reproducible.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train.iloc[:, best_features].values, y_train)
y_pred = rf_model.predict(X_test.iloc[:, best_features].values)
print_clf_metrics(y_test, y_pred)

In [None]:
# ROC curve for the default random forest on the selected features, using the
# predicted probability of the positive class (column 1).
y_roc = rf_model.predict_proba(X_test.iloc[:, best_features].values)[:, 1]
plot_roc_auc_curve(y_test, y_roc)

In [17]:
# Tuned random forest on the ACO-selected features.
# Layout of best_hyperparameters:
#   [max_depth, min_samples_split, min_samples_leaf, n_estimators]
best_hyperparameters = [30, 2, 1, 300]

final_model = RandomForestClassifier(
    n_estimators=max(1, round(best_hyperparameters[3])),
    max_depth=max(1, round(best_hyperparameters[0])),
    # An integer min_samples_split must be at least 2 in scikit-learn, so the
    # clamp uses 2 (a clamp at 1 would let an invalid value through).
    min_samples_split=max(2, round(best_hyperparameters[1])),
    min_samples_leaf=max(1, round(best_hyperparameters[2])),
    # Fixed seed so the fitted forest and its metrics are reproducible.
    random_state=42,
)
final_model.fit(X_train.iloc[:, best_features].values, y_train)
y_pred = final_model.predict(X_test.iloc[:, best_features].values)

In [18]:
# Positive-class probability (column 1) from the tuned forest on the selected features.
y_roc = final_model.predict_proba(X_test.iloc[:, best_features].values)[:, 1]

In [None]:
# Accuracy / precision / recall / F1 for the predictions currently held in
# y_pred (in top-to-bottom order: the tuned forest on the selected features).
print_clf_metrics(y_test, y_pred)

In [None]:
# ROC curve for the probabilities currently held in y_roc (in top-to-bottom
# order: the tuned forest on the selected features).
plot_roc_auc_curve(y_test, y_roc)

In [20]:
# Same tuned random forest, this time trained on ALL features, for comparison
# with the feature-selected model.
# Layout of best_hyperparameters:
#   [max_depth, min_samples_split, min_samples_leaf, n_estimators]
best_hyperparameters = [30, 2, 1, 300]

final_model = RandomForestClassifier(
    n_estimators=max(1, round(best_hyperparameters[3])),
    max_depth=max(1, round(best_hyperparameters[0])),
    # An integer min_samples_split must be at least 2 in scikit-learn, so the
    # clamp uses 2 (a clamp at 1 would let an invalid value through).
    min_samples_split=max(2, round(best_hyperparameters[1])),
    min_samples_leaf=max(1, round(best_hyperparameters[2])),
    # Fixed seed so the fitted forest and its metrics are reproducible.
    random_state=42,
)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

In [None]:
# Positive-class probability (column 1) from the tuned forest on all features.
y_roc = final_model.predict_proba(X_test)[:, 1]

In [None]:
# Accuracy / precision / recall / F1 for the predictions currently held in
# y_pred (in top-to-bottom order: the tuned forest on all features).
print_clf_metrics(y_test, y_pred)

In [None]:
# ROC curve for the probabilities currently held in y_roc (in top-to-bottom
# order: the tuned forest on all features).
plot_roc_auc_curve(y_test, y_roc)