In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Locations of the two CSV files.
# Each variable points at the file its name describes: the training path at
# the "Train" file and the test path at the "Test" file.
train_csv_file_path = "fraudTrain.csv"
test_csv_file_path = "fraudTest.csv"

In [None]:
# Read both CSV files into DataFrames (training file first, then test file)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_csv_file_path, test_csv_file_path)
)

In [None]:
# Print the column labels of train_df
print(train_df.columns)

In [None]:
# Print the column labels of test_df
print(test_df.columns)

In [None]:
# Compare the two frames' column labels and record what each one lacks
train_columns, test_columns = set(train_df.columns), set(test_df.columns)
# Labels present in test_df but absent from train_df
missing_columns_in_train = test_columns.difference(train_columns)
# Labels present in train_df but absent from test_df
missing_columns_in_test = train_columns.difference(test_columns)

In [None]:
# Report, for each frame, the column labels it lacks relative to the other
for heading, absent_columns in (
    ("\nColumns missing in train_df:", missing_columns_in_train),
    ("Columns missing in test_df:", missing_columns_in_test),
):
    print(heading, absent_columns)

In [None]:
# Create every absent column, filled with zeros, in the frame that lacks it
# (train_df is handled first, then test_df)
for frame, absent_columns in (
    (train_df, missing_columns_in_train),
    (test_df, missing_columns_in_test),
):
    for column in absent_columns:
        frame[column] = 0

In [None]:
# Give train_df the same column order as test_df
train_df = train_df.loc[:, test_df.columns]

In [None]:
# Columns to label-encode.
# 'amt', 'lat', 'long' and 'city_pop' are intentionally NOT listed here: the
# notebook uses them as numeric model inputs, and label-encoding a numeric
# column replaces each value with its rank among the distinct values, which
# discards the real magnitudes.
categorical_columns = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'zip']

In [None]:
# Label-encode each categorical column, overwriting it in both frames
for column in categorical_columns:
    # Fit on the train and test values together so that both frames can be
    # transformed without meeting a label the encoder has not seen.
    combined_values = pd.concat([train_df[column], test_df[column]])
    le = LabelEncoder().fit(combined_values)
    for frame in (train_df, test_df):
        frame[column] = le.transform(frame[column])

In [None]:
# Names of the numeric columns selected from train_df and test_df
numeric_columns = [
    'amt',
    'lat',
    'long',
    'city_pop',
    'unix_time',
    'is_fraud',
]

In [None]:
# Build the feature matrices (X) and target vectors (y) for both frames.
# The label column is filtered out of the feature list: if it were left in X,
# every classifier would receive the answer as an input feature (target
# leakage) and the evaluation metrics would be meaningless.
target_column = 'is_fraud'
feature_columns = [
    column_name for column_name in numeric_columns if column_name != target_column
]

X_train = train_df[feature_columns]
y_train = train_df[target_column]
X_test = test_df[feature_columns]
y_test = test_df[target_column]

In [None]:
# Initialize classifiers, keyed by the display name used when reporting.
# A fixed seed is passed to the learners that accept one so that any
# randomized step is repeatable across kernel restarts; the value itself is
# arbitrary.
RANDOM_STATE = 42
classifiers = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE)
}

In [None]:
# Train every classifier, score it on the test split, and report its metrics.
# accuracy/precision/recall/f1 are overwritten on each pass, so the scores are
# printed and stored inside the loop; otherwise only the last classifier's
# numbers would survive.
results = {}
for name, classifier in classifiers.items():
    print(f"Training {name} classifier...")
    classifier.fit(X_train, y_train)

    # Make predictions on the test data
    predictions = classifier.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Keep the scores per classifier so they can be compared after the loop
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

    # Report this classifier's scores before the next iteration replaces them
    print(f"{name} Classifier:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [None]:
# Print the metrics currently held in name / accuracy / precision / recall / f1
report_lines = [
    f"{name} Classifier:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
]
print("\n".join(report_lines))
