In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
cc_info = pd.read_csv('cc_info.csv')
transactions = pd.read_csv('transactions.csv')

merged_df = pd.merge(cc_info,transactions)


columns = merged_df.columns.values
columns

array(['credit_card', 'city', 'state', 'zipcode', 'credit_card_limit',
       'date', 'transaction_dollar_amount', 'Long', 'Lat'], dtype=object)

In [3]:
merged_df.head()

Unnamed: 0,credit_card,city,state,zipcode,credit_card_limit,date,transaction_dollar_amount,Long,Lat
0,1280981422329509,Dallas,PA,18612,6000,2015-08-05 00:59:19,11.94,-75.964527,41.353578
1,1280981422329509,Dallas,PA,18612,6000,2015-10-29 18:23:04,5.76,-76.019703,41.311467
2,1280981422329509,Dallas,PA,18612,6000,2015-10-25 17:50:48,25.84,-76.043031,41.291053
3,1280981422329509,Dallas,PA,18612,6000,2015-09-05 17:39:43,68.89,-75.944299,41.327282
4,1280981422329509,Dallas,PA,18612,6000,2015-09-04 14:12:59,27.01,-75.997259,41.352099


In [4]:
# Data Cleaning

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False), ['city', 'state', 'zipcode']),
        ('num', StandardScaler(), ['credit_card_limit', 'transaction_dollar_amount', 'Long', 'Lat']),
    ]
)
processed_data = preprocessing.fit_transform(merged_df)
data_train,data_test = train_test_split(processed_data,test_size=0.3)

In [6]:
# Model Training

model = GaussianMixture()
model.fit(data_train)


In [7]:
# Model Evaluation
train_score = model.score(data_train)
test_score = model.score(data_test)

In [None]:
# Model Result
print("Train Score: ", train_score)
print("Test Score: ", test_score)

# The Log Likelihood of the model is close to each other, which indicates that the model is not overfitting. and generalizes well to unseen data

Train Score:  1097.2375232803179
Test Score:  1097.6947915047067


In [19]:
# Test anomaly data
anomaly_data = pd.DataFrame({
    'credit_card': [1280981422329509],
    'city': ['Dallas'],
    'state': ['PA'],
    'zipcode': [18612],
    'credit_card_limit': [6500],
    'date': ['2015-08-05 00:59:19'],
    'transaction_dollar_amount': [5100.94],
    'Long': [-100.964527],
    'Lat': [2100000.353578]
})
non_anomaly_data = pd.DataFrame({
    'credit_card': [1280981422329509],
    'city': ['Dallas'],
    'state': ['PA'],
    'zipcode': [18612],
    'credit_card_limit': [6500],
    'date': ['2015-08-05 00:59:19'],
    'transaction_dollar_amount': [100.94],
    'Long': [-80.964527],
    'Lat': [40.353578]
})

def identify_anomaly(data):
    processed_data = preprocessing.transform(data)
    anomaly_score = model.score_samples(processed_data)
    threshold = np.percentile(model.score_samples(data_train), 5)
    is_anomaly = anomaly_score < threshold
    return is_anomaly

print("Anomaly: ", identify_anomaly(anomaly_data))
print("Non-Anomaly: ", identify_anomaly(non_anomaly_data))

Anomaly:  [ True]
Non-Anomaly:  [False]
