In [None]:
# Import all required libraries
import pandas as pd
from scipy.io import arff
import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM

In [None]:
# Use the 'Ubuntu Mono' font family for all text in subsequent matplotlib figures
plt.rcParams.update({'font.family': 'Ubuntu Mono'})

In [None]:
# Read the bank ARFF file; the first element of the loadarff result holds the
# records, which are wrapped in a pandas DataFrame for the rest of the notebook
bank_arff_path = 'Datasets/bank-additional-ful-nominal.arff'
bank_data = arff.loadarff(bank_arff_path)
bank_data_df = pd.DataFrame(data=bank_data[0])
bank_data_df.head()

In [None]:
# Read the census ARFF file; the first element of the loadarff result holds the
# records, which are wrapped in a pandas DataFrame for the rest of the notebook
census_arff_path = 'Datasets/census-income-full-nominal.arff'
census_data = arff.loadarff(census_arff_path)
census_data_df = pd.DataFrame(data=census_data[0])
census_data_df.head()

In [None]:
# Show the Python type of the first field of the first record in each loaded
# dataset (bank first, then census) to see how the raw values are stored
for loaded in (bank_data, census_data):
    print(type(loaded[0][0][0]))

In [None]:
# Decode every bytes cell of the bank data to a UTF-8 string; cells of any
# other type are passed through unchanged
bank_data_df = bank_data_df.map(
    lambda cell: cell if not isinstance(cell, bytes) else cell.decode('utf-8')
)
bank_data_df.head()

In [None]:
# Decode every bytes cell of the census data to a UTF-8 string; cells of any
# other type are passed through unchanged
census_data_df = census_data_df.map(
    lambda cell: cell if not isinstance(cell, bytes) else cell.decode('utf-8')
)
census_data_df.head()

In [None]:
# Rename the bank target column 'y' to 'anamoly' (spelling kept because later
# cells reference this exact name) and encode it as no -> 0, yes -> 1.
#
# The work is guarded on the raw 'y' column still being present so that
# re-running this cell is a no-op. Without the guard, a second run would map
# the already-encoded 0/1 labels through the {'no', 'yes'} dictionary, and
# Series.map turns every value missing from the dictionary into NaN, silently
# wiping out the whole label column.
if 'y' in bank_data_df.columns:
    bank_data_df.rename(columns={'y':'anamoly'}, inplace=True)
    bank_data_df['anamoly'] = bank_data_df['anamoly'].map({'no':0,'yes':1})
bank_data_df.head()

In [None]:
# Rename the census target column 'class' to 'anamoly' (same spelling as the
# bank data) and encode it as '50000+.' -> 1, '--50000.' -> 0.
#
# The work is guarded on the raw 'class' column still being present so that
# re-running this cell is a no-op. Without the guard, a second run would map
# the already-encoded 0/1 labels through the string-keyed dictionary, and
# Series.map turns every value missing from the dictionary into NaN, silently
# wiping out the whole label column.
if 'class' in census_data_df.columns:
    census_data_df.rename(columns={'class':'anamoly'}, inplace=True)
    census_data_df['anamoly'] = census_data_df['anamoly'].map({'50000+.':1,'--50000.':0})
census_data_df.head()

In [None]:
# Target-encode every bank column except the last one (the 'anamoly' label),
# overwriting those columns in place with their encoded values.
# NOTE(review): the encoder is fitted against the 'anamoly' labels themselves,
# so the encoded features carry label information - confirm this is intended
# before treating the downstream detectors as unsupervised.
feature_cols = bank_data_df.columns[:-1]
target_encoder = ce.TargetEncoder(cols=feature_cols)
bank_data_df[feature_cols] = target_encoder.fit_transform(
    bank_data_df[feature_cols],
    bank_data_df['anamoly'],
)
bank_data_df.head()

In [None]:
# Report the bank data's (rows, columns) shape, then the number of missing
# values in each column
print(bank_data_df.shape, bank_data_df.isnull().sum(), sep='\n')

In [None]:
# Report the census data's (rows, columns) shape, then the number of missing
# values in each column
print(census_data_df.shape, census_data_df.isnull().sum(), sep='\n')

In [None]:
# Scale the bank feature columns with StandardScaler, leaving the 'anamoly'
# label column out of the scaling
excluded_column = bank_data_df['anamoly']
columns_to_scale = bank_data_df.drop(columns=['anamoly'])

scaler = StandardScaler()
scaled_data = scaler.fit_transform(columns_to_scale)

# Rebuild the frame from the scaled array with the original row index and
# feature names, then re-attach the untouched label as the last column
bank_data_df = pd.DataFrame(
    scaled_data,
    index=bank_data_df.index,
    columns=columns_to_scale.columns,
).assign(anamoly=excluded_column)

bank_data_df.head()

In [None]:
# Pairwise correlations between all bank columns (label included), drawn as an
# annotated heatmap
corr_matrix = bank_data_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

# List each pair of distinct columns whose absolute correlation exceeds 0.8.
# Keeping only positions above the diagonal (row < column) skips the
# self-correlations and reports every pair once.
row_positions, col_positions = np.where(np.abs(corr_matrix) > 0.8)
high_corr_pairs = [
    (corr_matrix.index[row], corr_matrix.columns[col])
    for row, col in zip(row_positions, col_positions)
    if row < col
]
print(high_corr_pairs)

In [None]:
# Split the bank data into features (all columns but the last) and the label
# (the last column)
X = bank_data_df.iloc[:, :-1]
y = bank_data_df.iloc[:, -1]

# Project the features onto two t-SNE components (seeded for repeatability)
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Collect the embedding and the label in one frame for plotting
tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2']).assign(Target=y)

# Scatter plot of the embedding, coloured by the label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=tsne_df, x="TSNE1", y="TSNE2", hue="Target", palette="coolwarm", ax=ax)
ax.set_title("t-SNE Visualization")
plt.show()

In [None]:
# Anomaly detectors to compare on the 2-D t-SNE embedding, keyed by the display
# name used in each plot title.
# NOTE(review): the detectors are fitted on t-SNE coordinates rather than on
# the scaled features - confirm this is intended, since the embedding is built
# for visualisation.
methods = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    # LOF is used in its default outlier-detection mode (novelty=False).
    # With novelty=True scikit-learn documents that predict() must only be
    # called on new, unseen data and not on the training set, so scoring the
    # fitting data that way gives results that differ from standard LOF.
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, contamination=0.1),
    "Elliptic Envelope": EllipticEnvelope(contamination=0.1, random_state=42),
    # NOTE(review): eps=0.3 is expressed in t-SNE coordinate units - verify it
    # suits the scale of the embedding, otherwise most points end up as noise.
    "DBSCAN": DBSCAN(eps=0.3, min_samples=10),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
}

for name, model in methods.items():
    # Every estimator above implements fit_predict: the outlier detectors
    # return -1 for outliers and DBSCAN labels its noise points -1, so one
    # call covers all of them.
    predictions = model.fit_predict(X_tsne)

    # 1 = flagged as anomaly / noise, 0 = everything else
    anomaly_labels = (predictions == -1).astype(int)

    # Overwritten on every iteration; after the loop the column holds the
    # flags of the last method in the dictionary.
    tsne_df['Anomaly_Predicted'] = anomaly_labels
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=tsne_df, x="TSNE1", y="TSNE2", hue="Anomaly_Predicted", palette="coolwarm", legend="full")
    plt.title(f"{name} Anomaly Detection on t-SNE Reduced Data")
    plt.show()