In [1]:
import os

from google.colab import drive

# Mount Google Drive, then make the project folder the working directory.
# NOTE(review): presumably the chdir is what lets the project-local
# `irp_functions` module be imported in the next cell — confirm.
DRIVE_MOUNT_POINT = '/content/drive/'
PROJECT_DIR = "/content/drive/MyDrive/ResearchProject"

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive/


In [2]:
from irp_functions import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from scipy.stats import ks_2samp
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, f1_score 
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from sklearn.neighbors import NearestNeighbors

In [3]:
# Locate and load the mammography dataset.
# The root uses the same "MyDrive" spelling as the os.chdir calls elsewhere in
# this notebook, and the file name is built from dataset_path so the
# 'Datasets/' folder is spelled out in exactly one place.
root_path = '/content/drive/MyDrive/ResearchProject/'
dataset_path = root_path + 'Datasets/'
mammography_filename = dataset_path + 'mammography.csv'

# "Unnamed: 0" is the name pandas gives a header-less column (presumably a saved
# row index — confirm); it is not a feature, so drop it right after reading.
df = pd.read_csv(mammography_filename).drop(columns="Unnamed: 0")

# Binarise the label: rows where y == 1 keep 1, every other value becomes 0.
df['y'] = np.where(df['y'] == 1, 1, 0)

In [4]:
# Class balance check: number of rows per label value.
label_counts = df['y'].value_counts()
label_counts

0    10923
1      260
Name: y, dtype: int64

In [5]:
# Separate the label vector from the feature matrix, both as plain NumPy arrays.
y = df['y'].to_numpy()                  # labels
X = df.drop(columns=['y']).to_numpy()   # every remaining column is a feature

In [6]:
# Baseline run: ten_experiments is called with only the features and labels
# (no over_sampler argument), and its raw results are converted to a DataFrame.
# NOTE(review): presumably this is the no-oversampling reference that the
# SMOTE variants are compared against — confirm in irp_functions.
benchmark = ten_experiments(X=X, y=y)
benchmark_data = get_DataFrame_from_results(benchmark)

In [7]:
# SMOTE sweep: one results DataFrame per oversampling rate N = 100, 200, ..., 1000.
# NOTE(review): N is presumably an oversampling percentage — confirm in irp_functions.
# A plain loop is kept so that rates already finished stay in the dict if the
# run is interrupted.
SMOTE = {}
for oversampling_rate in range(100, 1100, 100):
  SMOTE[oversampling_rate] = get_DataFrame_from_results(
      ten_experiments(X=X, y=y, over_sampler="SMOTE", N=oversampling_rate)
  )

In [8]:
# Outlier-SMOTE sweep with no explicit metric argument; one results DataFrame
# per oversampling rate N = 100, 200, ..., 1000.
# NOTE(review): the variable name implies the helper's default metric is
# Euclidean — confirm in irp_functions.
Outlier_SMOTE_Euclidean = {}
for oversampling_rate in range(100, 1100, 100):
  Outlier_SMOTE_Euclidean[oversampling_rate] = get_DataFrame_from_results(
      ten_experiments(X=X, y=y, over_sampler="Outlier_SMOTE",
                      N=oversampling_rate)
  )

In [9]:
# Outlier-SMOTE sweep with metric="manhattan"; one results DataFrame per
# oversampling rate N = 100, 200, ..., 1000.
Outlier_SMOTE_Manhattan = {}
for oversampling_rate in range(100, 1100, 100):
  Outlier_SMOTE_Manhattan[oversampling_rate] = get_DataFrame_from_results(
      ten_experiments(X=X, y=y, over_sampler="Outlier_SMOTE",
                      metric="manhattan", N=oversampling_rate)
  )

In [10]:
# Outlier-SMOTE sweep with metric="chebyshev"; one results DataFrame per
# oversampling rate N = 100, 200, ..., 1000.
Outlier_SMOTE_Chebyshev = {}
for oversampling_rate in range(100, 1100, 100):
  Outlier_SMOTE_Chebyshev[oversampling_rate] = get_DataFrame_from_results(
      ten_experiments(X=X, y=y, over_sampler="Outlier_SMOTE",
                      metric="chebyshev", N=oversampling_rate)
  )

In [11]:
# Switch into the results folder so the relative-path to_csv calls that follow
# write there. The folder is created first if it is missing: otherwise os.chdir
# would raise FileNotFoundError only after the long experiment cells have run.
results_dir = "/content/drive/MyDrive/ResearchProject/Results/Mammography"
os.makedirs(results_dir, exist_ok=True)
os.chdir(results_dir)

In [12]:
# Save the baseline results table; the relative file name resolves against the
# current working directory (set by the preceding os.chdir cell).
benchmark_data.to_csv('benchmark.csv')

In [13]:
# Write every per-rate results table to "<rate><suffix>" in the current working
# directory, e.g. "100_SMOTE.csv" or "100_Euclidean_Outlier_SMOTE.csv".
#
# Each dictionary is walked by its own keys instead of a re-typed copy of the
# rate range, so the files written always match the rates that were actually
# computed (no KeyError or silently unsaved results if a sweep's range changes).
results_by_suffix = {
    "_SMOTE.csv": SMOTE,
    "_Euclidean_Outlier_SMOTE.csv": Outlier_SMOTE_Euclidean,
    "_Manhattan_Outlier_SMOTE.csv": Outlier_SMOTE_Manhattan,
    "_Chebyshev_Outlier_SMOTE.csv": Outlier_SMOTE_Chebyshev,
}

for suffix, results_by_rate in results_by_suffix.items():
  for oversampling_rate, frame in results_by_rate.items():
    frame.to_csv(str(oversampling_rate) + suffix)