In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.io import arff
import data_processing as dp

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data3 = arff.loadarff("../data/3year.arff")
df3 = pd.DataFrame(data3[0])
data5 = arff.loadarff("../data/5year.arff")
df5 = pd.DataFrame(data5[0])

In [3]:
X_train, X_test, y_train, y_test = dp.get_train_test(df3)
# X_train, X_test, y_train, y_test = dp.pre_process(df3)

In [4]:
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

  y = column_or_1d(y, warn=True)


array([[2001,    0],
       [   0,  100]], dtype=int64)

In [5]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [12]:
def as_discrete(col):
    n = len(col)
    new_col = [0] * n
    for i in range(n):
        if col[i] == b"0":
            new_col[i] = 0
        else:
            new_col[i] = 1
    return pd.DataFrame(new_col)

def get_Xy(df):
    X = df.iloc[:, 0 : len(df) - 1]
    y = as_discrete(df.iloc[:, -1])
    return X, y

def med_impute(df, y):
    thd1 = df.shape[0] * 0.4
    cols = df.columns[df.isnull().sum() < thd1]
    df = df[cols]

    thd2 = df.shape[1] * 0.5
    y = y[df.isnull().sum(axis=1) <= thd2]
    df = df[df.isnull().sum(axis=1) <= thd2]

    for column in df.columns:
        df[column] = df[column].fillna(df[column].median())
    return df, y

def normalise(df):
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df

def drop_high_corr(df, threshold=0.7):

    correlation_matrix = df.corr()
    high_cor = []
    dropped_features = []

    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                if correlation_matrix.columns[j] != correlation_matrix.columns[i]:
                    high_cor.append([
                        correlation_matrix.columns[i],
                        correlation_matrix.columns[j],
                        correlation_matrix.iloc[i, j]
                    ])

    for pair in high_cor:
        feature1, feature2, correlation = pair

        if feature1 not in dropped_features and feature2 not in dropped_features:
            if feature2 in df.columns:
                df.drop(feature2, axis=1, inplace=True)
                dropped_features.append(feature2)
            else:
                print(f"Feature '{feature2}' not found in the DataFrame.")
    return df

def pre_process(df):
    X, y = get_Xy(df)
    X, y = med_impute(X, y)   
    X = normalise(X)
    X = drop_high_corr(X, threshold=0.7)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000)
    smote = SMOTE(random_state=0)
    X_smote, y_smote = smote.fit_resample(X_train, y_train)
   
    return X_smote, X_test, y_smote, y_test

In [13]:
X_train, X_test, y_train, y_test = pre_process(df3)
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

  y = column_or_1d(y, warn=True)


array([[3017,    0],
       [   0,  134]], dtype=int64)