In [224]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

"""
functions starting with df_ can generate a processed dataframe directly
"""


def as_discrete(col):
    n = len(col)
    new_col = [0] * n
    for i in range(n):
        if col[i] == b"0":
            new_col[i] = 0
        else:
            new_col[i] = 1
    return pd.DataFrame(new_col)


def get_Xy(df):
    X = df.iloc[:, 0 : len(df) - 1]
    y = as_discrete(df.iloc[:, -1])
    return X, y


def med_impute(df, y):
    # remove columns with more than 40% values being null
    thd1 = df.shape[0] * 0.4
    cols = df.columns[df.isnull().sum() < thd1]
    df = df[cols]

    # remove rows with more than 50% values being null
    thd2 = df.shape[1] * 0.5
    y = y[df.isnull().sum(axis=1) <= thd2]
    df = df[df.isnull().sum(axis=1) <= thd2]

    # median imputation for null values
    df = df.fillna(df.median())

    return df, y


def normalise(df):
    scaler = MinMaxScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    return X_scaled


def df_null_removal(df):
    # Extract features (X) and target (y)
    X, y = get_Xy(df)

    # Remove null values and impute missing values
    X_imputed, y = med_impute(X, y)

    # Scale the imputed data
    X_scaled_df = normalise(X_imputed)

    return pd.concat([X_scaled_df, y], axis=1)


def drop_high_corr(df, threshold=0.7):
    correlation_matrix = df.corr()
    high_cor = []
    dropped_features = []

    # Iterate through the correlation matrix to find highly correlated pairs
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                if correlation_matrix.columns[j] != correlation_matrix.columns[i]:
                    high_cor.append(
                        [
                            correlation_matrix.columns[i],
                            correlation_matrix.columns[j],
                            correlation_matrix.iloc[i, j],
                        ]
                    )

    # Iterate through the list of highly correlated pairs
    for pair in high_cor:
        feature1, feature2, correlation = pair

        # Check if either of the features in the pair has already been dropped
        if feature1 not in dropped_features and feature2 not in dropped_features:
            # Check if the feature exists in the DataFrame before attempting to drop it
            if feature2 in df.columns:
                # Drop one of the correlated features from the dataset
                # Here, we arbitrarily choose to drop the second feature in the pair
                df.drop(feature2, axis=1, inplace=True)
                dropped_features.append(feature2)
            else:
                #print(f"Feature {feature2} not found in the DataFrame.")
                print("Feature '" + feature2 + "' not found in the DataFrame.") #temporary 

    return df


def df_null_corr_process(df):
    X, y = df_null_removal(df)
    return drop_high_corr(X), y


def pre_process(df):
    X, y = get_Xy(df)
    X_imputed, y_final = med_impute(X, y)
    X_scaled = normalise(X_imputed)
    X_final = drop_high_corr(X_scaled)

    return X_final, y_final


def get_train_test(df):

    X, y = get_Xy(df)
    X_imputed, y_final = med_impute(X, y)
    X_scaled = normalise(X_imputed)
    X_final = drop_high_corr(X_scaled)
    X_train, X_test, y_train, y_test = train_test_split(
        X_final, y_final, test_size=0.2, random_state=3244
    )

    return X_train, X_test, y_train, y_test


In [225]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
import data_processing
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings("ignore")

data = arff.loadarff("../data/3year.arff")
df = pd.DataFrame(data[0])
df_origin = df.copy()

In [226]:
df_x, df_y = pre_process(df)
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42)

print(X_train.shape)
z = y_test[0].sum()
print(z)

(7035, 31)
162


In [227]:
def get_df_with_top_k_features(k_features, X_train, X_test, y_train, y_test):
    # define feature selection
    fs = SelectKBest(score_func=f_classif, k=k_features)

    # apply feature selection
    X_selected = fs.fit_transform(X_train, y_train)

    # Take the features with the highest F-scores
    fs_scores_array = np.array(fs.scores_)

    # Get the indices that would sort the array in descending order
    sorted_indices_desc = np.argsort(fs_scores_array)[::-1]

    # Take the top k indices
    top_indices = sorted_indices_desc[:k_features]

    X_train_with_selected_x = X_train.iloc[:, top_indices]
    X_test_with_selected_x = X_test.iloc[:, top_indices]
    return X_train_with_selected_x, X_test_with_selected_x, y_train, y_test

k_features= 25
X_train, X_test, y_train, y_test  = get_df_with_top_k_features(k_features, X_train, X_test, y_train, y_test)
print(df.shape)
print(X_train.shape) #dropped 2 rows after preprocessing?
print(y_train.shape)
print(X_test.shape) #dropped 2 rows after preprocessing?
print(y_test.shape)

(10503, 65)
(7035, 25)
(7035, 1)
(3466, 25)
(3466, 1)


In [228]:
# Linear regression - test significance
import statsmodels.api as sm

def linear_regression_model(X_train, y_train):
    # Reset indices to ensure alignment
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    print(X_train.shape)

    # Add constant column to the features
    x_features = sm.add_constant(X_train)

    # Fit OLS model
    ols_model = sm.OLS(y_train, x_features)
    fit_results = ols_model.fit()
    print(fit_results)

    # Extract p-values
    p_values = fit_results.pvalues

    # Count the number of features that has p-value > 0.05
    count_greater_than_005 = np.sum(p_values > 0.05)

    # Evaluation
    MSE = fit_results.mse_total
    print(f"Number of attributes that are not significant: {count_greater_than_005} / {len(X_train.columns)}")
    print(f"MSE: {MSE}")

    return fit_results

linear_regression_trained_model = linear_regression_model(X_train, y_train)

(7035, 25)
<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x131496dd0>
Number of attributes that are not significant: 25 / 25
MSE: 0.04510058667222804


In [242]:
from sklearn.metrics import accuracy_score
def predict_test_data(linear_regression_trained_model, X_test, y_test):
    X_test_with_constant = sm.add_constant(X_test)

    predictions = linear_regression_trained_model.predict(X_test_with_constant)
    return predictions

outcome = predict_test_data(linear_regression_trained_model, X_test, y_test)
# Count the number of values less than 0.5
count_less_than_05 = np.sum(outcome < 0.5)

# Count the number of values greater than or equal to 0.5
count_greater_than_or_equal_05 = np.sum(outcome >= 0.5)
discrete_outcome = outcome.apply(lambda x: 0 if x < 0.05 else x)

print("Number of values < 0.5:", count_less_than_05)
print("Number of values >= 0.5:", count_greater_than_or_equal_05)

Number of values < 0.5: 3304
Number of values >= 0.5: 162


In [263]:
def accuracy(y_true, y_pred, tolerance=0):
    """
    Calculate the accuracy of predictions within a tolerance range.
    
    Parameters:
    - y_true: Array-like, true target values.
    - y_pred: Array-like, predicted target values.
    - tolerance: float, the tolerance range around the true values.
    
    Returns:
    - acc: float, the accuracy of predictions within the tolerance range.
    """
    # Calculate the absolute errors
    errors = abs(y_pred - y_true)
    
    # Count the number of predictions within the tolerance range
    within_tolerance = sum(errors <= tolerance)
    
    # Calculate the total number of predictions
    total_predictions = len(y_true)
    
    # Calculate accuracy
    acc = within_tolerance / total_predictions
    
    return acc

# Calculate accuracy using the function
acc = accuracy(y_test.values.flatten(), discrete_outcome)

print("Accuracy:", acc)


Accuracy: 0.953260242354299


In [253]:
def logistic_regression_model(X_train, y_train):
    # Reset indices to ensure alignment
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    # Add constant column to the features
    X_train_with_constant = sm.add_constant(X_train)

    # Fit logistic regression model
    logit_model = sm.Logit(y_train, X_train_with_constant)
    fit_results = logit_model.fit()

    # Print summary of the model
    print(fit_results.summary())

    return fit_results


logistic_trained_model = logistic_regression_model(X_train, y_train)

         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix