In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
import data_processing as dp
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

warnings.filterwarnings("ignore")

data = arff.loadarff("../data/3year.arff")
df = pd.DataFrame(data[0])
df_origin = df.copy()

In [13]:
k_features = 25
train_test_dataset = dp.pre_process(df)  # with SMOTE
X_train, X_test, y_train, y_test = dp.get_df_with_top_k_features(
    k_features, *train_test_dataset
)
print(f"X_train.shape: {X_train.shape}")
print(f"y_train.shape: {y_train.shape}")

X_train.shape: (13978, 25)
y_train.shape: (13978, 1)


In [14]:
# Linear regression - test significance
import statsmodels.api as sm


def linear_regression_model(X_train, y_train):
    # Reset indices to ensure alignment
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    # Add constant column to the features
    x_features = sm.add_constant(X_train)

    # Fit OLS model
    ols_model = sm.OLS(y_train, x_features)
    fit_results = ols_model.fit()
    print(fit_results)

    # Extract p-values
    p_values = fit_results.pvalues

    # Count the number of features that has p-value > 0.05
    count_greater_than_005 = np.sum(p_values > 0.05)

    # Evaluation
    MSE = fit_results.mse_total
    print(
        f"Number of attributes that are not significant: {count_greater_than_005} / {len(X_train.columns)}"
    )
    print(f"MSE: {MSE}")

    return


linear_regression_trained_model = linear_regression_model(X_train, y_train)

<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x1349e2fd0>
Number of attributes that are not significant: 14 / 25
MSE: 0.2500178865278672


In [15]:
# Not a good way to categorise binary output based on continous output
# thus omit linear regression


def predict_test_data(linear_regression_trained_model, X_test):
    X_test_with_constant = sm.add_constant(X_test)

    predictions = linear_regression_trained_model.predict(X_test_with_constant)
    return predictions


# outcome = predict_test_data(linear_regression_trained_model, X_test)
# print(f"Predicted outcome:\n{outcome[:5]}")

# Count the number of values less than 0.5
# count_less_than_05 = np.sum(outcome < 0.5)

# Count the number of values greater than or equal to 0.5
# count_greater_than_or_equal_05 = np.sum(outcome >= 0.5)
# discrete_outcome = outcome.apply(lambda x: 0 if x < 0.05 else x)

# print("\nNumber of values < 0.5:", count_less_than_05)
# print("Number of values >= 0.5:", count_greater_than_or_equal_05)
# print(f"Total: {count_less_than_05 + count_greater_than_or_equal_05}")

In [16]:
def linear_reg_accuracy(y_true, y_pred, tolerance=0):
    """
    Calculate the accuracy of predictions within a tolerance range.

    Parameters:
    - y_true: Array-like, true target values.
    - y_pred: Array-like, predicted target values.
    - tolerance: float, the tolerance range around the true values.

    Returns:
    - acc: float, the accuracy of predictions within the tolerance range.
    """
    # Calculate the absolute errors
    errors = abs(y_pred - y_true)

    # Count the number of predictions within the tolerance range
    within_tolerance = sum(errors <= tolerance)

    # Calculate the total number of predictions
    total_predictions = len(y_true)

    # Calculate accuracy
    acc = within_tolerance / total_predictions

    return acc


# Calculate accuracy using the function
# acc = linear_reg_accuracy(y_test.values.flatten(), discrete_outcome)

# print("Accuracy:", acc)

Try logistic regression

In [17]:
# test logistic regression from stats package
# iteration error
def logistic_regression_model(X_train, y_train):
    # Reset indices to ensure alignment
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    # Add constant column to the features
    X_train_with_constant = sm.add_constant(X_train)

    # Fit logistic regression model
    logit_model = sm.Logit(y_train, X_train_with_constant)
    fit_results = logit_model.fit(max_iter=100, method="newton")

    # Print summary of the model
    print(fit_results.summary())

    return fit_results


logistic_trained_model = logistic_regression_model(X_train, y_train)

Optimization terminated successfully.
         Current function value: 0.629073
         Iterations 20
                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                13978
Model:                          Logit   Df Residuals:                    13952
Method:                           MLE   Df Model:                           25
Date:                Sat, 06 Apr 2024   Pseudo R-squ.:                 0.09244
Time:                        23:35:54   Log-Likelihood:                -8793.2
converged:                       True   LL-Null:                       -9688.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         38.6336     28.771      1.343      0.179     -17.756      95.023
Attr29        -0.7610      0