In [17]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [4]:
file_path = "ds2_cleaned.csv"
df = pd.read_csv(file_path, index_col=0)
df.head()

Unnamed: 0_level_0,pha,eccentricity,semi_major_axis(au),q(au),inclination(deg),long_ascending_node(deg),argument_perihelion(deg),mean_anomaly(deg),aphelion_distance(au)
spkid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000433,0,0.2227,1.458,1.133,10.83,304.3,178.9,246.9,1.78
2000719,0,0.547,2.638,1.195,11.58,183.86,156.23,278.2,4.08
2000887,0,0.5705,2.473,1.062,9.39,110.43,350.49,86.61,3.88
2001036,0,0.5331,2.666,1.245,26.68,215.52,132.43,140.65,4.09
2001221,0,0.4358,1.919,1.082,11.88,171.32,26.64,261.04,2.75


# OVERSAMPLING

In [6]:
# Will use standard scaler to try and reduce bias.  Will drop pha and make it y

# Remove pha status target from features data
y = df.pha
X = df.drop(columns=['pha'])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
Counter(y_train)

Counter({1: 1660, 0: 19210})

In [7]:

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({1: 19210, 0: 19210})

In [8]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

0.718988069570217

In [15]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4677,1727
Actual 1,228,325


Accuracy Score : 0.718988069570217
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.73      0.83      6404
           1       0.16      0.59      0.25       553

    accuracy                           0.72      6957
   macro avg       0.56      0.66      0.54      6957
weighted avg       0.89      0.72      0.78      6957



# Undersampling

In [20]:
# Will use standard scaler to try and reduce bias.  Will drop pha and make it y

# Remove pha status target from features data
y = df.pha
X = df.drop(columns=['pha'])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
Counter(y_train)

Counter({1: 1660, 0: 19210})

In [21]:
# Undersample the data using `RandomUnderSampler`
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 1660, 1: 1660})

In [22]:
# Fit a Logistic regression model using random undersampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [24]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4694,1710
Actual 1,233,320


Accuracy Score : 0.7207129509846198
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.73      0.83      6404
           1       0.16      0.58      0.25       553

    accuracy                           0.72      6957
   macro avg       0.56      0.66      0.54      6957
weighted avg       0.89      0.72      0.78      6957

