In [109]:
import pandas as pd
from dsgd.DSClassifierMultiQ import DSClassifierMultiQ


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Clasificadores
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

accuracys = {}

In [110]:
data = pd.read_csv('data/nasa.csv')
data.drop(["Neo Reference ID", "Name", "Close Approach Date", "Orbit Determination Date" ,"Orbiting Body", "Equinox"] , axis=1, inplace=True)
print("tamaño del dataset: ", data.shape)
print(data.dtypes)

tamaño del dataset:  (4687, 34)
Absolute Magnitude              float64
Est Dia in KM(min)              float64
Est Dia in KM(max)              float64
Est Dia in M(min)               float64
Est Dia in M(max)               float64
Est Dia in Miles(min)           float64
Est Dia in Miles(max)           float64
Est Dia in Feet(min)            float64
Est Dia in Feet(max)            float64
Epoch Date Close Approach         int64
Relative Velocity km per sec    float64
Relative Velocity km per hr     float64
Miles per hour                  float64
Miss Dist.(Astronomical)        float64
Miss Dist.(lunar)               float64
Miss Dist.(kilometers)          float64
Miss Dist.(miles)               float64
Orbit ID                          int64
Orbit Uncertainity                int64
Minimum Orbit Intersection      float64
Jupiter Tisserand Invariant     float64
Epoch Osculation                float64
Eccentricity                    float64
Semi Major Axis                 float64
Inclinat

In [111]:
print(data.head(1))

   Absolute Magnitude  Est Dia in KM(min)  Est Dia in KM(max)  \
0                21.6             0.12722            0.284472   

   Est Dia in M(min)  Est Dia in M(max)  Est Dia in Miles(min)  \
0         127.219879         284.472297               0.079051   

   Est Dia in Miles(max)  Est Dia in Feet(min)  Est Dia in Feet(max)  \
0               0.176763            417.388066            933.308089   

   Epoch Date Close Approach  ...  Inclination  Asc Node Longitude  \
0               788947200000  ...     6.025981          314.373913   

   Orbital Period  Perihelion Distance  Perihelion Arg  Aphelion Dist  \
0      609.599786             0.808259        57.25747       2.005764   

   Perihelion Time  Mean Anomaly  Mean Motion  Hazardous  
0     2.458162e+06    264.837533     0.590551       True  

[1 rows x 34 columns]


In [112]:
data["Hazardous"] = data["Hazardous"].apply(lambda x: 1 if x == True else 0)
# Convert everithing to numeric
data = data.apply(pd.to_numeric)
cut = int(0.3 * len(data))

# Separate the trainig and tesing sets
X_train = data.iloc[:cut, :-1].values
y_train = data.iloc[:cut, -1].values
X_test = data.iloc[cut:, :-1].values
y_test = data.iloc[cut:, -1].values

In [113]:
DSC = DSClassifierMultiQ(2, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7)

In [114]:
losses, epoch, dt = DSC.fit(X_train, y_train, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=False,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	373	0.0546	
Training time: 168.12s, epochs: 400

Least training loss reached: 0.054
DSModelMultiQ(
  DS Classifier using 132 rules
  
  Rule 1: Absolute Magnitude < 19.782
  	C1: 0.000	C2: 0.589	Unc: 0.411
  
  Rule 2: 19.782 < Absolute Magnitude < 21.649
  	C1: 0.000	C2: 0.763	Unc: 0.237
  
  Rule 3: 21.649 < Absolute Magnitude < 23.515
  	C1: 0.620	C2: 0.000	Unc: 0.380
  
  Rule 4: Absolute Magnitude > 23.515
  	C1: 0.928	C2: 0.000	Unc: 0.072
  
  Rule 5: Est Dia in KM(min) < 0.025
  	C1: 0.302	C2: 0.000	Unc: 0.698
  
  Rule 6: 0.025 < Est Dia in KM(min) < 0.243
  	C1: 0.078	C2: 0.114	Unc: 0.808
  
  Rule 7: 0.243 < Est Dia in KM(min) < 0.462
  	C1: 0.183	C2: 0.004	Unc: 0.813
  
  Rule 8: Est Dia in KM(min) > 0.462
  	C1: 0.109	C2: 0.072	Unc: 0.819
  
  Rule 9: Est Dia in KM(max) < 0.056
  	C1: 0.276	C2: 0.000	Unc: 0.724
  
  Rule 10: 0.056 < Est Dia in KM(max) < 0.544
  	C1: 0.130	C2: 0.058	Unc: 0.811
  
  Rule 11: 0.544 < Est Dia in KM(max) < 1

In [115]:
# Using the predict method, you can predict the classes of new records
# In this case we will predict the testing set
y_pred = DSC.predict(X_test)

In [116]:
print(classification_report(y_test, y_pred))
accuracys["DSClassifierMultiQ"] = accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2805
           1       0.80      0.67      0.73       476

    accuracy                           0.93      3281
   macro avg       0.87      0.82      0.84      3281
weighted avg       0.92      0.93      0.93      3281



In [117]:
print(confusion_matrix(y_test, y_pred,))

[[2723   82]
 [ 155  321]]


In [118]:
# Global interpretability
DSC.print_most_important_rules(classes=["no hazzardous", "hazzardous"])



Most important rules for class no hazzardous

	[0.975] R123: Perihelion Time > 2458476.115
			no : 0.975	haz: 0.000	Unc: 0.025

	[0.963] R79: Minimum Orbit Intersection > 0.149
			no : 0.963	haz: 0.000	Unc: 0.037

	[0.960] R78: 0.086 < Minimum Orbit Intersection < 0.149
			no : 0.960	haz: 0.000	Unc: 0.040

	[0.928] R3: Absolute Magnitude > 23.515
			no : 0.928	haz: 0.000	Unc: 0.072

	[0.620] R2: 21.649 < Absolute Magnitude < 23.515
			no : 0.620	haz: 0.000	Unc: 0.380

	[0.532] R111: Perihelion Distance > 0.944
			no : 0.532	haz: 0.000	Unc: 0.468

	[0.481] R85: 2457130.330 < Epoch Osculation < 2457793.512
			no : 0.480	haz: 0.001	Unc: 0.519

	[0.480] R75: Orbit Uncertainity > 4.462
			no : 0.480	haz: 0.000	Unc: 0.520

	[0.413] R68: Orbit ID < 6.698
			no : 0.413	haz: 0.000	Unc: 0.587

	[0.396] R116: Aphelion Dist < 1.267
			no : 0.396	haz: 0.000	Unc: 0.604

	[0.379] R96: Inclination < 7.048
			no : 0.379	haz: 0.000	Unc: 0.621

	[0.363] R69: 6.698 < Orbit ID < 31.918
			no : 0.363	haz:

In [119]:
X = data.drop('Hazardous', axis=1)  # Asumiendo que 'target' es la columna objetivo
y = data['Hazardous']

X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)


In [122]:
# Diccionario de modelos
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(max_depth=5),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Función para entrenar y evaluar modelos
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracys[name] = accuracy
        print(f'{name} Accuracy: {accuracy:.4f}')
        print(f'Classification Report for {name}:\n{classification_report(y_test, y_pred)}')
        print(f'Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}\n')

# Llamada a la función
train_and_evaluate(models, X1_train, y1_train, X1_test, y1_test)


Logistic Regression Accuracy: 0.8387
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       983
           1       0.00      0.00      0.00       189

    accuracy                           0.84      1172
   macro avg       0.42      0.50      0.46      1172
weighted avg       0.70      0.84      0.77      1172

Confusion Matrix for Logistic Regression:
[[983   0]
 [189   0]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Accuracy: 0.9974
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       983
           1       0.99      0.99      0.99       189

    accuracy                           1.00      1172
   macro avg       1.00      0.99      1.00      1172
weighted avg       1.00      1.00      1.00      1172

Confusion Matrix for Random Forest:
[[982   1]
 [  2 187]]

Support Vector Machine Accuracy: 0.8387
Classification Report for Support Vector Machine:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       983
           1       0.00      0.00      0.00       189

    accuracy                           0.84      1172
   macro avg       0.42      0.50      0.46      1172
weighted avg       0.70      0.84      0.77      1172

Confusion Matrix for Support Vector Machine:
[[983   0]
 [189   0]]

K-Nearest Neighbors Accuracy: 0.7944
Classification Rep

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [123]:
for key in accuracys:
    print(key, accuracys[key])

DSClassifierMultiQ 0.9277659250228589
Logistic Regression 0.8387372013651877
Random Forest 0.9974402730375427
Support Vector Machine 0.8387372013651877
K-Nearest Neighbors 0.7943686006825939
