In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import os
from imblearn.over_sampling import SMOTE
from collections import Counter
import re

#### Dataframes

In [32]:
# Maps the four labels 'Apollo', 'Amor', 'Aten' and 'IEO' to integer codes.
# NOTE(review): presumably these are NEO orbit classes and the codes mirror an
# encoding applied upstream; no other visible cell reads `guide` — confirm.
guide = dict(Apollo=1, Amor=0, Aten=2, IEO=3)

In [123]:
# Load the NEO/PHA dataset; CSV parsing is delegated to the pyarrow engine.
df1 = pd.read_csv(
    'datasets/ac_dataset_neo_pha.csv',
    engine='pyarrow',
)

In [124]:
# Features: every column except the last four.
# Target: the second-to-last column (presumably the 'neo' label that the next
# cell looks up by name — confirm against the CSV header).
x, y = df1.iloc[:, :-4], df1.iloc[:, -2]

In [125]:
# Rank every column by the absolute value of its correlation with the 'neo'
# target and keep the ten strongest predictors.
temp = pd.concat([x, y], axis=1)
correlation_with_neo = temp.corr()['neo']

# Drop the target from the ranking and select it by name below. Taking
# "the first of the 11 largest" as the target is fragile: nlargest() keeps the
# original order on ties, and the feature columns precede the appended target,
# so a feature with |correlation| == 1 would be mistaken for the target.
top_attributes = correlation_with_neo.drop('neo').abs().nlargest(10).index
temp = temp[['neo', *top_attributes]]

x = temp[top_attributes]
y = temp['neo']

In [128]:
# Write the selected features plus target, side by side, under
# Front-End/resources (the row index is not written).
train1 = pd.concat((x, y), axis='columns')
train1.to_csv(
    'Front-End/resources/train1.csv',
    index=False,
)

In [126]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# held-out split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Oversample the training split only. SMOTE generates synthetic samples at
# random, so it is seeded to make the resampled set (and every score derived
# from it) reproducible across runs.
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [66]:
neo_reg = LogisticRegression(max_iter=10000, random_state=0)

# 5-fold cross-validated accuracy on the scaled, SMOTE-resampled training data.
# NOTE(review): the folds are cut after oversampling, so synthetic neighbours of
# validation rows can sit in the training folds and the score is likely
# optimistic; x_test / y_test are never evaluated in the visible cells.
cv_scores = cross_val_score(neo_reg, x_train, y_train, cv=5)

# cross_val_score trains clones of the estimator and leaves `neo_reg` itself
# unfitted. Fit it explicitly so that the object saved with joblib.dump later
# in the notebook can actually predict.
neo_reg.fit(x_train, y_train)

print('Accuracy: ', cv_scores.mean())

Accuracy:  0.9996158091567102


In [67]:
# Features: every column except the last four.
# Target: the last column (presumably the 'pha' label that the next cell looks
# up by name — confirm against the CSV header).
x, y = df1.iloc[:, :-4], df1.iloc[:, -1]

In [68]:
# Rank every column by the absolute value of its correlation with the 'pha'
# target and keep the ten strongest predictors.
temp = pd.concat([x, y], axis=1)
correlation_with_pha = temp.corr()['pha']

# Drop the target from the ranking and select it by name below. Taking
# "the first of the 11 largest" as the target is fragile: nlargest() keeps the
# original order on ties, and the feature columns precede the appended target,
# so a feature with |correlation| == 1 would be mistaken for the target.
top_attributes = correlation_with_pha.drop('pha').abs().nlargest(10).index
temp = temp[['pha', *top_attributes]]

x = temp[top_attributes]
y = temp['pha']

In [116]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# held-out split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Oversample the training split only. SMOTE generates synthetic samples at
# random, so it is seeded to make the resampled set (and every score derived
# from it) reproducible across runs.
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [70]:
pha_reg = LogisticRegression(max_iter=10000, random_state=0)

# 5-fold cross-validated accuracy on the scaled, SMOTE-resampled training data.
# NOTE(review): the folds are cut after oversampling, so synthetic neighbours of
# validation rows can sit in the training folds and the score is likely
# optimistic; x_test / y_test are never evaluated in the visible cells.
cv_scores = cross_val_score(pha_reg, x_train, y_train, cv=5)

# cross_val_score trains clones of the estimator and leaves `pha_reg` itself
# unfitted. Fit it explicitly so that the object saved with joblib.dump later
# in the notebook can actually predict.
pha_reg.fit(x_train, y_train)

print('Accuracy: ', cv_scores.mean())

Accuracy:  0.9973118390721541


In [73]:
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('models', exist_ok=True)

# The estimators are serialised in whatever state they are in when this cell
# runs, so they must have been fitted beforehand.
# NOTE(review): the StandardScaler used for each model is not saved here, and
# the `scaler` variable is reassigned by every preprocessing cell; whoever
# loads these files needs the matching scaler to reproduce the inputs.
joblib.dump(neo_reg, 'models/neo_reg_0.joblib')
joblib.dump(pha_reg, 'models/pha_reg_0.joblib')

['models/pha_reg_0.joblib']

In [129]:
# Load the close-approach dataset; CSV parsing is delegated to the pyarrow engine.
df = pd.read_csv(
    'datasets/ab_cneos_closeapproach_data_pha.csv',
    engine='pyarrow',
)

In [130]:
# Features: every column except the last. Take an explicit copy because later
# cells assign into columns of `x`; assigning into a bare slice of `df` raises
# SettingWithCopyWarning and leaves it ambiguous whether `df` is affected.
x = df.iloc[:, :-1].copy()
# Target: the last column (reported as 'pha' by the info() output further down).
y = df.iloc[:, -1]

In [122]:
# Export the first 20 rows of features and target as a small sample file.
# NOTE(review): a later cell writes a 20-row sample to this same path, so
# whichever of the two cells runs last decides the file's contents.
test_1 = pd.concat([x.head(20), y.iloc[:20]], axis=1)
test_1.to_csv(
    'Front-End/resources/test2.csv',
    index=False,
)

In [131]:
import re

def extract_upper_limit(value):
    """Return the upper size limit from a metre-based diameter string.

    Strings such as "X m - Y m" or "X m" yield the last number they contain,
    as a float. Values that cannot be handled here are returned unchanged:

    * strings containing 'km' (left for the caller to deal with),
    * strings without any number,
    * non-string inputs such as NaN, None or already-numeric cells.

    Args:
        value: A single cell of the diameter column.

    Returns:
        float for a parsed metre value, otherwise ``value`` itself.
    """
    # Non-strings pass through untouched; the substring test would raise
    # TypeError on them.
    if not isinstance(value, str) or 'km' in value:
        return value

    # Every match must contain a digit, so stray punctuation (for example a
    # trailing '.') is never handed to float().
    limits = re.findall(r'\d+(?:\.\d+)?|\.\d+', value)
    return float(limits[-1]) if limits else value

# Run every cell of the 'Diameter' column through extract_upper_limit and
# store the result back in the same column.
x['Diameter'] = x['Diameter'].map(extract_upper_limit)


In [132]:
# Peek at the first rows of 'Diameter' after parsing; per the output below the
# column is still object dtype and 'km' entries remain as strings.
print(x['Diameter'].head())

0           550.0
1            82.0
2            77.0
3    0.33±0.05 km
4            90.0
Name: Diameter, dtype: object


In [133]:
# Keep only the rows whose 'Diameter' text does not contain 'km'; the rows that
# do are discarded rather than converted.
keep_rows = x['Diameter'].apply(lambda diameter: 'km' not in str(diameter))
x = x.loc[keep_rows]
print(x['Diameter'].head())

0    550.0
1     82.0
2     77.0
4     90.0
5     57.0
Name: Diameter, dtype: object


In [134]:
# Remove the 'Object' column from the feature frame.
x = x.drop(columns=['Object'])

In [135]:
# DataFrame.info() prints its summary itself and returns None, so the outer
# print() is what produces the trailing "None" line in the output.
print(x.info())

<class 'pandas.core.frame.DataFrame'>
Index: 33892 entries, 0 to 35909
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Close-Approach (CA) Date  33892 non-null  object 
 1   CA DistanceNominal (au)   33892 non-null  object 
 2   CA DistanceMinimum (au)   33892 non-null  float64
 3   V relative(km/s)          33892 non-null  float64
 4   V infinity(km/s)          33892 non-null  object 
 5   H(mag)                    33892 non-null  float64
 6   Diameter                  33892 non-null  object 
dtypes: float64(3), object(4)
memory usage: 2.1+ MB
None


In [136]:
# Re-attach the target column. concat aligns on the row index, and `y` still
# holds rows that were filtered out of `x`; those rows come back with NaN
# features and are removed again by dropna().
final = pd.concat([x, y], axis=1).dropna()

In [137]:
# DataFrame.info() prints its summary itself and returns None, so the outer
# print() is what produces the trailing "None" line in the output.
print(final.info())

<class 'pandas.core.frame.DataFrame'>
Index: 33892 entries, 0 to 35909
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Close-Approach (CA) Date  33892 non-null  object 
 1   CA DistanceNominal (au)   33892 non-null  object 
 2   CA DistanceMinimum (au)   33892 non-null  float64
 3   V relative(km/s)          33892 non-null  float64
 4   V infinity(km/s)          33892 non-null  object 
 5   H(mag)                    33892 non-null  float64
 6   Diameter                  33892 non-null  object 
 7   pha                       33892 non-null  int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 2.3+ MB
None


In [138]:
# Features: every column except the last. Take an explicit copy because the
# following cells assign into columns of `x`; assigning into a bare slice of
# `final` raises SettingWithCopyWarning.
x = final.iloc[:, :-1].copy()
# Target: the last column ('pha' per the info() output above).
y = final.iloc[:, -1]

In [139]:
# 'Diameter' is still stored as object dtype at this point; cast it to a real
# floating-point column.
x['Diameter'] = x['Diameter'].astype('float64')


In [140]:
def clean_and_convert(value):
    """Extract the first number found in ``value`` and return it as a float.

    ``value`` is passed through ``str()`` first, so strings and numeric cells
    are both accepted. Text around the number is ignored. A sign in front of
    the number is not captured, so the result is never negative.

    Args:
        value: A single cell, typically a string holding a number.

    Returns:
        The parsed float, or None when ``value`` contains no number.
    """
    # The mantissa must contain a digit, so dot-only text such as '...' is not
    # matched and cannot crash float(). The optional exponent keeps scientific
    # notation intact: str(1e-05) is '1e-05', which must not be cut to '1'.
    numeric_part = re.search(r'(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?', str(value))
    if numeric_part is None:
        return None
    return float(numeric_part.group())

# Convert each 'CA DistanceNominal (au)' cell to a float with clean_and_convert.
x['CA DistanceNominal (au)'] = x['CA DistanceNominal (au)'].map(clean_and_convert)

In [141]:
# Parse 'V infinity(km/s)' as numbers; cells that cannot be parsed become NaN
# and are then replaced by 0.
# NOTE(review): filling with 0 treats unparseable velocities as a real value —
# confirm this imputation is intended.
v_infinity = pd.to_numeric(x['V infinity(km/s)'], errors='coerce')
x['V infinity(km/s)'] = v_infinity.fillna(0).astype(float)


In [142]:
# Remove the close-approach date column from the feature frame.
x = x.drop(columns='Close-Approach (CA) Date')

In [143]:
# Both info() calls print their own summaries and return None, so the outer
# print() is what produces the trailing "None None" line in the output.
print(x.info(), y.info())

<class 'pandas.core.frame.DataFrame'>
Index: 33892 entries, 0 to 35909
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CA DistanceNominal (au)  33892 non-null  float64
 1   CA DistanceMinimum (au)  33892 non-null  float64
 2   V relative(km/s)         33892 non-null  float64
 3   V infinity(km/s)         33892 non-null  float64
 4   H(mag)                   33892 non-null  float64
 5   Diameter                 33892 non-null  float64
dtypes: float64(6)
memory usage: 1.8 MB
<class 'pandas.core.series.Series'>
Index: 33892 entries, 0 to 35909
Series name: pha
Non-Null Count  Dtype
--------------  -----
33892 non-null  int64
dtypes: int64(1)
memory usage: 529.6 KB
None None


In [145]:
# Export the first 20 cleaned rows (features plus target) as a sample file.
# NOTE(review): these rows are also part of the full frame written to
# train2.csv, so this "test" sample is not held out from the training data.
train1 = pd.concat([x.head(20), y.iloc[:20]], axis=1)
train1.to_csv(
    'Front-End/resources/test2.csv',
    index=False,
)

In [144]:
# Write the cleaned features plus target, side by side, under
# Front-End/resources (the row index is not written).
train1 = pd.concat((x, y), axis='columns')
train1.to_csv(
    'Front-End/resources/train2.csv',
    index=False,
)

In [109]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# held-out split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Oversample the training split only. SMOTE generates synthetic samples at
# random, so it is seeded to make the resampled set (and every score derived
# from it) reproducible across runs.
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [110]:
hazard_reg = LogisticRegression(max_iter=10000, random_state=0)

# 5-fold cross-validated accuracy on the scaled, SMOTE-resampled training data.
# NOTE(review): the folds are cut after oversampling, so synthetic neighbours of
# validation rows can sit in the training folds and the score is likely
# optimistic; x_test / y_test are never evaluated in the visible cells.
cv_scores = cross_val_score(hazard_reg, x_train, y_train, cv=5)

# cross_val_score trains clones of the estimator and leaves `hazard_reg` itself
# unfitted. Fit it explicitly so that the object saved with joblib.dump later
# in the notebook can actually predict.
hazard_reg.fit(x_train, y_train)

print('Accuracy: ', cv_scores.mean())

Accuracy:  0.9476854290326877


In [111]:
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('models', exist_ok=True)

# The estimator is serialised in whatever state it is in when this cell runs,
# so it must have been fitted beforehand.
# NOTE(review): the StandardScaler fitted alongside this model is not saved;
# whoever loads the file needs the matching scaler to reproduce the inputs.
joblib.dump(hazard_reg, 'models/pha_reg_1.joblib')

['models/pha_reg_1.joblib']