# Initial Model Evaluation
For this test phase, we will import one somewhat large dataset (in this case, 8_dataset.csv) among the ones in the datasets folder and prepare it, followed by running a few models on it to see how they perform. The models we will be using are:
- Logistic Regression
- Random Forest
- Support Vector Machine
- K-Nearest Neighbors
- Naive Bayes
- Decision Tree

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import os
from imblearn.over_sampling import SMOTE
from collections import Counter
import re

#### Preprocessing

In [5]:
os.chdir('C:/Users/hifia/Projects/ML Semester project/datasets')
df = pd.read_csv('8_dataset.csv', low_memory=False)

In [6]:
# Delete all entries where neo = nan
df = df[df.neo.notna()]

In [7]:
x = df.iloc[:, 8:]
neo = df.iloc[:, 6]
pha = df.iloc[:, 7]

print(x.shape)
print(neo.shape)
print(pha.shape)

print(x.head())

(958520, 37)
(958520,)
(958520,)
      H  diameter  albedo  diameter_sigma orbit_id      epoch  epoch_mjd  \
0  3.40   939.400  0.0900           0.200   JPL 47  2458600.5      58600   
1  4.20   545.000  0.1010          18.000   JPL 37  2459000.5      59000   
2  5.33   246.596  0.2140          10.594  JPL 112  2459000.5      59000   
3  3.00   525.400  0.4228           0.200   JPL 35  2458600.5      58600   
4  6.90   106.699  0.2740           3.140  JPL 114  2459000.5      59000   

    epoch_cal equinox         e  ...       sigma_i      sigma_om  \
0  20190427.0   J2000  0.076009  ...  4.608900e-09  6.168800e-08   
1  20200531.0   J2000  0.229972  ...  3.469400e-06  6.272400e-06   
2  20200531.0   J2000  0.256936  ...  3.223100e-06  1.664600e-05   
3  20190427.0   J2000  0.088721  ...  2.170600e-07  3.880800e-07   
4  20200531.0   J2000  0.190913  ...  2.740800e-06  2.894900e-05   

        sigma_w      sigma_ma      sigma_ad       sigma_n      sigma_tp  \
0  6.624800e-08  7.820700e

In [87]:
numeric_columns_indexes = df.select_dtypes(include=np.number).columns
numeric_columns_numeric_indexes = [df.columns.get_loc(col) for col in numeric_columns_indexes]
print(df.iloc[:, numeric_columns_numeric_indexes].corr())

                   spkid         H  diameter    albedo  diameter_sigma  \
spkid           1.000000  0.146075 -0.095362 -0.179656        0.023419   
H               0.146075  1.000000 -0.572648 -0.221658       -0.070651   
diameter       -0.095362 -0.572648  1.000000 -0.108880        0.337145   
albedo         -0.179656 -0.221658 -0.108880  1.000000       -0.080525   
diameter_sigma  0.023419 -0.070651  0.337145 -0.080525        1.000000   
epoch           0.006816 -0.175712  0.058475  0.094071       -0.005169   
epoch_mjd       0.006816 -0.175712  0.058475  0.094071       -0.005169   
epoch_cal       0.006999 -0.176349  0.058539  0.094114       -0.005120   
e               0.010951  0.345334 -0.050649 -0.020403       -0.016542   
a               0.000116 -0.032187  0.146799 -0.114484        0.206718   
q              -0.002115 -0.437429  0.329223 -0.267607        0.381335   
i               0.013443 -0.099022  0.054963 -0.086802        0.039580   
om              0.003874  0.000883  0.

In [8]:
# numeric_columns = x.select_dtypes(include=np.number).columns
# imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# x[:, numeric_columns] = imputer.fit_transform(x[:, numeric_columns])
# print(Counter(x))

numeric_columns_indexes = x.select_dtypes(include=np.number).columns
numeric_columns_numeric_indexes = [x.columns.get_loc(col) for col in numeric_columns_indexes]
imputer = SimpleImputer(strategy='median')
x.iloc[:, numeric_columns_numeric_indexes] = imputer.fit_transform(x.iloc[:, numeric_columns_numeric_indexes])

In [9]:
categorical_columns = x.select_dtypes(include='object').columns
categorical_columns_numeric_indexes = [x.columns.get_loc(col) for col in categorical_columns]
print(categorical_columns, categorical_columns_numeric_indexes)

Index(['orbit_id', 'equinox', 'class'], dtype='object') [4, 8, 35]


In [10]:
def process(string):
    if type(string) == str:
        return int(''.join(re.findall(r'\d+', string)))
    return string

x['orbit_id'] = x['orbit_id'].apply(process)
print(x['orbit_id'].head())

0     47
1     37
2    112
3     35
4    114
Name: orbit_id, dtype: int64


In [11]:
x['equinox'] = x['equinox'].apply(process)
print(x['equinox'].head())

0    2000
1    2000
2    2000
3    2000
4    2000
Name: equinox, dtype: int64


In [12]:
le = LabelEncoder()
x['class'] = le.fit_transform(x['class'])
print(x['class'].head())

0    7
1    7
2    7
3    7
4    7
Name: class, dtype: int32


#### Train test splits and smote

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, neo, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Smote
print("Class distribution in training set before SMOTE:", Counter(y_train), Counter(y_test))


(766816, 37) (191704, 37) (766816,) (191704,)
Class distribution in training set before SMOTE: Counter({'N': 748512, 'Y': 18304}) Counter({'N': 187113, 'Y': 4591})


In [14]:
smote = SMOTE(random_state=0)
x_traina, y_traina = smote.fit_resample(x_train, y_train)
print("Class distribution in training set after SMOTE:", Counter(y_traina))

Class distribution in training set after SMOTE: Counter({'N': 748512, 'Y': 748512})


In [15]:
print(x_traina.head(), y_traina.head())
print(x_traina.shape, y_traina.shape)

        H  diameter  albedo  diameter_sigma  orbit_id      epoch  epoch_mjd  \
0  17.700     3.972   0.079           0.332        12  2459000.5      59000   
1  16.200     3.972   0.079           0.332        18  2459000.5      59000   
2  18.235     3.972   0.079           0.332         3  2459000.5      59000   
3  17.935     3.972   0.079           0.332         1  2459000.5      59000   
4  16.300     3.972   0.079           0.332        11  2459000.5      59000   

    epoch_cal  equinox         e  ...   sigma_i  sigma_om   sigma_w  sigma_ma  \
0  20200531.0     2000  0.171402  ...  0.000005  0.000074  0.000079  0.000026   
1  20200531.0     2000  0.109521  ...  0.000004  0.000035  0.000042  0.000022   
2  20200531.0     2000  0.270904  ...  0.005219  0.000793  0.011923  0.362750   
3  20200531.0     2000  0.162776  ...  0.000034  0.000317  0.008185  0.004864   
4  20200531.0     2000  0.153128  ...  0.000008  0.000034  0.000041  0.000030   

       sigma_ad       sigma_n  sigma_t

In [17]:
os.chdir('C:/Users/hifia/Projects/ML Semester project/Model testing')
# Export the DataFrame to a CSV file
x_traina.to_csv('x_traina.csv', index=False)  # Set index=False to exclude the DataFrame index
y_traina.to_csv('y_traina.csv', index=False)  # Set index=False to exclude the DataFrame index
# x_test.to_csv('x_test.csv', index=False)  # Set index=False to exclude the DataFrame index
# y_test.to_csv('y_test.csv', index=False)  # Set index=False to exclude the DataFrame index

print(f'DataFrames have been saved to {os.getcwd()}')

DataFrames have been saved to C:\Users\hifia\Projects\ML Semester project\Model testing


### TRAINING BEGINS

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import os
from imblearn.over_sampling import SMOTE
from collections import Counter
import re

We will use a bunch of different models to assess which model performs best, and then accordingly proceed with the remaining datasets.

In [3]:
x_train = pd.read_csv('x_traina.csv')
y_train = pd.read_csv('y_traina.csv')
x_test = pd.read_csv('x_test.csv')
y_test = pd.read_csv('y_test.csv')


#### Logistic Regression

In [4]:
sc = StandardScaler()
logistic_x_train = sc.fit_transform(x_train)
logistic_x_test = sc.transform(x_test)

regressor = LogisticRegression()
regressor.fit(logistic_x_train, y_train)

y_pred = regressor.predict(logistic_x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)


[[187112      1]
 [     1   4590]]
0.9999895672495097


#### Random Forest

In [5]:
sc = StandardScaler()
forest_x_train = sc.fit_transform(x_train)
forest_x_test = sc.transform(x_test)

classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(forest_x_train, y_train)

y_pred = classifier.predict(forest_x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))

  return fit_method(estimator, *args, **kwargs)


[[187113      0]
 [     0   4591]]
1.0


#### Support Vector Machine

In [6]:
# sc = StandardScaler()
# svm_x_train = sc.fit_transform(x_train)
# svm_x_test = sc.transform(x_test)
# print("Finished standard scaling")

classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(logistic_x_train, y_train)
print("Finished fitting")

y_pred = classifier.predict(logistic_x_test)
print("Finished predicting")

cm = confusion_matrix(y_test, y_pred)
print("Finished confusion matrix")

print(cm)
print(accuracy_score(y_test, y_pred))
print("Finished accuracy score")

  y = column_or_1d(y, warn=True)
