In [1]:
# Import the modules
import psycopg2
import numpy as np
import pandas as pd
from pathlib import Path
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Define the connection parameters
host = 'localhost'  # or the IP address of your database server
dbname = 'StarsGalaxiesQuasars'  # Replace with your database name
user = 'postgres'  # Replace with your database username
password = '121792'  # Replace with your database password
port = '5432'  # Default PostgreSQL port

In [3]:
# Create the connection string
connection = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

# Create a connection engine using SQLAlchemy
engine = create_engine(connection)

In [4]:
# Write a SQL query to join the observation and classification tables
join_query = """
SELECT o.*, c.class
FROM observation o
JOIN classification c ON o.classid = c.classid;
"""

In [5]:
# Load the data into a Pandas DataFrame
database_df = pd.read_sql(join_query, engine)

# Check DataFrame
database_df.head()

Unnamed: 0,observationid,classid,objid,ra,dec,u,g,r,i,z,redshift,run,rerun,camcol,field,specobjid,plate,mjd,fiberid,class
0,1,1,1237650000000000000,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752.0,301,4,267,3.72236e+18,-8.96e-06,3306,54922,491,STAR
1,2,1,1237650000000000000,183.59837,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752.0,301,4,267,3.63814e+17,-5.49e-05,323,51615,541,STAR
2,3,2,1237650000000000000,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752.0,301,4,268,3.23274e+17,0.1231112,287,52023,513,GALAXY
3,4,1,1237650000000000000,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752.0,301,4,269,3.72237e+18,-0.000110616,3306,54922,510,STAR
4,5,1,1237650000000000000,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752.0,301,4,269,3.72237e+18,0.000590357,3306,54922,512,STAR


In [7]:
# Separate the data into labels and features
y = database_df["class"]
X = database_df.drop(columns=["class", "observationid", "objid", "specobjid", "plate", "fiberid"])

In [8]:
# Review the y variable Series
y.head()
y.value_counts()

class
GALAXY    4998
STAR      4152
QSO        850
Name: count, dtype: int64

In [9]:
# Review the X variable DataFrame
X.head()
X.value_counts()

classid  ra          dec         u         g         r         i         z         redshift  run  rerun  camcol  field         mjd  
1        8.235100     14.704008  17.26167  16.06334  15.51514  15.29140  15.16316  1035.0    301  3      18      3.600680e+18  54865    1
2        178.627355  -1.537098   18.18851  16.73620  15.96820  15.54818  15.24338  1231.0    301  6      63      3.716700e+17  52370    1
         178.518991  -0.169398   19.14489  18.65624  18.03269  17.68574  17.40549  756.0     301  3      419     3.198940e+17  51943    1
         178.521877  -3.416499   19.27386  18.19659  17.59615  17.29579  17.05360  1140.0    301  1      171     3.715730e+17  52370    1
         178.522722  -1.510858   19.05686  17.45437  16.53960  16.11254  15.76796  1231.0    301  6      62      3.716720e+17  52370    1
                                                                                                                                       ..
1        204.339438   65.841752  16.770

In [10]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [11]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_model = LogisticRegression(random_state=1)

# Fit the model using training data
logistic_model.fit(X_train, y_train)

In [12]:
# Make a prediction using the testing data
y_pred = logistic_model.predict(X_test)

In [13]:
# Generate a confusion matrix for the model
conf_matrix = confusion_matrix(y_test, y_pred)

In [14]:
# Print the classification report for the model
print(f"Logistic Regression Model Report:\n{classification_report(y_test, y_pred)}")

Logistic Regression Model Report:
              precision    recall  f1-score   support

      GALAXY       0.00      0.00      0.00       999
         QSO       0.00      0.00      0.00       169
        STAR       0.42      1.00      0.59       832

    accuracy                           0.42      2000
   macro avg       0.14      0.33      0.20      2000
weighted avg       0.17      0.42      0.24      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Train-test split again for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Train Random Forest Model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [17]:
# Predictions
y_pred_rf = clf.predict(X_test)

In [18]:
# Evaluate model performance
print(f"Random Forest Model Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print(f"Random Forest Model Report:\n{classification_report(y_test, y_pred_rf)}")

Random Forest Model Accuracy: 1.00
Random Forest Model Report:
              precision    recall  f1-score   support

      GALAXY       1.00      1.00      1.00       996
         QSO       1.00      1.00      1.00       190
        STAR       1.00      1.00      1.00       814

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

