In [None]:
import matplotlib.pyplot as plt, numpy as np, joblib, os
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.model_selection import train_test_split 
from stellarutil import Simulation, Star 
from concurrent.futures import ThreadPoolExecutor
from IPython.display import clear_output

# Shared containers and run configuration for this notebook.
X, Y = [], []  # X: feature rows fed to the ML model; Y: matching target values
SIMULATION = 'm10v_res250md'  # name of the simulation to load
HALOS = 1  # how many halos (indices 0 .. HALOS-1) to process

def populate_data(SIMULATION):
    """Fill the module-level ``X`` and ``Y`` lists from the first ``HALOS`` halos.

    For every star of every processed halo, one feature row is appended to
    ``X`` and one target value is appended to ``Y``.  The target is the summed
    mass of all dark-matter particles whose distance from the halo centre is
    smaller than ``star.get_3DR()``.

    Args:
        SIMULATION: Simulation name forwarded to ``Simulation(simulation_name=...)``.

    Side effects:
        Appends to the globals ``X`` and ``Y``, clears the cell output once the
        simulation is loaded, and prints one progress line per halo.
    """
    # Load the simulation with both star and dark-matter particles.
    sim = Simulation(simulation_name=SIMULATION, species=['star', 'dark'])
    clear_output()  # discard whatever was written to the cell output while loading

    # Raw (not yet re-centred) coordinates and masses of every dark-matter particle.
    dark_x = sim.particles['dark']['position'][:,0]
    dark_y = sim.particles['dark']['position'][:,1]
    dark_z = sim.particles['dark']['position'][:,2]
    dark_m = sim.particles['dark']['mass']

    for i in range(HALOS):
        halo = sim.get_halo(i)

        # Distance of every dark-matter particle from this halo's centre.
        halo_dark_distances = np.sqrt(
            np.square(dark_x - halo.xc)
            + np.square(dark_y - halo.yc)
            + np.square(dark_z - halo.zc)
        )

        # X - the features the model uses to predict Y.
        # NOTE(review): the row mixes x, y with vz and lists the radius twice.
        # That looks accidental, but the layout is kept as-is because later
        # cells index column 4 and saved datasets may share it - confirm
        # before changing.
        for star in halo.stars:
            radius = star.get_3DR()
            X.append([star.x, star.y, star.vz, star.a, radius, radius])

        def get_dm_mass(star: Star):
            # Keep only the particles closer to the halo centre than the
            # star's r, then total their masses.
            return np.sum(dark_m[halo_dark_distances < star.get_3DR()])

        # Y - one enclosed dark-matter mass per star, computed on worker
        # threads.  executor.map yields results in the order of halo.stars,
        # so Y stays aligned with the rows just appended to X.
        with ThreadPoolExecutor(max_workers=12) as executor:
            Y.extend(executor.map(get_dm_mass, halo.stars))

        print(f'Finished halo {i+1}/{HALOS}')

# Build the dataset: appends feature rows to X and target masses to Y for the configured simulation.
populate_data(SIMULATION)

In [None]:
# Hold out 20% of the samples for evaluation (fixed seed, so the split is
# repeatable) and turn every partition into a NumPy array.
X_train, X_test, y_train, y_test = map(
    np.array,
    train_test_split(X, Y, test_size=0.2, random_state=42),
)

# Fit a 5-nearest-neighbour regressor on the training partition.
knn_regressor = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Predicted targets for the held-out samples.
y_pred = np.array(knn_regressor.predict(X_test))

def getAccuracy(predicted=None, actual=None):
    """Return 100 minus the mean symmetric percent difference, rounded to 3 decimals.

    For each pair ``(a, b)`` the percent difference is
    ``100 * 2 * |a - b| / (a + b)``, rounded to 3 decimals before averaging.

    Args:
        predicted: Predicted values. Defaults to the notebook global ``y_pred``.
        actual: Reference values. Defaults to the notebook global ``y_test``.

    Returns:
        The accuracy in percent as a float.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    predicted = np.asarray(y_pred if predicted is None else predicted, dtype=float)
    actual = np.asarray(y_test if actual is None else actual, dtype=float)

    if predicted.shape != actual.shape:
        raise ValueError(
            f"predicted and actual must have the same shape, got {predicted.shape} and {actual.shape}"
        )
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of an empty prediction set")

    gap = 2 * np.abs(predicted - actual)
    total = predicted + actual
    # Identical values differ by 0% even when both are zero; without this
    # guard that case would evaluate 0/0 and turn the whole average into NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = np.where(gap == 0, 0.0, gap / total)

    percent_diff = np.round(100 * ratio, 3)
    # Accuracy is 100 - average percent difference
    return round(float(100 - percent_diff.sum() / percent_diff.size), 3)

def graph(x, y, title, r, vmin=0.011, vmax=1.5):
    """Scatter ``y`` against ``x`` on log-log axes, coloured by ``r``, with a y = x guide line.

    Args:
        x: Values plotted on the horizontal axis (labelled as the actual mass).
        y: Values plotted on the vertical axis (labelled as the predicted mass).
        title: Title shown above the plot.
        r: Per-point values mapped to colour.
        vmin: Lower limit of the colour scale (defaults to the previous fixed value).
        vmax: Upper limit of the colour scale (defaults to the previous fixed value).
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Draw on the current axes so any figure the caller has set up is reused.
    ax = plt.gca()

    # Create the scatter plot and its colour bar
    points = ax.scatter(x, y, label='Data Points', c=r, vmin=vmin, vmax=vmax)
    ax.figure.colorbar(points, ax=ax)

    # The y = x reference line spans the combined range of both series.
    lowest = min(np.min(x), np.min(y))
    highest = max(np.max(x), np.max(y))
    ax.plot([lowest, highest], [lowest, highest], color='green', label='y = x')

    # Add labels and legend
    ax.set_xlabel('Actual Mass [M☉]')
    ax.set_ylabel('Predicted Mass [M☉]')
    ax.set_title(title)
    ax.legend()

    # Logarithmic scale on both axes
    ax.set_xscale('log')
    ax.set_yscale('log')

    # Show the plot
    plt.show()
    

# Summarise the evaluation: headline accuracy, dataset size, held-out size.
print(
    f"Average Accuracy: { getAccuracy() }%",
    f"Total Stars: { len(X)}",
    f"Predicted Stars: { len(X_test)}",
    sep="\n",
)

# Predicted against actual mass, coloured by feature column 4
# (the star.get_3DR() value when X was built by populate_data).
graph(
    x=y_test,
    y=y_pred,
    title="Predicted vs Actual Dark Matter Mass",
    r=X_test[:, 4],
)


# Load X and Y for the first 100 halos

In [None]:
# Saved feature (X) / target (Y) pairs, one pair per dataset directory.
# NOTE(review): joblib.load unpickles its input - only point it at trusted files.
X_BV, Y_BV = map(joblib.load, (
    '../data/pickle/big_victor/big_victor_h100_X.pkl',
    '../data/pickle/big_victor/big_victor_h100_Y.pkl',
))
X_LV, Y_LV = map(joblib.load, (
    '../data/pickle/little_victor/little_victor_h100_X.pkl',
    '../data/pickle/little_victor/little_victor_h100_Y.pkl',
))
X_LR, Y_LR = map(joblib.load, (
    '../data/pickle/little_romeo/little_romeo_h100_X.pkl',
    '../data/pickle/little_romeo/little_romeo_h100_Y.pkl',
))

# Use the little_romeo pair as the active dataset; this rebinds the shared X and Y names.
X, Y = X_LR, Y_LR