In [8]:
DATABASE_PATH = '../database/abundances.db'
element_list = ['Fe', 'Ti', 'Ca', 'Si', 'Al', 'Mg', 'Na']

In [9]:
FETCH_LIMIT = None

In [None]:
import json
import sqlite3



import sqlite3
import pandas as pd
from flask import jsonify

def query_db(query, params=(), limit=None):
    db_path = DATABASE_PATH
    connection = sqlite3.connect(db_path)

    # Add LIMIT clause if a limit is specified
    if limit is not None:
        query += f" LIMIT {limit}"
        
    # Using pandas to read the query result directly into a DataFrame
    df = pd.read_sql_query(query, connection, params=params)

    connection.close()

    # Convert DataFrame to a list of dictionaries
    result_list = df.to_dict(orient='records')

    return result_list


In [None]:
query = "SELECT * FROM abundances"
data = []

In [None]:
import time

In [17]:
# Start the timer to measure the query execution time

start_time = time.time()

result = query_db(query, data,FETCH_LIMIT )


end_time = time.time()
# Print the time taken to perform the query
print(f"time to fetch : {end_time - start_time:.4f} seconds")

time to fetch : 2.3561 seconds


In [18]:
len(result) / 7

51277.0

In [19]:
import sys


# Get the size of the variable
size_in_bytes = sys.getsizeof(result)

print(f"Size of my_list: {size_in_bytes//1024**2} MB")


Size of my_list: 2 MB


In [20]:
import time
from sklearn.neighbors import KDTree
import numpy as np

points = result

In [21]:
# kernel = True

In [22]:
# Create a KDTree for each element
element_trees = {}
element_points_dict = {}

# Start the timer to measure the query execution time
start_time = time.time()

for element in element_list:
    # print("creating for element", element)
    element_points = [
        point for point in points if point['element'] == element]
    element_points_dict[element] = element_points
    element_points = [(point['lat'], point['long'])
                        for point in element_points]
    element_trees[element] = KDTree(np.array(element_points))

end_time = time.time()
# Print the time taken to perform the query
print(f"Trees creation time : {end_time - start_time:.4f} seconds")

# Query function
def get_nearby_points(query_point, radius_degrees, element):
    query_array = np.array([query_point])

    # Query the relevant tree for the specified element
    tree = element_trees.get(element)
    if tree:
        indices = tree.query_radius(query_array, r=radius_degrees)
        return [points[i] for i in indices[0]]
    else:
        return []



Trees creation time : 0.3250 seconds


In [30]:
len(element_points_dict['Mg'])

51277

## inspecting dates

In [49]:
s1 = {point['date'] for point in points}
# Convert to a list
s1 = sorted(list(s1))
s1

# Save to JSON
with open("../static/dates.json", "w") as f:
    json.dump(s1, f, indent=4)

print("Unique dates saved to 'dates.json'.")

Unique dates saved to 'dates.json'.


## histograms

In [42]:
# Initialize a dictionary to store histogram data
histograms = {}

# Define bins
bins = np.arange(0, 51, 5)  # 0-10, 10-20, ..., 90-100

# Compute histograms with normalization
for element, points in element_points_dict.items():
    # Extract abundances
    abundances = [entry["abundance"] for entry in points]
    
    # Compute histogram counts
    counts, _ = np.histogram(abundances, bins=bins)
    
    # Normalize counts
    total = sum(counts) 
    normalized_counts = (counts*100 / total).tolist() if total > 0 else counts.tolist()
    
    # Store the result in the histogram dictionary
    histograms[element] = normalized_counts

histograms


histograms

{'Fe': [9.037190163231077,
  90.11447627591318,
  0.49534879185599784,
  0.35298476899974646,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'Ti': [95.77783411666049,
  1.404138307623301,
  2.1725139926282737,
  0.6455135830879342,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'Ca': [0.0,
  99.01515299256977,
  0.6143105095851942,
  0.3705364978450377,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'Si': [3.4420890457710085,
  2.9135869883183494,
  2.2388205238215964,
  1.5894065565458197,
  89.36170212765957,
  0.26717631686721144,
  0.13456325448056633,
  0.04290422606626753,
  0.009750960469606256,
  0.0],
 'Al': [0.0,
  2.654211439826823,
  91.02326579168049,
  6.320572576398775,
  0.0019501920939212513,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'Mg': [2.2407707159155175,
  89.99941494237183,
  2.7595218128985706,
  5.0002925288140885,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'Na': [100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [43]:
# Save to JSON
with open("../static/histograms.json", "w") as f:
    json.dump(histograms, f, indent=4)

print("Histograms computed and saved to 'histograms.json'.")

Histograms computed and saved to 'histograms.json'.


{'Fe': [50842, 435, 0, 0, 0, 0, 0, 0, 0, 0],
 'Ti': [49832, 1445, 0, 0, 0, 0, 0, 0, 0, 0],
 'Ca': [50772, 505, 0, 0, 0, 0, 0, 0, 0, 0],
 'Si': [3259, 1963, 45959, 91, 5, 0, 0, 0, 0, 0],
 'Al': [1361, 49915, 1, 0, 0, 0, 0, 0, 0, 0],
 'Mg': [47298, 3979, 0, 0, 0, 0, 0, 0, 0, 0],
 'Na': [51277, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [24]:
from sklearn.cluster import KMeans
import numpy as np



In [25]:
len(element_points)

51277

[(23.820325, 66.53385),
 (66.61075, 50.301875),
 (28.742700000000003, 61.86415),
 (82.213425, -31.867500000000003),
 (-62.454875, -120.05075),
 (-68.75257500000001, -131.0405),
 (-18.6498, 41.1941),
 (83.10572499999999, 137.19575),
 (78.4819, -102.618),
 (53.2174, 56.299225),
 (17.470225, -25.13315),
 (-64.780925, 50.68655),
 (-13.3702, 60.056450000000005),
 (-56.14205, -130.59775),
 (75.398125, -114.52525),
 (-65.630725, -109.15025),
 (-62.328125, 41.070725),
 (-56.029725, 150.43425),
 (42.786825, -114.38975),
 (24.093325, 70.917275),
 (69.20685, -128.00650000000002),
 (-1.84785, -107.35675),
 (16.177825, -102.20325),
 (-7.8108, 53.661875),
 (67.383425, 149.10975000000002),
 (53.870025, -133.356),
 (-58.618525, 43.332225),
 (27.291075000000003, -98.094475),
 (-50.96755, -128.44225),
 (29.96075, 61.851475),
 (-39.002, -128.31625),
 (-54.621825, 54.139675),
 (79.2592, 39.973725),
 (-14.21225, -4.595225),
 (-86.514425, 47.9276),
 (11.8268, 48.26905000000001),
 (64.7565, -142.85525),
 (68

In [None]:
raise End

## Clustering

In [27]:
# Perform clustering to create 2k spatial clusters
coordinates = element_points
print(len(coordinates))
n_clusters = 2000
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(coordinates)

# Save cluster labels and centers
cluster_centers = kmeans.cluster_centers_

# Map cluster labels back to original points
for point, label in zip(all_points, labels):
    point['cluster'] = label

51277


  super()._check_params_vs_input(X, default_n_init=10)


KeyboardInterrupt: 

In [None]:

# Map cluster labels back to original points
for point, label in zip(all_points, labels):
    point['cluster'] = label

51277

In [None]:
k = np.random.randint(0,len(coordinates))
print(k)
x = element_points_dict['Al'][k]
x['lat'],x['long'],coordinates[k]

34274


(62.106625, -119.697, (62.106625, -119.697))

In [None]:
import numpy as np
from collections import defaultdict

# Initialize variables
M = len(element_points_dict)  # Number of elements
n_clusters = max(labels)+1  # Total distinct labels

# Create a structure to store the cluster means for each element
cluster_means_per_element = {element: [0] * n_clusters for element in element_points_dict}

# Iterate over each element and compute cluster means
for element, points in element_points_dict.items():
    # Create a list to store sums and counts for each cluster
    cluster_sums = [0] * n_clusters
    cluster_counts = [0] * n_clusters

    # Accumulate abundances for each cluster
    for point, label in zip(points, labels):
        cluster_sums[label] += point['abundance']
        cluster_counts[label] += 1

    # Compute means for each cluster
    cluster_means_per_element[element] = [
        cluster_sums[i] / cluster_counts[i] if cluster_counts[i] > 0 else 0 for i in range(n_clusters)
    ]

# Debug: Print a small sample of results
for element, means in cluster_means_per_element.items():
    print(f"Cluster means for {element}: {means[:10]}")  # Print the first 10 clusters as a sample


Cluster means for Fe: [5.0, 4.927243611111112, 4.598622727272727, 4.730957037037037, 5.003585263157895, 6.037532777777778, 4.9999988235294115, 4.913563225806452, 4.707002083333333, 4.291715151515152]
Cluster means for Ti: [1.0, 1.4643670000000002, 2.3284448515151515, 1.6684947037037037, 0.997632, 1.4353126111111112, 0.9999871176470587, 1.4191258064516128, 1.27835225, 1.7878875072727272]
Cluster means for Ca: [9.0, 8.938322222222224, 8.452149393939393, 8.58271888888889, 8.91985842105263, 8.482971666666666, 8.999986470588237, 8.881146451612903, 8.315085833333333, 8.27688818181818]
Cluster means for Si: [20.57894736842105, 20.337225000000004, 19.23774424242424, 18.67700501111111, 20.64185789473684, 18.202651666666668, 20.529458823529414, 19.939537741935485, 18.16532666666667, 18.90643848484848]
Cluster means for Al: [14.0, 13.79764, 13.655601818181818, 14.406977777777778, 13.81233894736842, 13.751982777777776, 13.999976470588235, 14.104067741935483, 14.374305416666667, 13.170244848484849]

In [None]:
cluster_means_per_element['Fe']

[5.0,
 4.927243611111112,
 4.598622727272727,
 4.730957037037037,
 5.003585263157895,
 6.037532777777778,
 4.9999988235294115,
 4.913563225806452,
 4.707002083333333,
 4.291715151515152,
 5.0,
 4.568252222222222,
 4.78565,
 5.000262592592593,
 4.474701428571429,
 4.423752592592592,
 4.928660333333332,
 4.8221661904761906,
 4.819921363636364,
 5.0,
 4.973558571428571,
 4.499446578947368,
 4.8071755,
 4.257302222222222,
 5.973614666666666,
 4.230226774193548,
 4.953333125,
 3.9215324137931034,
 4.66137105263158,
 5.0,
 5.528888235294118,
 4.473563,
 4.178728333333333,
 5.2930728125,
 4.5460313953488365,
 4.278186666666667,
 4.708457333333333,
 5.0,
 5.051892812499999,
 4.174323913043478,
 5.0,
 4.400663636363636,
 5.332151714285714,
 5.125248571428571,
 4.744950882352941,
 4.914285714285714,
 4.723957837837839,
 5.343314814814814,
 5.624776071428571,
 5.0,
 4.679921515151515,
 5.0,
 5.1799740000000005,
 4.409392916666667,
 4.595417727272727,
 4.668676666666666,
 5.2450884615384625,
 4.88

In [None]:
cluster_centers

array([[  13.70680526,  -39.97501711],
       [   8.76942214,   49.12751214],
       [  28.45787576, -131.22604545],
       ...,
       [  70.76836447, -114.10147368],
       [ -87.98729265,   35.43342206],
       [ -88.36097813,   21.08536563]])

In [None]:
# import json

# # Save the elementwise_cluster_data to a JSON file
# with open('static/{element}_clusters.json', 'w') as json_file:
#     json.dump(elementwise_cluster_data, json_file, indent=4)

# print("Data saved to 'elementwise_cluster_data.json'")


In [None]:
# Initialize a dictionary to store the final result
elementwise_cluster_data = {}

# Iterate through each element and its corresponding cluster means
for element, cluster_means in cluster_means_per_element.items():
    elementwise_cluster_data[element] = [
        {
            'element': element,
            'abundance': cluster_means[i],
            'lat': cluster_centers[i][0],  # Latitude of the cluster center
            'long': cluster_centers[i][1]  # Longitude of the cluster center
        }
        for i in range(len(cluster_centers))
    ]

# Example: Print the first cluster data for a specific element (e.g., 'Fe')
print(f"First cluster for Fe: {elementwise_cluster_data['Fe'][0]}")


First cluster for Fe: {'element': 'Fe', 'abundance': 5.0, 'lat': 13.706805263157895, 'long': -39.97501710526315}


In [None]:
import json

# Save the elementwise_cluster_data to a JSON file
with open('../static/clusters.json', 'w') as json_file:
    json.dump(elementwise_cluster_data, json_file, indent=4)

print("Data saved to 'elementwise_cluster_data.json'")


Data saved to 'elementwise_cluster_data.json'


0.5

In [None]:
len(cluster_centers)

2000

In [None]:
from collections import defaultdict

# Prepare a dictionary to store cluster means for each element
cluster_means = defaultdict(dict)  # {element: {cluster_id: mean_abundance}}

for element, points in element_points_dict.items():
    # Group points by cluster
    cluster_values = defaultdict(list)
    for point in points:
        cluster_values[point['cluster']].append(point['abundance'])
    
    # Compute mean abundance for each cluster
    for cluster_id, abundances in cluster_values.items():
        cluster_means[element][cluster_id] = np.mean(abundances)


In [None]:
# raies END
# 

SyntaxError: invalid syntax (1354160052.py, line 1)

In [None]:
# Get the size of the variable
size_in_bytes = sys.getsizeof(element_trees['Fe']) * 7 

print(f"Size of elements trees: {size_in_bytes//1024} KB")

Size of elements trees: 7 KB


In [None]:
# Query for a specific element
query_point = (66.988234, 22.479195)
radius = 1.0  # 2 degrees radius
element = 'Si'

start_time = time.time()

nearby_points = get_nearby_points(query_point, radius, element)

end_time = time.time()
# Print the time taken to perform the query
print(f"Query time : {end_time - start_time:.4f} seconds")

print(f"{len(nearby_points)} points found")



Query time : 0.0030 seconds
16 points found


In [None]:
N1 = 20
l1 = [[i['lat'], i['long']] for i in nearby_points[:N1]]

l1

[[63.384976, 20.88965],
 [63.568389, 21.611099],
 [63.440604, 20.868272],
 [63.623415, 20.941517],
 [63.715584, 20.629065],
 [63.862185, 19.988494],
 [63.981685, 22.22674],
 [63.957563, 21.489285],
 [63.989492, 20.865799],
 [63.808552, 22.329401],
 [64.198639, 19.929209],
 [64.217924, 21.590061],
 [64.260436, 20.445086],
 [64.406685, 21.841401],
 [64.445796, 20.796922],
 [64.577912, 19.619538],
 [64.639676, 19.551581],
 [64.764734, 19.207426],
 [64.813723, 21.18727],
 [64.874453, 22.78601]]