In [55]:
'''
get_one_set_one_Xscore(set, model, scaler) -> list of scores 

get_one_set_one_Pscore(set, model, scaler) -> list of scores

get_one_set_all_scores(set) -> 2d list of scores (models labeled) and totals

get_all_sets_all_scores -> above method but for all sets 

'''

'\nget_one_set_one_Xscore(set, model, scaler) -> list of scores \n\nget_one_set_one_Pscore(set, model, scaler) -> list of scores\n\nget_one_set_all_scores(set) -> 2d list of scores (models labeled) and totals\n\nget_all_sets_all_scores -> above method but for all sets \n\n'

In [56]:
import glob
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt
import colorsys
import joblib

In [57]:
import torch
import torch.nn as nn

class DeeperNLCModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        dropout_rate = 0.5

        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),

            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(),

            nn.Linear(16, 8),
            nn.BatchNorm1d(8),
            nn.ReLU(),

            nn.Linear(8, 4),
            nn.BatchNorm1d(4),
            nn.ReLU(),

            nn.Linear(4, 1)  # Final output layer
        )

    def forward(self, x):
        return self.model(x)


class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, input, target):
        bce_loss = self.bce(input, target)
        pt = torch.exp(-bce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal.mean()


In [58]:
def input_from_csv(set_path):
    X = pd.read_csv(set_path)
    ids = X['id']
    X = X.drop(columns=['id', 'is_centrosymmetric', "crystal_system"])
    X = X.values
    return X, ids

In [59]:
def scale(scaler_path, X):
    scaler = joblib.load(scaler_path)
    X = scaler.transform(X)
    return X

In [60]:
def score_with_xgb(model_path, X):
    model = xgb.XGBClassifier()
    model.load_model(model_path)
    probs = model.predict_proba(X)[:, 1]
    score = np.zeros_like(probs, dtype=int)

    thresholds = np.round(np.arange(0.30, 1.00, 0.01), 2)
    
    for threshold in thresholds:
        score += (probs >= threshold).astype(int)
    print(f"{model_path}: Done scoring")

    return score 


In [61]:
def score_with_torch(model_path, X):
    input_tensor = torch.tensor(X, dtype=torch.float32)
    #model = torch.load(model_path, weights_only=False)
    model = DeeperNLCModel(296)  # replace with actual feature count
    # Load the state dict
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)

    model.eval()

    # Run inference
    with torch.no_grad():
        outputs = model(input_tensor)

    # If output is logits, convert to probabilities
    if outputs.shape[1] == 1:
        probs = torch.sigmoid(outputs).squeeze().numpy()  # shape: (n_samples,)
    else:
        probs = torch.softmax(outputs, dim=1)[:, 1].numpy()  # shape: (n_samples,)

    thresholds = np.round(np.arange(0.30, 1.00, 0.01), 2)

    # Apply thresholds to compute scores
    score = np.zeros_like(probs, dtype=int)
    for threshold in thresholds:
        score += (probs >= threshold).astype(int)

    print(f"{model_path}: Done scoring")
    return(score)

In [62]:
def get_xgb_ensemble(set_path):
    X, ids = input_from_csv(set_path)
    model_paths = glob.glob("models/x*")
    scaler_paths = glob.glob("scalers/x*")
    print(len(model_paths), len(scaler_paths))
    model_scores = []

    for i in range(len(model_paths)):
        X = scale(scaler_paths[i], X)
        score = score_with_xgb(model_paths[i], X)
        print(np.argmax(score))
        model_scores.append(score)

    scores_df = pd.DataFrame(model_scores).T
    scores_df.columns = [f"model_{i}_score" for i in range(len(model_scores))]
    scores_df["total_score"] = scores_df.sum(axis=1)
    scores_df.insert(0, "id", ids.values)

    # Save
    scores_df.to_csv("xgb_scores.csv", index=False)
    print("Saved threshold_scores.csv with per-model and total scores.")

In [63]:
def get_torch_ensemble(set_path):
    X, ids = input_from_csv(set_path)
    model_paths = glob.glob("models/t*")
    scaler_paths = glob.glob("scalers/t*")
    print(len(model_paths), len(scaler_paths))
    model_scores = []

    for i in range(len(model_paths)):
        X = scale(scaler_paths[i], X)
        score = score_with_torch(model_paths[i], X)
        print(model_paths[i], scaler_paths[i])
        model_scores.append(score)

    scores_df = pd.DataFrame(model_scores).T
    scores_df.columns = [f"model_{i}_score" for i in range(len(model_scores))]
    scores_df["total_score"] = scores_df.sum(axis=1)
    scores_df.insert(0, "id", ids.values)

    # Save
    scores_df.to_csv("torch_scores.csv", index=False)
    print("Saved threshold_scores.csv with per-model and total scores.")

In [None]:
path = 'data/Set_11/featurized_materials.csv'
get_xgb_ensemble(path)
get_torch_ensemble(path)

10 10
models\x0.json: Done scoring
1512
models\x1.json: Done scoring
2393
models\x2.json: Done scoring
0
models\x3.json: Done scoring
458
models\x4.json: Done scoring
164
models\x5.json: Done scoring
3442
models\x6.json: Done scoring
1664
models\x7.json: Done scoring
2561
models\x8.json: Done scoring
796
models\x9.json: Done scoring
0
Saved threshold_scores.csv with per-model and total scores.
10 10
models\t0.pt: Done scoring
models\t0.pt scalers\t0.pkl
models\t1.pt: Done scoring
models\t1.pt scalers\t1.pkl
models\t2.pt: Done scoring
models\t2.pt scalers\t2.pkl
models\t3.pt: Done scoring
models\t3.pt scalers\t3.pkl
models\t4.pt: Done scoring
models\t4.pt scalers\t4.pkl
models\t5.pt: Done scoring
models\t5.pt scalers\t5.pkl
models\t6.pt: Done scoring
models\t6.pt scalers\t6.pkl
models\t7.pt: Done scoring
models\t7.pt scalers\t7.pkl
models\t8.pt: Done scoring
models\t8.pt scalers\t8.pkl
models\t9.pt: Done scoring
models\t9.pt scalers\t9.pkl
Saved threshold_scores.csv with per-model and t

In [65]:
xgb_scores_df = pd.read_csv("xgb_scores.csv")

# Sort by total_score in descending order and take top 10
top10_xgb = xgb_scores_df.sort_values(by="total_score", ascending=False).head(20)

# Print only ID and total score
print("Top 10 Highest Scoring Materials (XGBoost):")
print(top10_xgb[["id", "total_score"]])

Top 10 Highest Scoring Materials (XGBoost):
              id  total_score
426    mp-561472          185
1894   mp-643099          184
2396   mp-866713          183
7175  mp-1193600          183
7153  mp-1182214          181
7618   mp-684725          181
5953  mp-2232217          176
2566  mp-1199165          176
5909  mp-2229444          175
4828  mp-1176743          175
7170  mp-1192990          174
4807  mp-1238368          174
2768  mp-1212522          173
4294  mp-1233347          171
4984   mp-696931          170
2422  mp-1251539          170
1512  mp-2210626          169
4315  mp-1233464          167
3093   mp-755030          167
6619   mp-583429          166


In [66]:
scores_df = pd.read_csv("torch_scores.csv")

# Sort by total_score descending and select top 10
top10 = scores_df.sort_values(by="total_score", ascending=False).head(20)

# Print the results
print("Top 10 Highest Scoring Materials:")
print(top10[["id", "total_score"]])

Top 10 Highest Scoring Materials:
              id  total_score
2238   mp-542807          174
4300  mp-1233394          172
2768  mp-1212522          171
1252  mp-1219288          165
5520  mp-1235328          163
677   mp-1228774          160
6483  mp-1219405          156
4271  mp-1233222          155
5581  mp-1236046          154
692   mp-1229021          153
453   mp-1045011          151
4281  mp-1233289          149
4308  mp-1233439          149
4336  mp-1233587          145
528   mp-1219471          143
2876  mp-1226966          142
694   mp-1229079          142
7409  mp-2715263          141
4663   mp-807421          140
6509  mp-1228323          139


In [67]:
# Read the CSV files
torch_df = pd.read_csv("torch_scores.csv")
xgb_df = pd.read_csv("xgb_scores.csv")

# Extract id and last column (total score) from each
torch_scores = torch_df.iloc[:, [0, -1]].copy()
xgb_scores = xgb_df.iloc[:, [0, -1]].copy()

# Rename columns
torch_scores.columns = ['id', 'torch_score']
xgb_scores.columns = ['id', 'xgb_score']

# Merge on id
combined_df = pd.merge(torch_scores, xgb_scores, on='id', how='inner')

# Create total_score column as sum of torch and xgb scores
combined_df['total_score'] = combined_df['torch_score'] + combined_df['xgb_score']

# Save to CSV
combined_df.to_csv("combined_scores.csv", index=False)

In [68]:
top_ten = combined_df.sort_values(by='total_score', ascending=False).head(10)

print(top_ten)

              id  torch_score  xgb_score  total_score
2768  mp-1212522          171        173          344
2238   mp-542807          174        164          338
4300  mp-1233394          172        147          319
5520  mp-1235328          163        150          313
4271  mp-1233222          155        143          298
4336  mp-1233587          145        147          292
2396   mp-866713          107        183          290
2876  mp-1226966          142        147          289
2239   mp-581276          120        164          284
5581  mp-1236046          154        130          284
