In [1]:
import pandas as pd
import numpy as np
import joblib
import lizard
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from Static_code_analyzer import static_code_analyzer


In [None]:
# ====== Load Dataset ======
df = pd.read_csv('../Datasets/processed_jm1.csv')
X = df.drop("defects", axis=1).values
y = df["defects"].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, '../models/scaler.pkl')

['models/scaler.pkl']

In [None]:
rf_model = joblib.load("../models/rf_model.pkl")
xgb_model = joblib.load("../models/xgb_model.pkl")

In [None]:
class ANN(nn.Module):
    def __init__(self, input_dim):
        super(ANN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.layers(x)

def train_ann_model(X_train_scaled, y_train, input_dim, epochs=50, batch_size=32):
    model = ANN(input_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train_scaled, dtype=torch.float32),
        torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        for batch_x, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    # Save model after training
    torch.save(model.state_dict(), "../models/ann_model.pth")
    return model


In [None]:
class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, num_layers=1):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers, batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]  # last output
        out = self.fc(out)
        return out

def train_rnn_model(X_train_scaled, y_train, input_dim, epochs=50, batch_size=32):
    model = RNN(input_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    X_rnn_train = torch.tensor(X_train_scaled, dtype=torch.float32).unsqueeze(1)  # Add seq_len dim
    y_tensor = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)

    dataset = torch.utils.data.TensorDataset(X_rnn_train, y_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        for batch_x, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    # Save model after training
    torch.save(model.state_dict(), "../models/rnn_model.pth")
    return model

In [9]:
ann_model = train_ann_model(X_train_scaled, y_train, input_dim=X_train_scaled.shape[1])
rnn_model = train_rnn_model(X_train_scaled, y_train, input_dim=X_train_scaled.shape[1])

In [10]:
ann_model.eval()

ANN(
  (layers): Sequential(
    (0): Linear(in_features=21, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [11]:
rnn_model.eval()

RNN(
  (rnn): RNN(21, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:
# ML models prediction (train)
rf_train_pred = rf_model.predict_proba(X_train_scaled)[:, 1]
xgb_train_pred = xgb_model.predict_proba(X_train_scaled)[:, 1]

# ANN prediction (train)
with torch.no_grad():
    ann_train_pred = torch.sigmoid(ann_model(torch.tensor(X_train_scaled, dtype=torch.float32))).numpy().flatten()

# RNN prediction (train)
with torch.no_grad():
    rnn_train_pred = torch.sigmoid(rnn_model(torch.tensor(X_train_scaled, dtype=torch.float32).unsqueeze(1))).numpy().flatten()

# Combine for stacked training set
stacked_train_X = np.vstack((rf_train_pred, xgb_train_pred, ann_train_pred, rnn_train_pred)).T


In [None]:
# ML models prediction (test)
rf_test_pred = rf_model.predict_proba(X_test_scaled)[:, 1]
xgb_test_pred = xgb_model.predict_proba(X_test_scaled)[:, 1]

# ANN prediction (test)
with torch.no_grad():
    ann_test_pred = torch.sigmoid(ann_model(torch.tensor(X_test_scaled, dtype=torch.float32))).numpy().flatten()

# RNN prediction (test)
with torch.no_grad():
    rnn_test_pred = torch.sigmoid(rnn_model(torch.tensor(X_test_scaled, dtype=torch.float32).unsqueeze(1))).numpy().flatten()

# Combine for stacked test set
stacked_test_X = np.vstack((rf_test_pred, xgb_test_pred, ann_test_pred, rnn_test_pred)).T


In [None]:
class MetaANN(nn.Module):
    def __init__(self):
        super(MetaANN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(4, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
    def forward(self, x):
        return self.fc(x)

def train_meta_model(stacked_X, y_train, epochs=100):
    model = MetaANN()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    stacked_tensor = torch.tensor(stacked_X, dtype=torch.float32)
    y_tensor = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
    dataset = torch.utils.data.TensorDataset(stacked_tensor, y_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

    for epoch in range(epochs):
        model.train()
        for batch_x, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    torch.save(model.state_dict(), "../models/meta_model.pth")
    return model


In [15]:
# Train meta-model
meta_model = train_meta_model(stacked_train_X, y_train, epochs=100)

In [16]:
# Load meta-model for testing
meta_model = torch.load("models/meta_model.pth")
# meta_model.eval()

In [17]:
# Predict on stacked test
with torch.no_grad():
    meta_test_output = meta_model(torch.tensor(stacked_test_X, dtype=torch.float32))
    meta_test_probs = torch.sigmoid(meta_test_output).numpy().flatten()
    meta_test_pred = (meta_test_probs > 0.4).astype(int)

TypeError: 'collections.OrderedDict' object is not callable

In [None]:
# Evaluate
print("=== Stacked Hybrid Evaluation on Test Set ===")
print("Accuracy:", accuracy_score(y_test, meta_test_pred))
print("Precision:", precision_score(y_test, meta_test_pred))
print("Recall:", recall_score(y_test, meta_test_pred))
print("F1 Score:", f1_score(y_test, meta_test_pred))

=== Stacked Hybrid Evaluation on Test Set ===
Accuracy: 0.8772779043280182
Precision: 0.8758700696055685
Recall: 0.8743485813549507
F1 Score: 0.8751086641553173
