# Generate the data

In [1]:
import json
import random

# Sample templates for code snippets
code_templates = [
    "public class {class_name} {{ public static String {method_name}() {{ return System.getenv('{env_variable}'); }} }}",
    "public class {class_name} {{ public static int {method_name}() {{ return Integer.parseInt(System.getenv('{env_variable}')); }} }}",
    "public class {class_name} {{ public static boolean {method_name}() {{ return Boolean.parseBoolean(System.getenv('{env_variable}')); }} }}"
]

# Random data for class names, method names, and environment variables
class_names = ["Config", "ServiceConfig", "AppConfig", "DatabaseConfig", "SecurityConfig"]
method_names = ["getUrl", "getPort", "getFlag", "getServiceUrl", "getDatabaseUrl"]
env_variables = ["DATABASE_URL", "SERVICE_URL", "PORT", "ENABLE_FLAG", "LOGGING_URL"]

# Function to generate random code
def generate_random_code():
    template = random.choice(code_templates)  # Choose a random template
    class_name = random.choice(class_names)   # Choose a random class name
    method_name = random.choice(method_names)  # Choose a random method name
    env_variable = random.choice(env_variables)  # Choose a random environment variable
    return template.format(class_name=class_name, method_name=method_name, env_variable=env_variable)

# Function to generate items with random labels and code
def generate_json_data():
    data = []
    for i in range(1, 101):
        item = {
            "id": i,
            "code": generate_random_code(),
            "label": random.choice(["cloud_native", "non_cloud_native"])
        }
        data.append(item)
    return data

# Generate data
json_data = generate_json_data()

# Save to a .json file
file_path = "random_code_data.json"
with open(file_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

In [2]:
!pip install -U -q sentence-transformers==3.1.1

# Load the data

In [3]:
import json

with open('/kaggle/working/random_code_data.json', 'r') as f:
    data = json.load(f)
    
# Sperate code and labels
codes=[item['code'] for item in data]
labels=[1 if item['label'] =='cloud_native' else 0 for item in data]
print("Loaded", len(codes), "sample")

Loaded 100 sample


# Calculating embeddings

In [4]:
import torch
from sentence_transformers import SentenceTransformer


# load a pre-trained sentence transformer model
device='cuda'
model=SentenceTransformer('paraphrase-MiniLM-L6-v2')
model.to(device)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [5]:
# Embed the code snippets using GPU
embeddings=model.encode(codes, convert_to_tensor=True, device=device)
print("Embeddings shape:", embeddings.shape)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings shape: torch.Size([100, 384])


In [6]:
from sklearn.model_selection import train_test_split

# Convert embeddings to a numpy array
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

# Create dataset

In [7]:
class CodeDataset(torch.utils.data.Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings=embeddings
        self.labels=torch.tensor(labels, dtype=torch.float32)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]
    

train_dataset = CodeDataset(X_train, y_train)
test_dataset = CodeDataset(X_test, y_test)

In [8]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define neural network

**Why Add Batch Normalization?**

* Batch normalization helps in stabilizing and speeding up training by normalizing the input to each layer, ensuring that they have zero mean and unit variance.

* It also helps mitigate the vanishing gradient problem and improves generalization by adding a slight regularization effect.

In [9]:
import torch.nn as nn
import torch.optim as optim

class Classifier(nn.Module):
    def __init__(self, input_size):
        super(Classifier,self).__init__()
        self.fc1=nn.Linear(input_size, 256) # increase size
        self.bn1=nn.BatchNorm1d(256) # add batch normalization for first layer
        self.fc2=nn.Linear(256,128)
        self.bn2 = nn.BatchNorm1d(128)  # Add batch normalization for second layer
        self.dropout = nn.Dropout(0.02)  # 2% dropout
        self.fc3=nn.Linear(128,1)
        
    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))  # Apply batch normalization before ReLU
        x = torch.relu(self.bn2(self.fc2(x)))  # Apply batch normalization before ReLU
        x=self.dropout(x) # apply dropout
        x=self.fc3(x)
        return x

classifier=Classifier(input_size=embeddings.shape[1]).to(device)

In [10]:
# Using BCEWithLogitsLoss combines the sigmoid activation and binary cross entropy into one function,
# which improves numerical stability.
criterion=nn.BCEWithLogitsLoss()

In [11]:
import torch.optim.lr_scheduler as lr_scheduler

optimizer=optim.Adam(classifier.parameters(), lr=1e-4)

# learning rate scheduler to adjust the learning rate dynamically during training.
# it help achieve better results by reducing the learning rate when the model plateaus.
# gamma=0.1
# after every step_size number of epochs, the learning rate will be reduced to 10% of its previous value
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)  # Reduces LR by 10x every 5 epochs

In [12]:
for epoch in range(10):
    classifier.train()
    total_loss=0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs=classifier(inputs).squeeze()
        
        loss=criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()

    # Step the scheduler at the end of each epoch
    scheduler.step()
    print(f"Eopch {epoch+1}, Loss: {total_loss/len(train_loader)}")
    
classifier.eval()
all_preds=[]

with torch.no_grad():
    for inputs, _ in test_loader:
        inputs=inputs.to(device)
        preds=classifier(inputs).squeeze().cpu().numpy()
        all_preds.extend(preds)

# convert predictions to binary class
predictions=[1 if pred>=0.5 else 0 for pred in all_preds]

Eopch 1, Loss: 0.7117504835128784
Eopch 2, Loss: 0.6450997114181518
Eopch 3, Loss: 0.6238893151283265
Eopch 4, Loss: 0.5896805405616761
Eopch 5, Loss: 0.5809497594833374
Eopch 6, Loss: 0.564728844165802
Eopch 7, Loss: 0.5475818157196045
Eopch 8, Loss: 0.5543622732162475
Eopch 9, Loss: 0.555079209804535
Eopch 10, Loss: 0.5455203473567962


# Calculate performance

## Precision

This metric indicates how many of the predicted positive instances were actually correct.
* For class 0, the precision is 0.43, meaning 43% of the instances predicted as class 0 were correct.
* For class 1, the precision is 0.38, meaning 38% of the instances predicted as class 1 were correct.

## Recall(Sensitivity or True Positive Rate)

This measures how many of the actual positive instances were correctly identified.

* For class 0, the recall is 0.27, meaning that only 27% of the actual instances of class 0 were correctly identified by the model.
* For class 1, the recall is 0.56, meaning that 56% of the actual instances of class 1 were correctly identified by the model.

## F1-Score

The F1-Score is the harmonic mean of precision and recall, providing a balanced measure of model performance when precision and recall are of equal importance.

* For class 0, the F1-score is 0.33, which indicates a low balance between precision and recall.
* For class 1, the F1-score is 0.45, which is slightly better than class 0, suggesting better performance for class 1.

## Support

This is the number of true instances of each class in the test set.

* There are 11 instances of class 0.
* There are 9 instances of class 1.

## Accuracy

This is the overall accuracy of the model, defined as the number of correct predictions divided by the total number of predictions.

* The accuracy is 0.40 (40%), which means that 40% of the total predictions made by the model were correct.

## Macro Avg

This is the average of precision, recall, and F1-score for both classes, calculated equally without taking class imbalances into account.

* The macro average precision, recall, and F1-score are 0.41, 0.41, and 0.39, respectively.

## Weighted Avg

This is the weighted average of precision, recall, and F1-score, taking the number of instances in each class (support) into account.

* The weighted average precision, recall, and F1-score are all around 0.41, 0.40, and 0.39, reflecting how well the model is performing on average, considering the class distribution.

# Key Observations:

* The model has poor performance, as reflected by the low precision, recall, and F1-scores for both classes.
* The model is especially struggling to correctly classify instances of class 0 (low recall of 0.27).
* With an accuracy of 40%, the model is not performing well for a binary classification task where random guessing would give around 50% accuracy.
* It may indicate class imbalance or model underfitting, where neither class is being well distinguished by the model.

In [13]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.47      1.00      0.64         9
           1       1.00      0.09      0.17        11

    accuracy                           0.50        20
   macro avg       0.74      0.55      0.40        20
weighted avg       0.76      0.50      0.38        20



# Sample

In [14]:
# New code snippet
new_code = ["public class Example { public static String getEnvVar() { return System.getenv(\"APP_PORT\"); }}"]

# Embed the new code snippet
new_code_embedding = model.encode(new_code, convert_to_tensor=True, device=device)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# Predict the class
classifier.eval()
with torch.no_grad():
    new_code_prediction = classifier(new_code_embedding).item()
    new_code_class = "cloud_native" if new_code_prediction >= 0.5 else "non_cloud_native"

print(f"The new code snippet is classified as: {new_code_class}")

The new code snippet is classified as: non_cloud_native


# Multiple samples

In [16]:
new_code_snippets = [
    # Cloud-native examples
    "public class CloudConfig { public static String getRedisHost() { return System.getenv(\"REDIS_HOST\"); }}", # Example 1
    "public class Service { @Value(\"${db.url}\") private String dbUrl; }",  # Example 2

    # Non-cloud-native examples

    "public class DBConnection { private static String dbUrl = \"jdbc:postgresql://127.0.0.1:5432/testdb\"; }"  # Example 3
]

new_code_embeddings = model.encode(new_code_snippets, convert_to_tensor=True, device=device)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
# Predict classes for new code snippets
classifier.eval()
new_code_predictions = []
with torch.no_grad():
    for embedding in new_code_embeddings:
        prediction = classifier(embedding.unsqueeze(0)).item()
        predicted_class = "cloud_native" if prediction >= 0.5 else "non_cloud_native"
        new_code_predictions.append(predicted_class)

# Print the results
for i, code in enumerate(new_code_snippets):
    print(f"Code Snippet {i+1}: {code}\nClassified as: {new_code_predictions[i]}\n")

Code Snippet 1: public class CloudConfig { public static String getRedisHost() { return System.getenv("REDIS_HOST"); }}
Classified as: non_cloud_native

Code Snippet 2: public class Service { @Value("${db.url}") private String dbUrl; }
Classified as: non_cloud_native

Code Snippet 3: public class DBConnection { private static String dbUrl = "jdbc:postgresql://127.0.0.1:5432/testdb"; }
Classified as: non_cloud_native



# Acknowledgement

* https://blog.gopenai.com/using-embedding-and-classifier-to-identify-cloud-native-code-809187cfdb49