In [None]:
# Load the raw dataset from disk and preview its first rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

Unnamed: 0,id,file_path,file_size,line_count,extension,language,code,clean_code,clean_line_count,clean_size
0,1,Markdown/000001.md,34784,572,md,Markdown,# Contributing\n\n| Component | Bui...,contributing\n\n component build ...,186,10000
1,2,XML/000002.props,3013,44,props,XML,"﻿<Project ToolsVersion=""15.0"" xmlns=""http://sc...",project toolsversion xmlns\n propertygroup\n ...,44,1812
2,3,Text/000003.txt,1076,21,txt,Text,The MIT License (MIT)\n\nCopyright (c) 2015 Mi...,the mit license mit\n\ncopyright c 2015 micros...,21,1026
3,4,Markdown/000004.md,8105,84,md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...,azure sdk for net\n\npackageshttpsimgshieldsi...,84,7244
4,5,Markdown/000005.md,2763,41,md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...,begin microsoft securitymd v005 block \n\n se...,41,2523


In [None]:
# Preprocessing dataset
# Rows lacking either the cleaned text or the target label cannot be used.
df = df.dropna(subset=['clean_code', 'language'])

# Keep only languages that occur more than once: look up each row's class
# frequency and retain the rows whose class has at least two samples.
class_counts = df['language'].value_counts()
df = df[df['language'].map(class_counts) > 1]

In [None]:
# Features (X), labels (y) and an 80/20 stratified train/test split
from sklearn.model_selection import train_test_split

X, y = df['clean_code'], df['language']

# stratify=y keeps the per-language proportions the same in both partitions;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# TF-IDF Vectorization, Label Encoding, and Tensor Conversion
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

# --- Features: fit the vocabulary on the training texts only, then reuse it for the test texts ---
vectorizer = TfidfVectorizer(max_features=1000)
X_train_features = vectorizer.fit_transform(X_train).toarray()
X_test_features = vectorizer.transform(X_test).toarray()
INPUT_DIM = X_train_features.shape[1]

# --- Labels: map language names to integer class ids (fitted on the training labels) ---
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)
NUM_CLASSES = len(encoder.classes_)

# The feature matrices were already made dense by .toarray() above; these are
# plain aliases that keep the *_dense names available.
X_train_dense, X_test_dense = X_train_features, X_test_features

# --- Tensors: float32 features for the linear layer, int64 class ids for the loss ---
X_train_tensor = torch.tensor(X_train_dense, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

# --- Mini-batch iterator over the training pairs (reshuffled on each pass) ---
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


In [None]:
# Model Definition: Multinomial Logistic Regression
import torch.nn as nn
class PyTorchLogisticRegression(nn.Module):
    """
    Multinomial logistic regression expressed as one fully connected layer.

    The module maps each input feature vector to one raw score (logit) per
    class; no softmax is applied inside the module.
    """

    def __init__(self, input_dim, num_classes):
        """
        Args:
            input_dim: Number of input features per sample.
            num_classes: Number of output classes.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """
        Apply the linear layer to a batch of feature vectors.

        Returns:
            Tensor of logits with shape (batch_size, num_classes).
        """
        return self.linear(x)

# Model Initialization: Multinomial Logistic Regression
# Seed torch's global RNG first so the layer's random weight initialisation
# (and later draws from the same global RNG, such as DataLoader shuffling)
# is repeatable after Restart Kernel -> Run All.
torch.manual_seed(42)
model_pt = PyTorchLogisticRegression(INPUT_DIM, NUM_CLASSES)

# Loss function (cross-entropy) and optimizer definition
criterion = nn.CrossEntropyLoss() # CrossEntropyLoss includes Softmax, so the model must output raw logits
optimizer = torch.optim.Adam(model_pt.parameters(), lr=0.001)

In [None]:
# Training Model: Multinomial Logistic Regression
num_epochs = 10
model_pt.train()

# Training Loop
for epoch in range(num_epochs):
    # Accumulate the loss over the whole epoch. Reporting only the final
    # mini-batch's loss is dominated by batch-to-batch noise and says little
    # about whether training is actually converging.
    epoch_loss_sum = 0.0
    samples_seen = 0

    for features, labels in train_loader:
        # Forward pass
        outputs = model_pt(features)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size;
        # the last batch of an epoch may be smaller than the others.
        batch_size = labels.size(0)
        epoch_loss_sum += loss.item() * batch_size
        samples_seen += batch_size

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    epoch_loss = epoch_loss_sum / max(samples_seen, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Epoch [1/10], Loss: 0.0423
Epoch [2/10], Loss: 1.0477
Epoch [2/10], Loss: 1.0477
Epoch [3/10], Loss: 0.3827
Epoch [3/10], Loss: 0.3827
Epoch [4/10], Loss: 0.1869
Epoch [4/10], Loss: 0.1869
Epoch [5/10], Loss: 0.6076
Epoch [5/10], Loss: 0.6076
Epoch [6/10], Loss: 0.0855
Epoch [6/10], Loss: 0.0855
Epoch [7/10], Loss: 0.0626
Epoch [7/10], Loss: 0.0626
Epoch [8/10], Loss: 0.1061
Epoch [8/10], Loss: 0.1061
Epoch [9/10], Loss: 0.5071
Epoch [9/10], Loss: 0.5071
Epoch [10/10], Loss: 0.6527
Epoch [10/10], Loss: 0.6527


In [None]:
# Model Evaluation: Multinomial Logistic Regression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

model_pt.eval()
with torch.no_grad():
    outputs = model_pt(X_test_tensor) # Logits for every test sample
    # The predicted class is the index of the largest logit. Gradients are
    # already disabled here, so the legacy `.data` attribute is not needed.
    y_pred_pt = outputs.argmax(dim=1)

# Convert predictions back to numpy for scikit-learn metrics
y_pred_np = y_pred_pt.cpu().numpy()

# Calculate metrics and display results (no tensors involved, so this runs outside no_grad)
acc_pt = accuracy_score(y_test_encoded, y_pred_np)
f1_pt = f1_score(y_test_encoded, y_pred_np, average='weighted')
print("\n--- PyTorch Logistic Regression Results ---")
print(f"Accuracy: {acc_pt:.4f}")
print(f"F1-Score: {f1_pt:.4f}")
print("Confusion Matrix (Labels are 0, 1, 2...):\n", confusion_matrix(y_test_encoded, y_pred_np))


--- PyTorch Logistic Regression Results ---
Accuracy: 0.9416
F1-Score: 0.9372
Confusion Matrix (Labels are 0, 1, 2...):
 [[ 9  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  6 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0 64  0]
 [ 0  0  0 ...  0  0 39]]


In [None]:
# Prediction Function for Multinomial Logistic Regression
def predict_multinomial(model, vectorizer, new_texts):
    """
    Predict class indices and class probabilities for new texts.

    Args:
        model: Trained multi-class model that returns one raw score (logit)
            per class for each input row.
        vectorizer: Fitted vectorizer whose ``transform`` output exposes
            ``toarray()`` (e.g. a fitted TfidfVectorizer).
        new_texts: List of text strings to classify. A single string is
            accepted and treated as a one-element list.
    Returns:
        predictions: 1-D tensor of predicted class indices, one per text.
        probabilities: Tensor of shape (n_texts, n_classes); each row is the
            softmax of the model's logits, so it is non-negative and sums to 1.
    """
    # A bare string would otherwise be handed to the vectorizer as if it were
    # a collection of documents; wrap it so it is treated as one document.
    if isinstance(new_texts, str):
        new_texts = [new_texts]

    # Transform new texts using the same vectorizer that built the training features
    X_new = vectorizer.transform(new_texts).toarray()
    X_new_tensor = torch.tensor(X_new, dtype=torch.float32)

    model.eval() # Set model to evaluation mode

    # Make predictions without tracking gradients
    with torch.no_grad():
        logits = model(X_new_tensor)
        # The model emits unnormalised logits; softmax turns them into the
        # probabilities this function is documented to return.
        probabilities = torch.softmax(logits, dim=1)
        # Softmax preserves ordering, so taking argmax on the logits picks the
        # same class while avoiding ties introduced by floating-point underflow.
        predictions = torch.argmax(logits, dim=1)

    return predictions, probabilities

# Note: this function expects a multi-class model trained with a
# cross-entropy loss whose output layer produces one score per class, and the
# same fitted vectorizer that was used to build the training features.

In [None]:
# Example usage of Prediction Function
# NOTE: the sample is drawn from the full frame `df`, so it may be a row the
# model already saw during training; treat this as a smoke test, not an evaluation.
row_series = df[df['language'] == 'Markdown'].iloc[0] # Extracting a sample row
new_texts = [row_series['clean_code']]
preds_tensor, probs = predict_multinomial(model_pt, vectorizer, new_texts)
preds_np = preds_tensor.numpy() # preds_tensor is a torch tensor of class indices
predicted_labels = encoder.inverse_transform(preds_np) # Convert indices back to original labels
# predicted_labels is an array with one entry per input text. Format the single
# label itself: concatenating a str with the array broadcasts over it and
# prints a list-style repr (or fails outright on fixed-width string arrays).
print(f"Predicted Programming Language: {predicted_labels[0]}")
print(probs)

['Predicted Programming Language: Markdown']
tensor([[-13.0162, -12.4387,  -9.7510,  -7.1578,  -6.6710,  -7.4301, -15.5958,
         -10.5104, -12.2967,  -9.4380, -13.2916, -15.0997, -17.1523,  -7.5103,
         -13.0159,  -6.3105, -17.0187, -13.9976,  -5.7426, -14.7558, -14.2164,
         -16.7827, -12.7178, -16.3238,  -4.7670, -12.0553, -18.7029, -16.6741,
          -6.9039, -16.0770, -13.7410, -10.7152,  -6.8575,  -5.7670,  -8.3074,
         -14.4544,  -8.4775, -16.0846, -12.7949,  -7.1392,  -9.9650, -17.4416,
          -5.6903,   1.2377,  -8.3514,  -7.3445, -14.5968,  -3.3790,  -9.0177,
         -10.0762, -16.9868, -16.3285,  -5.2939,  -9.7186,  -8.8361,  -6.4836,
          -7.4861, -15.8865,  -8.8667, -17.3866, -14.9850, -13.5604,  -2.9508,
         -13.0561, -14.2647,  -7.9215, -14.9352,  -3.2596, -14.4701,  -4.8385,
         -15.5534,  -4.6419]])
