In [None]:
# Loading the dataset
import pandas as pd

# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an index
# that was written out when the CSV was saved - confirm whether it should be dropped.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

# Show the first five rows as the cell's output
df.head(n=5)

Unnamed: 0,id,file_path,file_size,line_count,extension,language,code,clean_code,clean_line_count,clean_size
0,1,Markdown/000001.md,34784,572,md,Markdown,# Contributing\n\n| Component | Bui...,contributing\n\n component build ...,186,10000
1,2,XML/000002.props,3013,44,props,XML,"﻿<Project ToolsVersion=""15.0"" xmlns=""http://sc...",project toolsversion xmlns\n propertygroup\n ...,44,1812
2,3,Text/000003.txt,1076,21,txt,Text,The MIT License (MIT)\n\nCopyright (c) 2015 Mi...,the mit license mit\n\ncopyright c 2015 micros...,21,1026
3,4,Markdown/000004.md,8105,84,md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...,azure sdk for net\n\npackageshttpsimgshieldsi...,84,7244
4,5,Markdown/000005.md,2763,41,md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...,begin microsoft securitymd v005 block \n\n se...,41,2523


In [None]:
# Preprocessing dataset
# Keep only the rows where both the model input ('clean_code') and the
# target ('language') are present.
df = df.dropna(subset=['clean_code', 'language'])

# Count how many samples each language has, then keep only the rows whose
# language occurs more than once (singleton classes are discarded).
class_counts = df['language'].value_counts()
df = df[df['language'].map(class_counts) > 1]

In [None]:
# Define Features (X) and Labels (y), then create the train/test split
from sklearn.model_selection import train_test_split

# X: the cleaned source text; y: the language label for each row
X, y = df['clean_code'], df['language']

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF Vectorization, Label Encoding, and Tensor Conversion
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

# --- TF-IDF features -------------------------------------------------------
# The vocabulary is capped at 1000 terms and is learned from the training
# split only; the test split is transformed with that same fitted vectorizer.
# Both matrices are converted to dense NumPy arrays right away via .toarray().
vectorizer = TfidfVectorizer(max_features=1000)
X_train_features = vectorizer.fit_transform(X_train).toarray()
X_test_features = vectorizer.transform(X_test).toarray()
INPUT_DIM = X_train_features.shape[-1]  # number of feature columns

# --- Integer-encoded labels --------------------------------------------------
# The encoder is fitted on the training labels and reused for the test labels.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)
NUM_CLASSES = len(encoder.classes_)

# --- PyTorch tensors ---------------------------------------------------------
# The feature arrays are already dense, so the *_dense names are plain aliases.
X_train_dense, X_test_dense = X_train_features, X_test_features

# Features are stored as float32 and class indices as long (int64).
X_train_tensor = torch.tensor(X_train_dense, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

# --- Batching ----------------------------------------------------------------
# Training pairs are served in shuffled mini-batches of 64.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


In [None]:
# Model Initialization: Random Forest Ensemble Model
from sklearn.ensemble import RandomForestClassifier

# Forest configuration:
#   n_estimators=200  -> number of decision trees in the ensemble
#   max_depth=100     -> upper bound on the depth of each tree
#   n_jobs=-1         -> use all available CPU cores for parallel training
#   random_state=42   -> fixed seed for the forest's randomness
model_rf = RandomForestClassifier(
    max_depth=100,
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)

print("\n--- Training Random Forest Model (Model 4) ---")

# Fit on the dense TF-IDF training matrix with the original (un-encoded)
# language labels, then predict labels for the test matrix. fit() returns the
# fitted estimator, so the two steps are chained.
y_pred_rf = model_rf.fit(X_train_features, y_train).predict(X_test_features)


--- Training Random Forest Model (Model 4) ---


In [None]:
# Model Evaluation: Random Forest Ensemble Model
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Score the test-set predictions against the true language labels.
# The F1 score is averaged across classes using 'weighted' averaging.
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

# Emit the summary and the confusion-matrix heading in a single print call
summary_lines = [
    "\n--- Random Forest Classifier Results ---",
    f"Accuracy: {acc_rf:.4f}",
    f"F1-Score (Weighted): {f1_rf:.4f}",
    "\nConfusion Matrix:",
]
print("\n".join(summary_lines))

# Confusion matrix of true labels (first argument) vs. predicted labels
print(confusion_matrix(y_test, y_pred_rf))


--- Random Forest Classifier Results ---
Accuracy: 0.9599
F1-Score (Weighted): 0.9570

Confusion Matrix:
[[ 8  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  6 ...  0  0  0]
 ...
 [ 0  0  0 ...  1  0  0]
 [ 0  0  0 ...  0 69  0]
 [ 0  0  0 ...  0  0 67]]
