In [1]:
import pandas as pd
import torch
import tensorflow as tf
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import zipfile
with zipfile.ZipFile("dataset_small.zip", "r") as zip_ref:
    zip_ref.extractall("data_folder2")

In [3]:
df = pd.read_csv("data_folder2/structured_cleaned_new_dataset.csv")
df.head()


Unnamed: 0,id,file_path,file_size,line_count,extension,language,code,clean_code,clean_line_count,clean_size
0,1,Markdown/000001.md,34784,572,md,Markdown,# Contributing\n\n| Component | Bui...,contributing\n\n component build ...,186,10000
1,2,XML/000002.props,3013,44,props,XML,"﻿<Project ToolsVersion=""15.0"" xmlns=""http://sc...",project toolsversion xmlns\n propertygroup\n ...,44,1812
2,3,Text/000003.txt,1076,21,txt,Text,The MIT License (MIT)\n\nCopyright (c) 2015 Mi...,the mit license mit\n\ncopyright c 2015 micros...,21,1026
3,4,Markdown/000004.md,8105,84,md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...,azure sdk for net\n\npackageshttpsimgshieldsi...,84,7244
4,5,Markdown/000005.md,2763,41,md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...,begin microsoft securitymd v005 block \n\n se...,41,2523


In [4]:
missing = df["clean_code"].isnull().sum()
print(f"Missing code rows: {missing}")

Missing code rows: 58


In [5]:
df = df.dropna()

In [6]:
min_count = 10  # threshold
counts = df["language"].value_counts()

df["language"] = df["language"].apply(
    lambda x: x if counts[x] >= min_count else "Other"
)

print(df["language"].value_counts())

language
Dart          15345
Rust          14128
C#            10000
Go             9124
JSON           5569
              ...  
Puppet           12
RDoc             11
SourcePawn       11
Pascal           11
Gradle           10
Name: count, Length: 61, dtype: int64


In [7]:
print(df.isnull().sum())

# Check how many rows in total have any missing value
print("Rows with missing values:", df.isnull().any(axis=1).sum())

id                  0
file_path           0
file_size           0
line_count          0
extension           0
language            0
code                0
clean_code          0
clean_line_count    0
clean_size          0
dtype: int64
Rows with missing values: 0


# start implimenting your model (split data intilize the model extra )

In [8]:
pip install transformers datasets

Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install hf_xet

Note: you may need to restart the kernel to use updated packages.


In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["language_encoded"] = label_encoder.fit_transform(df["language"])
num_classes = len(label_encoder.classes_)

In [13]:
from sklearn.model_selection import train_test_split

# Features (X) and labels (y)
X = df["clean_code"].values   # normalized code for ML models
y = df["language_encoded"].values    # target labels encoded

# Step 1: Split into Train (70%) and Temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,          # 30% goes to temp
    random_state=42,         # ensures reproducibility
    stratify=y               # keeps class proportions balanced
)

# Step 2: Split Temp into Validation (15%) and Test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,          # half of 30% = 15%
    random_state=42,
    stratify=y_temp
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

Train size: 60318
Validation size: 12925
Test size: 12926


In [14]:
from datasets import Dataset


train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset   = Dataset.from_dict({"text": X_val, "label": y_val})
test_dataset  = Dataset.from_dict({"text": X_test, "label": y_test})


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset   = val_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

for ds in [train_dataset, val_dataset, test_dataset]:
    ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████████████████████████████████████████████████████████| 60318/60318 [00:44<00:00, 1369.70 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 12925/12925 [00:09<00:00, 1332.02 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 12926/12926 [00:09<00:00, 1318.53 examples/s]


In [16]:
from transformers import AutoModelForSequenceClassification

num_classes = len(set(y))  # or y is our encodded labele y = df["language_encoded"].values
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels=num_classes
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [18]:
from transformers import TrainingArguments
from transformers import Trainer

import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

training_args = TrainingArguments(
    output_dir="./codebert_results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
   
    save_strategy="epoch"
)





Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # ← this is your validation set
    tokenizer=tokenizer
)

  trainer = Trainer(


In [None]:
trainer.train()



Step,Training Loss
10,4.25
20,4.2237
30,4.1869
40,4.1087
50,4.0129
60,3.8895
70,3.5827
80,3.4432
90,3.0911
100,3.0063


In [None]:
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))