In [1]:
import pandas as pd

# List of file paths
file_paths = ['/content/drive/MyDrive/archive (2)/pricerunner_aggregate.csv',
              '/content/drive/MyDrive/archive (2)/shopmania.csv',
              '/content/drive/MyDrive/archive (2)/skroutz_aggregate.csv']



# Initialize an empty DataFrame to store the concatenated data
concatenated_data = pd.DataFrame()

# Iterate over the file paths
for file_path in file_paths:
    # Read the CSV file without header
    data = pd.read_csv(file_path, header=None)

    # Concatenate the data vertically
    concatenated_data = pd.concat([concatenated_data, data], axis=0)

# Reset the index of the concatenated data
concatenated_data.reset_index(drop=True, inplace=True)

# Save the concatenated data to a new CSV file
concatenated_data.to_csv('merged_file.csv', index=False, header=False)
#remove unwanted columns
concatenated_data = concatenated_data.drop([0,2,3,4,5],axis=1)

In [2]:
len(concatenated_data)
#concatenated_data.head()
data = concatenated_data.dropna()
df = pd.DataFrame()
df['TITLE'] = data[1]
df['CATEGORY'] = data[6]

In [3]:
df.head(),len(df
              )

(                                               TITLE       CATEGORY
 0                    apple iphone 8 plus 64gb silver  Mobile Phones
 1                apple iphone 8 plus 64 gb spacegrau  Mobile Phones
 2  apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...  Mobile Phones
 3                apple iphone 8 plus 64gb space grey  Mobile Phones
 4  apple iphone 8 plus gold 5.5 64gb 4g unlocked ...  Mobile Phones,
 273481)

In [4]:
!pip install transformers



In [5]:
!pip install torch



In [6]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [7]:

encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df['ENCODE_CAT'] = df['CATEGORY'].apply(lambda x: encode_cat(x))

In [64]:
df.head()

Unnamed: 0,TITLE,CATEGORY,ENCODE_CAT
0,apple iphone 8 plus 64gb silver,Mobile Phones,0
1,apple iphone 8 plus 64 gb spacegrau,Mobile Phones,0
2,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,Mobile Phones,0
3,apple iphone 8 plus 64gb space grey,Mobile Phones,0
4,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,Mobile Phones,0


In [65]:
print
(df['CATEGORY'],df['ENCODE_CAT'])

(0                Mobile Phones
 1                Mobile Phones
 2                Mobile Phones
 3                Mobile Phones
 4                Mobile Phones
                   ...         
 587182    Car & Boat Batteries
 587183    Car & Boat Batteries
 587184    Car & Boat Batteries
 587185    Car & Boat Batteries
 587186    Car & Boat Batteries
 Name: CATEGORY, Length: 273481, dtype: object,
 0          0
 1          0
 2          0
 3          0
 4          0
           ..
 587182    20
 587183    20
 587184    20
 587185    20
 587186    20
 Name: ENCODE_CAT, Length: 273481, dtype: int64)

In [8]:

# Defining some key variables that will be used later on in the training
MAX_LEN = 100
TRAIN_BATCH_SIZE = 40
VALID_BATCH_SIZE = 10
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [9]:

class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        title = str(self.data.TITLE[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)

        }

    def __len__(self):
        return self.len

In [10]:

train_size = 0.95
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (273481, 3)
TRAIN Dataset: (259807, 3)
TEST Dataset: (13674, 3)


In [19]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [11]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768,21)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [12]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [13]:
model = DistillBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [14]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [15]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [16]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [17]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%50==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [20]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 3.0506198406219482
Training Accuracy per 5000 steps: 7.5
Training Loss per 5000 steps: 2.548878099404129
Training Accuracy per 5000 steps: 30.931372549019606
Training Loss per 5000 steps: 2.277808984907547
Training Accuracy per 5000 steps: 32.84653465346535
Training Loss per 5000 steps: 2.134332116865954
Training Accuracy per 5000 steps: 34.98344370860927
Training Loss per 5000 steps: 2.014726413423149
Training Accuracy per 5000 steps: 37.08955223880597
Training Loss per 5000 steps: 1.9140164571929263
Training Accuracy per 5000 steps: 39.78087649402391
Training Loss per 5000 steps: 1.8298999617662144
Training Accuracy per 5000 steps: 42.159468438538205
Training Loss per 5000 steps: 1.7446175185363857
Training Accuracy per 5000 steps: 44.643874643874646
Training Loss per 5000 steps: 1.6815407933142417
Training Accuracy per 5000 steps: 46.583541147132166
Training Loss per 5000 steps: 1.622816718074012
Training Accuracy per 5000 steps: 48.3370288248337
Traini

In [21]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [22]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 0.27727416157722473
Validation Accuracy per 100 steps: 90.0
Validation Loss Epoch: 0.21450550303445878
Validation Accuracy Epoch: 91.61181804885183
Accuracy on test data = 91.61%


In [23]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        -1.3489e-04,  4.0779e-04, -3.2891e-05, -4.9643e-04, -9.0442e-05,
         3.3139e-05,  8.5759e-05, -1.7608e-04,  9.1023e-05, -3.6761e-04,
         1.4486e-04,  2.6833e-04, -3.8997e-04, -1.8572e-05, -2.4841e-04,
        -1.8619e-04, -2.7872e-04,  8.2077e-06, -5.4788e-04, -4.0090e-04,
         3.1313e-04, -2.0577e-04,  2.4990e-05,  4.6428e-04,  2.2349e-04,
         1.5057e-04, -1.5134e-04, -4.2363e-04,  6.3106e-05, -3.1690e-04,
         5.4218e-04,  1.8888e-05,  3.1226e-04,  1.0959e-04, -5.2941e-04,
         2.6760e-04, -7.8626e-04,  2.5565e-04, -6.8472e-04,  1.1015e-04,
        -3.1134e-04,  1.2216e-04,  3.3313e-04,  1.9729e-04,  4.8099e-05,
         1.9223e-04, -1.9678e-04,  3.6404e-04,  7.4586e-04, -3.9480e-06,
         1.4232e-03, -1.3540e-03,  7.5001e-04, -4.4505e-04,  5.6091e-04,
        -1.3139e-04, -1.1074e-03,  5.6495e-04,  9.1395e-04, -1.7250e-05,
         7.2879e-04, -5.8475e-04, -3.5812e-05,  4.4865e-04,

In [24]:
import os

output_model_file = '/content/drive/MyDrive/archive (2)/models/pytorch_distilbert_news.bin'
output_vocab_file = '/content/drive/MyDrive/archive (2)//models/vocab_distilbert_news.bin'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(output_model_file), exist_ok=True)

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


In [66]:
input_text = "Infinix SMART 7 (Night Black, 64 GB)  (4 GB RAM)"

# Tokenize the input
tokens = tokenizer.encode_plus(
    input_text,
    max_length=128,  # Adjust the maximum sequence length as needed
    truncation=True,
    padding='max_length',
    return_tensors='pt'  # Return PyTorch tensors
)

In [67]:

# Prepare the input tensors
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']



In [68]:
# Specify the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device
model = model.to(device)

# Move the input tensors to the device
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# Feed the input to the model
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

In [69]:
# Process the model output
logits = outputs[0]

# Apply softmax to obtain probabilities
predictions = torch.nn.functional.softmax(logits)

# Interpret the predictions
predicted_class = torch.argmax(predictions).item()

# Print the predicted class
print("Predicted class:", predicted_class)

Predicted class: 0


  predictions = torch.nn.functional.softmax(logits)


In [70]:
# Create a dictionary to map encoded categories to class names
category_map = dict(zip(df['ENCODE_CAT'], df['CATEGORY']))

In [78]:
# Get the class name for the predicted class
#predicted_class_name = category_map[predicted_class]
# Get the class name for the predicted class
predicted_class_name = category_map[predicted_class]

In [79]:
print("Predicted class:",predicted_class_name )

Predicted class: Mobile Phones
