In [1]:
import os
import pandas as pd
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize an empty list to hold data
data = []

# Define the base directory
base_dir = "Тренировочные данные (train)"

# Function to read .docx file
def read_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

# Walk through the base directory and its subfolders
for root, dirs, files in os.walk(base_dir):
    # Get the folder name, which will be used as the class label
    folder_name = os.path.basename(root)
    
    for file in files:
        if file.endswith(".docx"):
            file_path = os.path.join(root, file)
            # Read the text from the .docx file
            text = read_docx(file_path)
            # Append the text and the class (folder name) to the data list
            data.append([file, text, folder_name])

# Create a DataFrame
df = pd.DataFrame(data, columns=['filename', "text", "class"])

# Save the DataFrame to an Excel file for reference
df.to_excel('texts.xlsx', index=False)

In [2]:
# ! pip install transformers datasets scikit-learn torch


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch

# Load the Excel file into a DataFrame
df = pd.read_excel('texts.xlsx')

# Display the first few rows to verify
df.head()


Unnamed: 0,filename,text,class
0,Use Case CF_ Washer rear camera.docx,I-24056\nOC Subsection: [F-3681] New OC Subsec...,Wipers and Washers
1,Use Case CF_ Setting the mode of wipers.docx,I-23461\nOC Subsection: [F-3681] New OC Subsec...,Wipers and Washers
2,Use Case CF_ rain - light sensor malfunction.docx,I-23222\nOC Subsection: [F-3681] New OC Subsec...,Wipers and Washers
3,Use Case CF_ Washer spray usage_ Continuous Wa...,I-23212\nOC Subsection: [F-3681] New OC Subsec...,Wipers and Washers
4,Use Case CF_ Display low wash fluid level war...,I-23214\nOC Subsection: [F-3681] New OC Subsec...,Wipers and Washers


In [4]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'class' column
df['label'] = label_encoder.fit_transform(df['class'])

# Number of unique classes
num_labels = df['label'].nunique()
print(f"Number of classes: {num_labels}")


Number of classes: 3


In [5]:
# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,       # 20% for testing
    random_state=42,
    stratify=df['label'] # Ensure proportional representation of classes
)


In [6]:
train_texts

0     I-24056\nOC Subsection: [F-3681] New OC Subsec...
1     I-23461\nOC Subsection: [F-3681] New OC Subsec...
6     I-24014\nOC Section: [F-7600] New OC Section: ...
3     I-23212\nOC Subsection: [F-3681] New OC Subsec...
15    I-6418\nOC Subsection: [F-3688] New OC Subsect...
12    I-25078\nOC Subsection: [F-3688] New OC Subsec...
8     I-24205\nOC Section: [F-7600] New OC Section: ...
9     I-23569\nOC Subsection: [F-3688] New OC Subsec...
10    I-23572\nOC Subsection: [F-3688] New OC Subsec...
5     I-24013\nOC Section: [F-7600] New OC Section: ...
4     I-23214\nOC Subsection: [F-3681] New OC Subsec...
14    I-23897\nOC Subsection: [F-3688] New OC Subsec...
Name: text, dtype: object

In [7]:
from datasets import Dataset

# Create training and testing datasets
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})
test_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [8]:
from transformers import DistilBertTokenizer

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define a tokenization function
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=512)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 12/12 [00:00<00:00, 297.01 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 487.70 examples/s]


In [9]:
from transformers import DistilBertForSequenceClassification

# Initialize the model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the evaluation metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Set up training arguments with aligned strategies
training_args = TrainingArguments(
    output_dir='./results',                # Output directory
    num_train_epochs=3,                    # Number of training epochs
    per_device_train_batch_size=16,        # Batch size per device during training
    per_device_eval_batch_size=64,         # Batch size for evaluation
    warmup_steps=500,                      # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                     # Strength of weight decay
    logging_dir='./logs',                  # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",           # Evaluate at the end of each epoch
    save_strategy="epoch",                 # Save checkpoint at the end of each epoch
    save_total_limit=2,                    # Limit the total amount of checkpoints
    load_best_model_at_end=True,           # Load the best model at the end
    metric_for_best_model='accuracy',      # Use accuracy to evaluate the best model
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)




In [11]:
# Start training
trainer.train()


  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

print(f"Test set evaluation results:\n{eval_results}")


In [None]:
# Example text
texts = [
    "Your example sentence 1.",
    "Another example sentence for classification."
]

# Tokenize the texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predicted_class_ids = logits.argmax(dim=1).tolist()

# Map the predicted class IDs back to labels
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

for text, label in zip(texts, predicted_labels):
    print(f"Text: {text}\nPredicted class: {label}\n")


In [None]:
# Save the model and tokenizer
model.save_pretrained('fine-tuned-distilbert')
tokenizer.save_pretrained('fine-tuned-distilbert')


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('fine-tuned-distilbert')

# Load the model
model = DistilBertForSequenceClassification.from_pretrained('fine-tuned-distilbert')
