In [None]:
pip install transformers

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report


In [None]:
# Load your data into a DataFrame
# Encodings are tried from strictest to most permissive. 'latin-1' (and its alias
# 'iso-8859-1') maps every possible byte to a character, so it never raises
# UnicodeDecodeError; it has to come after 'cp1252' or 'cp1252' would never be tried.
encodings = ['utf-8', 'cp1252', 'latin-1', 'iso-8859-1']

for encoding in encodings:
    try:
        df = pd.read_csv('Path Of Your Data Set', encoding=encoding)
        # If reading succeeds, break out of the loop
        break
    except UnicodeDecodeError:
        print(f"Failed to read with encoding {encoding}")
else:
    # Every encoding failed: stop here rather than continuing with an undefined
    # (or stale, from a previous run of this cell) df.
    raise ValueError(f"Could not decode the data set with any of: {encodings}")

# Now df contains your DataFrame with the successfully decoded data
# Display a few values from the original data
print("Original Data Sample:")
print(df.head())


In [None]:
# Name of the DataFrame column that holds the raw text to classify.
# The value below is a placeholder: replace it with the real column name in df.
message_column = 'Column Name Contain Text Data'

# Cast every entry of the text column to str so that the lists later handed to
# the tokenizer (list(X_train) / list(X_test)) contain only strings.
# NOTE(review): missing values presumably become the literal string 'nan' after
# this cast — confirm whether such rows should be dropped instead.
df[message_column] = df[message_column].astype(str)


In [None]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable
text_series = df[message_column]
sentiment_series = df['Column Name Contain Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    text_series,
    sentiment_series,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Seed PyTorch's global RNG before anything random happens, so that a fresh
# "Restart & Run All" gives the same result each time. The seed matches the
# random_state used for the train/test split.
# NOTE(review): the classification head, DataLoader shuffling and dropout are all
# expected to draw from this global RNG — confirm for the installed versions.
torch.manual_seed(42)

# Initialize the tokenizer and model
# num_labels is the number of distinct values found in the sentiment column.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(df['Column Name Contain Sentiment'].unique()))

# Tokenize the text data
# Sequences are truncated to at most 100 tokens, padded, and returned as PyTorch tensors.
X_train_encodings = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="pt", max_length=100)
X_test_encodings = tokenizer(list(X_test), truncation=True, padding=True, return_tensors="pt", max_length=100)


In [None]:
# Encode each distinct sentiment value as an integer id (order of first appearance in df)
# Note: y_train / y_test are overwritten here, so re-running this cell without
# re-running the split would map the already-encoded ids a second time.
sentiment_classes = df['Column Name Contain Sentiment'].unique()
label_map = dict(zip(sentiment_classes, range(len(sentiment_classes))))
y_train, y_test = y_train.map(label_map), y_test.map(label_map)


In [None]:
# Pull the token-id tensors out of the tokenizer output
X_train_tensor = X_train_encodings["input_ids"]
X_test_tensor = X_test_encodings["input_ids"]

# Turn the integer-encoded labels into int64 tensors
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)


In [None]:
# Training data: (token ids, label) pairs served in shuffled batches of 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Test data: same batch size, original order kept
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [None]:
# Define optimizer and training parameters
optimizer = AdamW(model.parameters(), lr=1e-5)

# Train the model
model.train()

for epoch in range(1):  # Adjust the number of epochs as needed
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


In [None]:
# Evaluation
model.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        outputs = model(inputs)
        logits = outputs.logits
        predicted = torch.argmax(logits, dim=1)
        predicted_labels.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())


In [None]:
# Generate a classification report
# labels= pins the report to every class id in label_map, so it still works when
# a class is absent from the test split or is never predicted (otherwise the
# number of target_names would not match the classes found in the data).
class_ids = list(label_map.values())
# target_names must be strings; the original label values may be numbers.
unique_labels = [str(label) for label in label_map]
report = classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=unique_labels,
)
print("Classification Report:")
print(report)
