In [84]:
import chardet
import pandas as pd
# Detect the encoding of the file
with open("orders.csv", "rb") as f:
    result = chardet.detect(f.read())

# Read the CSV file with the detected encoding
order_data = pd.read_csv("orders.csv", encoding=result['encoding'])

# Display the first few rows of the DataFrame to verify
print(order_data.head())

   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2018-152156   11/8/2018  11/11/2018    Second Class    CG-12520   
1       2  CA-2018-152156   11/8/2018  11/11/2018    Second Class    CG-12520   
2       3  CA-2018-138688   6/12/2018   6/16/2018    Second Class    DV-13045   
3       4  US-2017-108966  10/11/2017  10/18/2017  Standard Class    SO-20335   
4       5  US-2017-108966  10/11/2017  10/18/2017  Standard Class    SO-20335   

     Customer Name    Segment Country/Region             City  ...  \
0      Claire Gute   Consumer  United States        Henderson  ...   
1      Claire Gute   Consumer  United States        Henderson  ...   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   

  Postal Code  Region       Product ID         Category Sub-Category  \
0     42420.0   Sout

In [85]:
import matplotlib.pyplot as plt


# Check for missing values
missing_values = order_data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Check for duplicate rows
duplicate_rows = order_data.duplicated().sum()
print("Number of duplicate rows:", duplicate_rows)

# Optionally, remove duplicate rows
order_data = order_data.drop_duplicates()

# Check data types
data_types = order_data.dtypes
print("Data types of each column:\n", data_types)

# Generate basic statistics
statistics = order_data.describe()
print("Basic statistics of the dataset:\n", statistics)

current_length = len(order_data)
print(f"Current length of the dataset: {current_length}")

order_data=order_data.dropna()

Missing values in each column:
 Row ID             0
Order ID           0
Order Date         0
Ship Date          0
Ship Mode          0
Customer ID        0
Customer Name      0
Segment            0
Country/Region     0
City               0
State              0
Postal Code       11
Region             0
Product ID         0
Category           0
Sub-Category       0
Product Name       0
Sales              0
Quantity           0
Discount           0
Profit             0
dtype: int64
Number of duplicate rows: 0
Data types of each column:
 Row ID              int64
Order ID           object
Order Date         object
Ship Date          object
Ship Mode          object
Customer ID        object
Customer Name      object
Segment            object
Country/Region     object
City               object
State              object
Postal Code       float64
Region             object
Product ID         object
Category           object
Sub-Category       object
Product Name       object
Sales           

In [86]:
current_length = len(order_data)
print(f"Current length of the dataset: {current_length}")

Current length of the dataset: 9983


In [87]:
order_data=order_data[['Segment','Category','Sub-Category','Sales']]

In [88]:
order_data

Unnamed: 0,Segment,Category,Sub-Category,Sales
0,Consumer,Furniture,Bookcases,261.9600
1,Consumer,Furniture,Chairs,731.9400
2,Corporate,Office Supplies,Labels,14.6200
3,Consumer,Furniture,Tables,957.5775
4,Consumer,Office Supplies,Storage,22.3680
...,...,...,...,...
9989,Consumer,Furniture,Furnishings,25.2480
9990,Consumer,Furniture,Furnishings,91.9600
9991,Consumer,Technology,Phones,258.5760
9992,Consumer,Office Supplies,Paper,29.6000


In [89]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer and model from the local directory
tokenizer = BertTokenizer.from_pretrained("./BERT")
model = BertForSequenceClassification.from_pretrained("./BERT")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./BERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
order_data = order_data.head(500)

# Convert your table data to text format
def table_to_text(row):
    return " | ".join([f"{col}: {val}" for col, val in row.items()])

order_data['text'] = order_data.apply(table_to_text, axis=1)
order_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_data['text'] = order_data.apply(table_to_text, axis=1)


Unnamed: 0,Segment,Category,Sub-Category,Sales,text
0,Consumer,Furniture,Bookcases,261.9600,Segment: Consumer | Category: Furniture | Sub-...
1,Consumer,Furniture,Chairs,731.9400,Segment: Consumer | Category: Furniture | Sub-...
2,Corporate,Office Supplies,Labels,14.6200,Segment: Corporate | Category: Office Supplies...
3,Consumer,Furniture,Tables,957.5775,Segment: Consumer | Category: Furniture | Sub-...
4,Consumer,Office Supplies,Storage,22.3680,Segment: Consumer | Category: Office Supplies ...
...,...,...,...,...,...
495,Consumer,Office Supplies,Envelopes,105.4200,Segment: Consumer | Category: Office Supplies ...
496,Consumer,Office Supplies,Binders,119.6160,Segment: Consumer | Category: Office Supplies ...
497,Consumer,Furniture,Furnishings,255.7600,Segment: Consumer | Category: Furniture | Sub-...
498,Consumer,Furniture,Chairs,241.5680,Segment: Consumer | Category: Furniture | Sub-...


In [98]:
order_data[order_data['Sales']==max(order_data['Sales'])]['Segment']

165    Consumer
Name: Segment, dtype: object

In [102]:
def ask_question(question, context):
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return predicted_class

def process_table_in_chunks(question, dataframe, chunk_size=50):
    results = []
    for start in range(0, len(dataframe), chunk_size):
        end = start + chunk_size
        chunk = dataframe.iloc[start:end]
        context = " ".join(chunk['text'].tolist())
        predicted_class = ask_question(question, context)
        results.append(predicted_class)
    return results

# Example usage
question = "What Segment has the highest sum of sales?"
results = process_table_in_chunks(question, order_data, chunk_size=50)

# Aggregate results (this is a simple example, adjust based on your needs)
final_prediction = max(set(results), key=results.count)
print(f"Final predicted class: {final_prediction}")

# Mapping the predicted class to a label
label_map = {0: "Consumer", 1: "Corporate", 2: "Home Office"}  # Adjust this based on your actual labels
predicted_label = label_map.get(final_prediction, "Unknown")
print(f"Final predicted label: {predicted_label}")

Final predicted class: 1
Final predicted label: Corporate


In [100]:
summary

Unnamed: 0,Segment,Sales
0,Consumer,69921.5396
1,Corporate,32323.8468
2,Home Office,27180.9297


In [103]:
summary = order_data.groupby('Segment')['Sales'].sum().reset_index()
summary_text = " | ".join([f"{row['Segment']}: {row['Sales']}" for _, row in summary.iterrows()])

# Function to ask a question using the model
def ask_question(question, context):
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return predicted_class

# Example usage
question = "What Segment has the lowest sales?"
predicted_class = ask_question(question, summary_text)
print(f"Predicted class: {predicted_class}")

# Mapping the predicted class to a label
label_map = {0: "Consumer", 1: "Corporate", 2: "Home Office"}  # Adjust this based on your actual labels
predicted_label = label_map.get(predicted_class, "Unknown")
print(f"Predicted label: {predicted_label}")

Predicted class: 1
Predicted label: Corporate


In [82]:
context

'Segment: Consumer | Category: Furniture | Sub-Category: Bookcases | Sales: 261.96 | text: Segment: Consumer | Category: Furniture | Sub-Category: Bookcases | Sales: 261.96 | text: Segment: Consumer | Category: Furniture | Sub-Category: Bookcases | Sales: 261.96 | text: Segment: Consumer | Category: Furniture | Sub-Category: Bookcases | Sales: 261.96 | text: Segment: Consumer | Category: Furniture | Sub-Category: Bookcases | Sales: 261.96'

In [57]:
outputs.logits

tensor([[ 5.5649, -5.4607]], grad_fn=<AddmmBackward0>)

In [72]:
question = "What segment has the highest sum of sales?"
context = order_data.iloc[0]['text']
predicted_class = ask_question(question, context)
print(f"Predicted class: {predicted_class}")


Predicted class: 0


In [47]:
import os
os.listdir()

['.ipynb_checkpoints',
 'BERT',
 'BERT.ipynb',
 'fine-tuned-tapas',
 'fine_tuned_tabert',
 'fraudTest.csv',
 'fraudTrain.csv',
 'Fraudulent transactions-Bkp02Sept2024.ipynb',
 'Fraudulent transactions-Copy1.ipynb',
 'Fraudulent transactions.ipynb',
 'fraud_detection_nn.pth',
 'fraud_detection_nn_es.pth',
 'LIME_implementation',
 'LIME_implementation.zip',
 'logs',
 'orders.csv',
 'orders_updated.csv',
 'proj2',
 'results',
 'Tapas',
 'Tapas.ipynb',
 'tokenized_data.pt',
 'train_df.csv',
 'train_df_new.csv',
 'Untitled.ipynb']

In [None]:
order_data

In [17]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd

# Step 1: Download and save the model locally
model_name = "bert-base-uncased"


# Convert your table data to text format
def table_to_text(row):
    return " | ".join([f"{col}: {val}" for col, val in row.items()])

order_data['text'] = order_data.apply(table_to_text, axis=1)

# Prepare the dataset for TaBERT
class TableDataset(Dataset):
    def __init__(self, dataframe, tokenizer, questions, answers):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.questions = questions
        self.answers = answers

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        question = self.questions[idx]
        answer = self.answers[idx]
        input_text = f"question: {question} context: {row['text']}"
        inputs = self.tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
        labels = self.tokenizer(answer, return_tensors="pt", padding="max_length", truncation=True).input_ids
        inputs['labels'] = labels
        return inputs

def adjust_list_length(dataframe, questions, answers):
    # Calculate the difference in length
    diff = len(dataframe) - len(questions)
    
    # If questions list is shorter, extend it with the last question
    if diff > 0:
        questions.extend([questions[-1]] * diff)
        answers.extend([answers[-1]] * diff)
    
    return questions, answers

# Example questions and answers
questions = ["What segment has the highest sales?"]
answers = ["Consumer"]

# Adjust the length of questions and answers lists
questions, answers = adjust_list_length(order_data, questions, answers)


# Create instances of the custom dataset
tokenizer = BertTokenizer.from_pretrained("./BERT")
train_dataset = TableDataset(order_data, tokenizer, questions, answers)
eval_dataset = TableDataset(order_data, tokenizer, questions, answers)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
model = BertForSequenceClassification.from_pretrained("./BERT")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-tabert")
tokenizer.save_pretrained("./fine-tuned-tabert")

# Load the fine-tuned model
tokenizer = BertTokenizer.from_pretrained("./fine-tuned-tabert")
model = BertForSequenceClassification.from_pretrained("./fine-tuned-tabert")

# Ask Questions
input_text = "question: What segment has the highest sales? context: " + order_data.iloc[0]['text']
inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
outputs = model(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./BERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: too many values to unpack (expected 2)

In [52]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
order_data=order_data.head(500)
# Convert your table data to text format
def table_to_text(row):
    return " | ".join([f"{col}: {val}" for col, val in row.items()])

# Ensure order_data is defined and loaded

order_data['text'] = order_data.apply(table_to_text, axis=1)

# Prepare the dataset for TaBERT
class TableDataset(Dataset):
    def __init__(self, dataframe, tokenizer, questions, answers):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.questions = questions
        self.answers = answers

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        question = self.questions[idx]
        answer = self.answers[idx]
        input_text = f"question: {question} context: {row['text']}"

        # Tokenize the input text
        inputs = self.tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)

        # Convert the answer to an integer label
        label = torch.tensor(0 if answer == "Consumer" else 1)  # Adjust this logic based on your actual labels

        # Return a dictionary with the expected keys
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'token_type_ids': inputs.get('token_type_ids', None).squeeze() if 'token_type_ids' in inputs else None,
            'labels': label
        }

def adjust_list_length(dataframe, questions, answers):
    diff = len(dataframe) - len(questions)
    if diff > 0:
        questions.extend([questions[-1]] * diff)
        answers.extend([answers[-1]] * diff)
    return questions, answers

# Example questions and answers
questions = ["What segment has the highest sales?"]
answers = ["Consumer"]

# Adjust the length of questions and answers lists
questions, answers = adjust_list_length(order_data, questions, answers)

# Create instances of the custom dataset
tokenizer = BertTokenizer.from_pretrained("./BERT")
train_dataset = TableDataset(order_data, tokenizer, questions, answers)
eval_dataset = TableDataset(order_data, tokenizer, questions, answers)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
model = BertForSequenceClassification.from_pretrained("./BERT")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

try:
    # Train the model
    trainer.train()
    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_tabert")
    tokenizer.save_pretrained("./fine_tuned_tabert")
except Exception as e:
    print(f"An error occurred during training: {e}")

# Load the fine-tuned model
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_tabert")
model = BertForSequenceClassification.from_pretrained("./fine_tuned_tabert")

# Ask Questions
input_text = "question: What segment has the highest sales? context: " + order_data.iloc[0]['text']
inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
outputs = model(**inputs)

# Get the predicted class
predicted_class = torch.argmax(outputs.logits, dim=1).item()
print(f"Predicted class: {predicted_class}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./BERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.0425
1000,0.0001
1500,0.0


Predicted class: 0


In [58]:
# Load the fine-tuned model
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_tabert")
model = BertForSequenceClassification.from_pretrained("./fine_tuned_tabert")

# Ask Questions
input_text = "question: What segment has the highest sales? context: " + order_data.iloc[0]['text']
inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
outputs = model(**inputs)

# Get the predicted class
predicted_class = torch.argmax(outputs.logits, dim=1).item()
print(f"Predicted class: {predicted_class}")

# Map the predicted class to the label
label_map = {0: "Consumer", 1: "Other"}  # Adjust this based on your actual labels
predicted_label = label_map[predicted_class]
print(f"Predicted label: {predicted_label}")

Predicted class: 0
Predicted label: Consumer


In [18]:
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")



Training dataset size: 7995
Evaluation dataset size: 1999
