In [1]:
pip install transformers



In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [3]:
# Load and preprocess your dataset (replace 'your_dataset.csv' with your actual dataset)
data = pd.read_csv('shoe_sales_dataset.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Product Name  155 non-null    object
 1   Category      155 non-null    object
 2   Brand         155 non-null    object
 3   Price         154 non-null    object
 4   Size          154 non-null    object
dtypes: object(5)
memory usage: 6.2+ KB


In [5]:
print(data.head())

       Product Name   Category          Brand    Price  Size
0     Running Shoes   Athletic           Nike    99.99    10
1   Casual Sneakers     Casual         Adidas    79.99     9
2  Basketball Shoes   Athletic         Jordan   129.99    11
3       Dress Shoes     Formal   Calvin Klein   149.99   8.5
4      Hiking Boots    Outdoor       Columbia    89.99   9.5


In [6]:
print(data.tail())

         Product Name   Category      Brand    Price Size
150     Slip-on Shoes     Casual   Converse    69.99    9
151     Running Shoes   Athletic       Nike    99.99   10
152   Casual Sneakers     Casual     Adidas    79.99    8
153  Basketball Shoes   Athletic     Jordan   129.99   11
154       Dress Shoes     Formal     Calvin      NaN  NaN


In [7]:
# Drop rows with missing values
data.dropna(inplace=True)

In [8]:
data.isnull().sum()

Product Name    0
Category        0
Brand           0
Price           0
Size            0
dtype: int64

In [9]:
data = data.dropna()

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 0 to 153
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Product Name  154 non-null    object
 1   Category      154 non-null    object
 2   Brand         154 non-null    object
 3   Price         154 non-null    object
 4   Size          154 non-null    object
dtypes: object(5)
memory usage: 7.2+ KB


In [11]:
# Define a dictionary mapping old column names to new names
column_name_mapping = {
    'Product Name': 'product_name'
}

In [12]:
# Rename the columns using .rename()
data.rename(columns=column_name_mapping, inplace=True)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 0 to 153
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  154 non-null    object
 1   Category      154 non-null    object
 2   Brand         154 non-null    object
 3   Price         154 non-null    object
 4   Size          154 non-null    object
dtypes: object(5)
memory usage: 7.2+ KB


In [14]:
data['Price'] = pd.to_numeric(data['Price'], errors='coerce')

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 0 to 153
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  154 non-null    object 
 1   Category      154 non-null    object 
 2   Brand         154 non-null    object 
 3   Price         153 non-null    float64
 4   Size          154 non-null    object 
dtypes: float64(1), object(4)
memory usage: 7.2+ KB


In [16]:
# Check for missing values
print(data.isnull().sum())

product_name    0
Category        0
Brand           0
Price           1
Size            0
dtype: int64


In [17]:
data = data.dropna()

In [18]:
# Split the data into features (X) and target (y)
X = data[['product_name', 'Category', 'Brand', 'Size']]
y = data['Price']

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:
# Encode the text features for both training and testing data
X_train_encoded = tokenizer(list(X_train['product_name']), truncation=True, padding=True, return_tensors='pt', max_length=128)
X_test_encoded = tokenizer(list(X_test['product_name']), truncation=True, padding=True, return_tensors='pt', max_length=128)

In [22]:
# Define the BERT model
class BertRegressionModel(nn.Module):
    def __init__(self):
        super(BertRegressionModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.linear(outputs.pooler_output)
        return logits

model = BertRegressionModel()

In [23]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)



In [24]:
# Convert data to PyTorch tensors
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

In [25]:
# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], y_test)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [26]:
# Train the model
model.train()
for epoch in range(10):  # Adjust the number of epochs as needed
    for batch in train_loader:
        input_ids, attention_mask, target = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = nn.MSELoss()(logits.squeeze(), target)
        loss.backward()
        optimizer.step()

In [27]:
# Evaluation on the test set
model.eval()
with torch.no_grad():
    predictions = []
    for batch in test_loader:
        input_ids, attention_mask, _ = batch
        batch_predictions = model(input_ids, attention_mask).squeeze()
        predictions.extend(batch_predictions.tolist())

In [28]:
# Evaluate model performance (e.g., calculate RMSE, MAE, etc.)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

In [30]:
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")

RMSE: 136.87191680727736
MAE: 109.47845100587413
R^2: -1.7755668793628905


In [31]:
import torch

In [32]:
# Function to use the trained model for price comparison
def predict_price(product_name, category, brand, size, model, tokenizer):
    # Prepare input data
    input_text = f"{product_name} {category} {brand} {size}"

    # Encode the input text using the tokenizer
    inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt', max_length=128)

    # Ensure the model is in evaluation mode
    model.eval()

    # Make a prediction
    with torch.no_grad():
        prediction = model(inputs['input_ids'], inputs['attention_mask']).squeeze().item()

    return prediction

In [33]:
# Example usage:
product_name = "Running Shoes"
category = "Athletic"
brand = "Nike"
size = 10


In [34]:
predicted_price = predict_price(product_name, category, brand, size, model, tokenizer)
print(f"Predicted Price: ${predicted_price:.2f}")

Predicted Price: $5.99


In [35]:
import torch

In [38]:
def compare_prices(product_name, category, brand, size, model, tokenizer):
    # Prepare input data
    input_text = f"{product_name} {category} {brand} {size}"

    # Encode the input text using the tokenizer
    inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt', max_length=128)

    # Ensure the model is in evaluation mode
    model.eval()

    # Make a prediction
    with torch.no_grad():
        predicted_price = model(inputs['input_ids'], inputs['attention_mask']).squeeze().item()

    # Example comparison data (you should replace this with real data):
    historical_price = 110
    competitor_prices = {"Adidas": 120, "Reebok": 95}
    retailer_prices = {"Nike Store": 100, "Retailer A": 95, "Retailer B": 105}

    # Create a comparison report
    comparison_report = f"Product: {product_name}\n"
    comparison_report += f"Predicted Price: ${predicted_price:.2f}\n\n"

    # Historical Price Comparison
    comparison_report += "Historical Price Comparison:\n"
    if predicted_price < historical_price:
        comparison_report += "The current price is below the historical average.\n"
    elif predicted_price > historical_price:
        comparison_report += "The current price is above the historical average.\n"
    else:
        comparison_report += "The current price matches the historical average.\n"

    # Competitor Price Comparison
    comparison_report += "Competitor Price Comparison:\n"
    for competitor, price in competitor_prices.items():
        if predicted_price < price:
            comparison_report += f"The product is priced lower than {competitor} ({price}).\n"
        elif predicted_price > price:
            comparison_report += f"The product is priced higher than {competitor} ({price}).\n"
        else:
            comparison_report += f"The product is priced the same as {competitor} ({price}).\n"

    # Retailer Price Comparison
    comparison_report += "Retailer Price Comparison:\n"
    for retailer, price in retailer_prices.items():
        if predicted_price < price:
            comparison_report += f"The product is priced lower than {retailer} ({price}).\n"
        elif predicted_price > price:
            comparison_report += f"The product is priced higher than {retailer} ({price}).\n"
        else:
            comparison_report += f"The product is priced the same as {retailer} ({price}).\n"

    return comparison_report

In [39]:
# Example usage:
product_name = "Running Shoes"
category = "Athletic"
brand = "Nike"
size = 10

In [40]:
comparison_result = compare_prices(product_name, category, brand, size, model, tokenizer)
print(comparison_result)

Product: Running Shoes
Predicted Price: $5.99

Historical Price Comparison:
The current price is below the historical average.
Competitor Price Comparison:
The product is priced lower than Adidas (120).
The product is priced lower than Reebok (95).
Retailer Price Comparison:
The product is priced lower than Nike Store (100).
The product is priced lower than Retailer A (95).
The product is priced lower than Retailer B (105).



In [41]:
# Save the trained model
torch.save(model.state_dict(), 'bert_price_prediction_model.pth')
