In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing Libraries**

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


**Loading Dataset**

In [3]:
df=pd.read_csv("/content/drive/MyDrive/train.csv")


In [4]:
# Display the number of rows and columns
print("Shape of the dataset:", df.shape)

Shape of the dataset: (82657, 12)


In [8]:
# View Null values
df.isnull().sum()

user_name             19393
country                  35
review_title              0
review_description        0
designation           23647
points                    0
price                  5569
province                 35
region_1              12754
region_2              46708
winery                    0
variety                   0
dtype: int64

In [9]:
df['variety'].value_counts()

Pinot Noir                    10587
Chardonnay                     9403
Cabernet Sauvignon             7552
Red Blend                      7166
Bordeaux-style Red Blend       5497
Riesling                       4148
Sauvignon Blanc                4011
Syrah                          3316
Rosé                           2831
Merlot                         2471
Nebbiolo                       2242
Zinfandel                      2209
Sangiovese                     2165
Malbec                         2119
Portuguese Red                 1969
White Blend                    1896
Sparkling Blend                1739
Tempranillo                    1448
Rhône-style Red Blend          1182
Pinot Gris                     1148
Champagne Blend                1133
Cabernet Franc                 1095
Grüner Veltliner               1055
Portuguese White                896
Pinot Grigio                    873
Bordeaux-style White Blend      850
Gewürztraminer                  840
Gamay                       

**Balancing the dataset by taking 500 sample of each class**

In [6]:
df1 = df.groupby('variety').apply(lambda x: x.sample(500, replace=True) if len(x) > 500 else x)

In [7]:
df1['variety'].value_counts()

Bordeaux-style Red Blend      500
Bordeaux-style White Blend    500
White Blend                   500
Tempranillo                   500
Syrah                         500
Sparkling Blend               500
Sauvignon Blanc               500
Sangiovese                    500
Rosé                          500
Riesling                      500
Rhône-style Red Blend         500
Red Blend                     500
Portuguese White              500
Portuguese Red                500
Pinot Noir                    500
Pinot Gris                    500
Pinot Grigio                  500
Nebbiolo                      500
Merlot                        500
Malbec                        500
Grüner Veltliner              500
Gewürztraminer                500
Gamay                         500
Chardonnay                    500
Champagne Blend               500
Cabernet Sauvignon            500
Cabernet Franc                500
Zinfandel                     500
Name: variety, dtype: int64

**Data preprocessing**

In [8]:
# concatinate required columns
df2=df1['review_title']+ df1['review_description']

In [9]:
# Add label column
final_df=pd.DataFrame({'text':df2,'variety':df1['variety']})

In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
final_df['variety_numeric'] = label_encoder.fit_transform(final_df['variety'])

In [18]:
final_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,variety,variety_numeric
variety,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bordeaux-style Red Blend,75461,Château Franc Pipeau 2011 Saint-ÉmilionThis i...,Bordeaux-style Red Blend,0
Bordeaux-style Red Blend,15086,Château Lanessan 2011 Haut-MédocAn austere wi...,Bordeaux-style Red Blend,0
Bordeaux-style Red Blend,46558,Château Tour Baladoz 2009 Saint-ÉmilionWith c...,Bordeaux-style Red Blend,0
Bordeaux-style Red Blend,3303,Château de Seguin 2015 Cuvée Carpe Diem (Bord...,Bordeaux-style Red Blend,0
Bordeaux-style Red Blend,20698,Château de Roques 2009 Cuvée Mérlateau (Lussa...,Bordeaux-style Red Blend,0


In [12]:
!pip install transformers



In [19]:
# Define the 'text' and 'numeric_variety' columns
texts = final_df['text'].values
labels = final_df['variety_numeric'].values

In [20]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

**Initialize the BERT tokenizer and model**

In [21]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=final_df['variety_numeric'].nunique())

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Tokenize the input text data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

In [23]:
# Create PyTorch DataLoader for the training and testing data
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_labels))
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), torch.tensor(test_labels))

In [24]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [25]:
# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()



**Model Training**

In [26]:
# Train the model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(3):  # You can adjust the number of epochs as needed
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

**Model Evaluation**

In [27]:
# Evaluation on the test set
model.eval()
test_preds = []
test_true_labels = []

for batch in test_loader:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)

    test_preds.extend(preds.cpu().tolist())
    test_true_labels.extend(labels.cpu().tolist())

In [29]:
# Convert numeric labels back to their original form
label_encoder = LabelEncoder()
label_encoder.fit(final_df['variety_numeric'])
test_preds = label_encoder.inverse_transform(test_preds)
test_true_labels = label_encoder.inverse_transform(test_true_labels)


In [30]:
# Create a DataFrame to store the test predictions
predictions_df = pd.DataFrame({'text': test_texts, 'predicted_variety': test_preds, 'true_variety': test_true_labels})


In [31]:
predictions_df

Unnamed: 0,text,predicted_variety,true_variety
0,Jean-Luc and Paul Aegerter 2012 Creux de la Ne...,14,5
1,Domaine de Leyre-Loup 2014 FleurieThis is a r...,6,6
2,Cavipor 2011 Vinhas Altas White (Vinho Verde)T...,16,16
3,Ventana 2007 Gewürztraminer (Arroyo Seco)There...,7,7
4,Wines & Winemakers 2011 Samora Branco White (T...,16,16
...,...,...,...
2795,Westerly 2010 Merlot (Happy Canyon of Santa Ba...,10,10
2796,Rudi Pichler 2006 Weissenkirchner Achleiten Sm...,19,19
2797,Schramsberg 2009 Blanc de Blancs Brut Sparklin...,23,23
2798,Santa Ema 2009 Reserve Barrel Select Syrah (Ca...,24,24


**Performance metrics**

In [35]:
# Assuming 'true_variety' column contains the true labels and 'predicted_variety' contains the predicted labels
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
true_labels = predictions_df['true_variety']
predicted_labels = predictions_df['predicted_variety']

# Calculate the accuracy score
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

# Print the scores
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9743
Precision: 0.9745
Recall: 0.9743
F1 Score: 0.9739


**Customized prediction**

In [33]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Load the pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Example of new text data for prediction
new_texts = [
    "This is a fantastic wine. I highly recommend it.",
    "The taste of this wine is disappointing.",
    "I am not a fan of this wine. Would not buy again."
]

# Tokenize the new text data
new_encodings = tokenizer(new_texts, truncation=True, padding=True, return_tensors='pt')

# Make predictions
with torch.no_grad():
    model.eval()
    inputs = new_encodings['input_ids']
    attention_mask = new_encodings['attention_mask']
    outputs = model(inputs, attention_mask=attention_mask)

logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=1).tolist()

# Convert numeric labels back to their original form
predicted_labels = label_encoder.inverse_transform(predicted_labels)

# Print the predictions
for text, prediction in zip(new_texts, predicted_labels):
    print(f"Text: {text} \nPredicted Variety: {prediction}\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: This is a fantastic wine. I highly recommend it. 
Predicted Variety: 27

Text: The taste of this wine is disappointing. 
Predicted Variety: 27

Text: I am not a fan of this wine. Would not buy again. 
Predicted Variety: 27



**Model saving**

In [None]:
# Save the model and tokenizer to files
model.save_pretrained("/content/bert_model")
tokenizer.save_pretrained("/content/bert_model")

# Save label encoder separately
torch.save(label_encoder, "/content/label_encoder.pkl")