# Importation des modules

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

2023-09-26 16:04:25.209835: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 16:04:25.376892: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 16:04:25.379251: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


# Préparation des données

In [2]:
df = pd.read_csv("dataset_csv_img/Flipkart/flipkart_com-ecommerce_sample_1050.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   uniq_id                  1050 non-null   object 
 1   crawl_timestamp          1050 non-null   object 
 2   product_url              1050 non-null   object 
 3   product_name             1050 non-null   object 
 4   product_category_tree    1050 non-null   object 
 5   pid                      1050 non-null   object 
 6   retail_price             1049 non-null   float64
 7   discounted_price         1049 non-null   float64
 8   image                    1050 non-null   object 
 9   is_FK_Advantage_product  1050 non-null   bool   
 10  description              1050 non-null   object 
 11  product_rating           1050 non-null   object 
 12  overall_rating           1050 non-null   object 
 13  brand                    712 non-null    object 
 14  product_specifications  

In [4]:
df['product_category_tree'] = df['product_category_tree'].apply(lambda x : x.split(" >> ")[0][2:])


In [5]:
df.rename(columns={'product_category_tree': 'category'}, inplace=True)
df['category']

0       Home Furnishing
1             Baby Care
2             Baby Care
3       Home Furnishing
4       Home Furnishing
             ...       
1045          Baby Care
1046          Baby Care
1047          Baby Care
1048          Baby Care
1049          Baby Care
Name: category, Length: 1050, dtype: object

In [6]:
df['description']

0       Key Features of Elegance Polyester Multicolor ...
1       Specifications of Sathiyas Cotton Bath Towel (...
2       Key Features of Eurospa Cotton Terry Face Towe...
3       Key Features of SANTOSH ROYAL FASHION Cotton P...
4       Key Features of Jaipur Print Cotton Floral Kin...
                              ...                        
1045    Oren Empower Extra Large Self Adhesive Sticker...
1046    Wallmantra Large Vinyl Sticker Sticker (Pack o...
1047    Buy Uberlyfe Extra Large Pigmented Polyvinyl F...
1048    Buy Wallmantra Medium Vinyl Sticker Sticker fo...
1049    Buy Uberlyfe Large Vinyl Sticker for Rs.595 on...
Name: description, Length: 1050, dtype: object

In [8]:
# - Encoder les catégories en numériques
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# Tokenisation avec le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True) for text in df['description']]

# Padding des séquences
max_len = max(len(text) for text in tokenized_texts)
input_ids = np.array([text + [0] * (max_len - len(text)) for text in tokenized_texts])

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(input_ids, df['label'], test_size=0.2, random_state=42)

# Créer le modèle BERT
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Compiler le modèle
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Entraîner le modèle
history = model.fit(
    X_train, y_train,
    epochs=3,
    batch_size=32,
    validation_data=(X_test, y_test)
)

# Évaluer le modèle
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred.logits, axis=1)
accuracy = accuracy_score(y_test, y_pred_labels)
report = classification_report(y_test, y_pred_labels, target_names=label_encoder.classes_)

print("Précision du modèle :", accuracy)
print(report)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.52MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 51.5kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 874kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:39<00:00, 11.1MB/s] 
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Précision du modèle : 0.8857142857142857
                            precision    recall  f1-score   support

                 Baby Care       0.71      0.63      0.67        27
  Beauty and Personal Care       0.94      0.76      0.84        21
                 Computers       0.95      1.00      0.97        38
Home Decor & Festive Needs       0.85      0.97      0.91        30
           Home Furnishing       0.86      0.86      0.86        35
          Kitchen & Dining       0.86      0.92      0.89        26
                   Watches       1.00      0.97      0.98        33

                  accuracy                           0.89       210
                 macro avg       0.88      0.87      0.87       210
              weighted avg       0.89      0.89      0.88       210

