In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [None]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer,TFBertForSequenceClassification

In [None]:
import os

# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Load the complaints CSV and preview its first rows.
complaints_csv = "/kaggle/input/bank-customer-complaint-analysis/complaints.csv"
df1 = pd.read_csv(complaints_csv)
df1.head()

In [None]:
# Give the 'Unnamed: 0' column a descriptive name (modifies df1 in place).
column_renames = {'Unnamed: 0': 'complaint_no'}
df1.rename(columns=column_renames, inplace=True)

In [None]:
# Distinct values of the "product" column, in order of first appearance.
pd.unique(df1["product"])

In [None]:
# Number of rows per product category.
product_column = df1["product"]
product_column.value_counts()

In [None]:
# Bar chart with one bar per value of the "product" column.
sns.countplot(data=df1, x="product")

In [None]:
# Missing-value count for each column.
df1.isna().sum()

In [None]:
# Count fully duplicated rows. This must use df1: the name `df` is only bound
# further down the notebook, so referencing it here fails on a fresh kernel.
df1.duplicated().sum()

In [None]:
# Drop every row that contains at least one missing value (modifies df1 in place).
df1.dropna(axis=0, how="any", inplace=True)

In [None]:
# Preview the last five rows.
df1.tail(n=5)

In [None]:
# Bind `df` to the same DataFrame object as df1 (plain assignment, no copy is made),
# so later changes made through `df` are also visible through `df1`.
df=df1

In [None]:
# Rows per product category in `df`.
df.loc[:, "product"].value_counts()

In [None]:
# Print each product category's share of all rows as a percentage (2 decimals).
product_counts = df["product"].value_counts()
total_count = product_counts.sum()
for category, count in product_counts.items():
    print(f"{category}: {round(count/total_count *100,2)}%")

In [None]:
# Rows per product category, most frequent first.
df["product"].value_counts(sort=True, ascending=False)

In [None]:
# Matches any single character that is not an ASCII letter or digit.
# The range is spelled "A-Za-z" on purpose: "A-z" would also cover the
# punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9]")


def preprocess_text(text):
    """Normalise a string for tokenisation.

    Every character that is not an ASCII letter or digit is replaced by a
    single space (one space per removed character, so runs are not collapsed),
    then the result is lower-cased.

    Args:
        text: The input string.

    Returns:
        The cleaned, lower-cased string.
    """
    return _NON_ALNUM_RE.sub(" ", text).lower()

In [None]:
# Cast the narratives to str, store them back, and derive the cleaned column
# by running preprocess_text on each value.
narrative_text = df['narrative'].astype(str)
df['narrative'] = narrative_text
df['Cleaned_narrative'] = narrative_text.map(preprocess_text)

In [None]:
# Fit a label encoder on the product names and store the integer codes
# in a new "product_label" column.
le = LabelEncoder()
le.fit(df["product"])
df["product_label"] = le.transform(df["product"])

In [None]:
# Split the data into train and test sets (80% / 20%).
# The split is stratified on the label so each product category keeps the same
# proportion in both parts; an unstratified split can under-represent rare classes.
x_train, x_test, y_train, y_test = train_test_split(
    df["Cleaned_narrative"],
    df["product_label"],
    test_size=0.2,
    random_state=42,
    stratify=df["product_label"],
)

In [None]:
# Carve a validation set (20%) out of the training data, stratified on the label
# so class proportions match between the remaining training set and validation set.
# NOTE: this rebinds x_train / y_train, so re-running the cell on its own shrinks
# the training set again; re-run the previous split cell first.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [None]:
# Load the tokenizer for the "bert-base-uncased" checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [None]:
def tokenizer_text(text, tokenizer, max_lenth=128):
    """Tokenise a collection of texts with the given tokenizer.

    Args:
        text: The texts to encode. May be any object exposing ``.tolist()``
            (e.g. a pandas Series or numpy array), any other iterable of
            strings, or a single string (treated as a one-element batch).
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, **options)``.
        max_lenth: Value forwarded as ``max_length``. The (misspelt) parameter
            name is kept so existing keyword callers continue to work.

    Returns:
        Whatever ``tokenizer`` returns when called with ``padding=True``,
        ``truncation=True``, ``max_length=max_lenth`` and ``return_tensors="tf"``.
    """
    if isinstance(text, str):
        # A bare string would otherwise be split into characters by list().
        texts = [text]
    elif hasattr(text, "tolist"):
        texts = text.tolist()
    else:
        texts = list(text)
    return tokenizer(
        texts,
        padding=True,
        max_length=max_lenth,
        truncation=True,
        return_tensors="tf",
    )

In [None]:
# Tokenise the train, test and validation texts (in that order) with the shared tokenizer.
training_tokens, testing_tokens, validation_tokens = (
    tokenizer_text(split_texts, tokenizer)
    for split_texts in (x_train, x_test, x_val)
)

In [None]:
# Display the tokenizer output for the test split as the cell result.
testing_tokens

In [None]:
def _as_dataset(tokens, labels):
    """Pair tokenizer output (converted to a plain dict) with its labels, one example per element."""
    return tf.data.Dataset.from_tensor_slices((dict(tokens), labels))


# Training data is shuffled with a 1000-element buffer and batched by 16;
# test data is batched by 16 and validation data by 32, both unshuffled.
train_dataset = _as_dataset(training_tokens, y_train).shuffle(1000).batch(16)
test_dataset = _as_dataset(testing_tokens, y_test).batch(16)
validation_dataset = _as_dataset(validation_tokens, y_val).batch(32)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights for the classes that actually occur in the training labels.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=train_classes, y=y_train)

# Key each weight by its real class label instead of its position in the array.
# With enumerate(), a class missing from y_train would silently shift every
# following weight onto the wrong label.
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(train_classes, class_weights)
}

In [None]:
# Display the computed per-class weights as the cell result.
class_weights

In [None]:
# Load BERT with a classification head sized to the number of encoded product classes.
num_classes = len(le.classes_)
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)

# Compile the model: Adam optimizer, sparse cross-entropy computed on raw logits
# (labels are integer class ids), and sparse categorical accuracy as the metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
bert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train for 3 epochs with per-class weights, validating after each epoch.
history = bert_model.fit(
    train_dataset,
    epochs=3,
    validation_data=validation_dataset,
    class_weight=class_weights_dict,
)



In [None]:

# Figure 1: training vs. validation loss per epoch.
loss_fig, loss_ax = plt.subplots(figsize=(10, 5))
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='upper right')
plt.show()

# Figure 2 (default size): training vs. validation accuracy per epoch.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history.history['sparse_categorical_accuracy'], label='training accuracy')
acc_ax.plot(history.history['val_sparse_categorical_accuracy'], label='validation accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
plt.show()

In [None]:
#test_loss, test_accuracy = bert_model.evaluate(test_dataset)
#print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

In [None]:
# Evaluate on the held-out test dataset; the result unpacks into loss and accuracy.
evaluation_result = bert_model.evaluate(test_dataset)
test_loss, test_accuracy = evaluation_result
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

In [None]:
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support

# Step 1: Make predictions on the test dataset.
# True labels are read from the dataset itself; test_dataset is not shuffled,
# so they line up with the order of the predictions.
y_true = np.concatenate([labels.numpy() for _, labels in test_dataset])
# One predict() call over the whole dataset instead of one call per batch.
logits = bert_model.predict(test_dataset).logits
y_pred = np.argmax(logits, axis=1)  # index of the max logit = predicted class id

# Step 2: Calculate metrics.
# tf.keras.metrics.Precision / Recall are binary metrics (they threshold the
# predictions), so feeding them multi-class integer ids gives meaningless
# numbers or an error. Use macro-averaged multi-class metrics instead: each
# class contributes equally regardless of how many examples it has.
precision_value, recall_value, f1_value, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    average="macro",
    zero_division=0,  # a class that is never predicted scores 0 instead of warning
)

print(f'Precision: {precision_value}')
print(f'Recall: {recall_value}')
print(f'F1 Score: {f1_value}')


In [None]:
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support

# Step 1: Make predictions on the validation dataset.
# True labels are read from the dataset itself; validation_dataset is not
# shuffled, so they line up with the order of the predictions.
y_true = np.concatenate([labels.numpy() for _, labels in validation_dataset])
# One predict() call over the whole dataset instead of one call per batch.
logits = bert_model.predict(validation_dataset).logits
y_pred = np.argmax(logits, axis=1)  # index of the max logit = predicted class id

# Step 2: Calculate metrics.
# tf.keras.metrics.Precision / Recall are binary metrics (they threshold the
# predictions), so feeding them multi-class integer ids gives meaningless
# numbers or an error. Use macro-averaged multi-class metrics instead: each
# class contributes equally regardless of how many examples it has.
precision_value, recall_value, f1_value, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    average="macro",
    zero_division=0,  # a class that is never predicted scores 0 instead of warning
)

print(f'Precision: {precision_value}')
print(f'Recall: {recall_value}')
print(f'F1 Score: {f1_value}')
