In [None]:
pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
pip install tf-keras



In [None]:
import pandas as pd
import numpy as np
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK corpora used below: WordNet (lemmatizer) and the stop-word lists
for nltk_resource in ('wordnet', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Read the complaints CSV and discard the 'Complaint ID' column in one step
df = pd.read_csv("/content/complaints.csv").drop(columns=['Complaint ID'])

# English stop-word set and WordNet lemmatizer shared by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    """Lower-case a complaint, drop English stop words and lemmatize the rest.

    Args:
        text: Raw complaint text. Missing values (None / NaN, which is what
            pandas yields for an empty CSV cell) are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces
        (an empty string when nothing is left).
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash on .lower() and abort the whole .apply()
        if pd.isna(text):
            return ""
        text = str(text)

    tokens = text.lower().split()  # whitespace tokenization on lower-cased text
    return " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

# Clean every complaint element-wise, replacing the raw text column
df['Complaint Text'] = df['Complaint Text'].map(preprocess_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Integer-encode the category and sub-category label columns (one encoder per column).
# NOTE(review): the columns are overwritten in place, so re-running this cell
# re-fits the encoders on already-encoded integers.
label_encoder_cat = LabelEncoder()
label_encoder_sub = LabelEncoder()
df['Complaint Category'] = label_encoder_cat.fit_transform(df['Complaint Category'])
df['Sub-Complaint'] = label_encoder_sub.fit_transform(df['Sub-Complaint'])

# Persist both fitted encoders so predictions can be mapped back to label names later
for encoder_path, fitted_encoder in (
    ('label_encoder_cat.pkl', label_encoder_cat),
    ('label_encoder_sub.pkl', label_encoder_sub),
):
    with open(encoder_path, 'wb') as file:
        pickle.dump(fitted_encoder, file)

# 80/20 train-test split; the text and both label columns are split together
# so the rows stay aligned across the three outputs
X_train, X_test, y_train_cat, y_test_cat, y_train_sub, y_test_sub = train_test_split(
    df['Complaint Text'],
    df['Complaint Category'],
    df['Sub-Complaint'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Load the pretrained sentence-embedding model used to vectorize complaints
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
bert_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Convert text to embeddings using BERT.
# encode() is given the whole list at once so the model can batch the texts
# internally instead of running one forward pass per complaint; with
# convert_to_numpy=True it returns a single 2-D array (one row per text).
X_train_vec = bert_model.encode(list(X_train), convert_to_numpy=True, show_progress_bar=True)
X_test_vec = bert_model.encode(list(X_test), convert_to_numpy=True, show_progress_bar=True)

# Handle class imbalance with SMOTE (training data only; the test set is left untouched)
smote = SMOTE(random_state=42)

# Resample separately per target, since category and sub-complaint have
# different class distributions over the same training embeddings
X_train_vec_smote_cat, y_train_cat_smote = smote.fit_resample(X_train_vec, y_train_cat)
X_train_vec_smote_sub, y_train_sub_smote = smote.fit_resample(X_train_vec, y_train_sub)

In [None]:
# Both random forests share the same hyperparameters
rf_settings = dict(n_estimators=200, class_weight='balanced', max_depth=10, random_state=42)

# Fit one classifier per target on its own SMOTE-resampled training set
category_model = RandomForestClassifier(**rf_settings).fit(X_train_vec_smote_cat, y_train_cat_smote)
sub_complaint_model = RandomForestClassifier(**rf_settings).fit(X_train_vec_smote_sub, y_train_sub_smote)

# Evaluate the category model on the held-out test embeddings
y_pred_cat = category_model.predict(X_test_vec)
category_report = classification_report(y_test_cat, y_pred_cat)
print("\nCategory Classification Report:\n", category_report)

# Evaluate the sub-complaint model on the same test embeddings
y_pred_sub = sub_complaint_model.predict(X_test_vec)
sub_complaint_report = classification_report(y_test_sub, y_pred_sub)
print("\nSub-Complaint Classification Report:\n", sub_complaint_report)

# Pickle both trained models to the working directory
for model_path, trained_model in (
    ('category_model.pkl', category_model),
    ('sub_complaint_model.pkl', sub_complaint_model),
):
    with open(model_path, 'wb') as file:
        pickle.dump(trained_model, file)


# Prediction function
def predict_complaint(complaint_text):
    """Predict the category and sub-complaint labels for one complaint.

    The text is cleaned with ``preprocess_text``, embedded with ``bert_model``
    and passed to both trained classifiers; their integer predictions are
    mapped back to label names with the matching label encoders.

    Args:
        complaint_text: Raw complaint text.

    Returns:
        tuple: ``(category_label, sub_complaint_label)``.
    """
    # The classifiers expect a 2-D batch, so reshape the single embedding to (1, dim)
    features = bert_model.encode(preprocess_text(complaint_text)).reshape(1, -1)

    def _top_label(model, encoder):
        # Predict on the one-row batch and decode the single result
        return encoder.inverse_transform(model.predict(features))[0]

    return (
        _top_label(category_model, label_encoder_cat),
        _top_label(sub_complaint_model, label_encoder_sub),
    )


# Read one complaint from the user and classify it
complaint = input("Enter your complaint: ")
predicted_category, predicted_sub_complaint = predict_complaint(complaint)

# Report both predicted labels
for caption, value in (
    ("\nPredicted Complaint Category:", predicted_category),
    ("Predicted Sub-Complaint:", predicted_sub_complaint),
):
    print(caption, value)


Category Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3424
           1       1.00      1.00      1.00      3372
           2       1.00      1.00      1.00      3332
           3       1.00      1.00      1.00      3261
           4       1.00      1.00      1.00      3206
           5       1.00      1.00      1.00      3405

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000


Sub-Complaint Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1128
           1       1.00      1.00      1.00      1157
           2       1.00      1.00      1.00      1549
           3       1.00      1.00      1.00      1130
           4       1.00      1.00      1.00      1074
           5       1.00      1.00      1.00       649
     

In [None]:
# Show the DataFrame's current column names
print(df.columns)

Index(['Complaint Text', 'Complaint Category', 'Sub-Complaint'], dtype='object')


In [None]:
# Prediction function
def predict_complaint(complaint_text):
    """Classify a single complaint into a category and a sub-complaint.

    Args:
        complaint_text: Raw complaint text; it is cleaned with
            ``preprocess_text`` before being embedded by ``bert_model``.

    Returns:
        tuple: ``(category_label, sub_complaint_label)``, decoded back to
        label names via the fitted label encoders.
    """
    cleaned = preprocess_text(complaint_text)
    embedding = bert_model.encode(cleaned)
    batch = embedding.reshape(1, -1)  # classifiers expect a 2-D (1, dim) input

    # Run each (classifier, encoder) pair on the same one-row batch
    heads = (
        (category_model, label_encoder_cat),
        (sub_complaint_model, label_encoder_sub),
    )
    return tuple(
        encoder.inverse_transform(model.predict(batch))[0]
        for model, encoder in heads
    )


# Prompt for a complaint, classify it, then unpack the two predicted labels
complaint = input("Enter your complaint: ")
prediction = predict_complaint(complaint)
predicted_category, predicted_sub_complaint = prediction

print("\nPredicted Complaint Category:", predicted_category)
print("Predicted Sub-Complaint:", predicted_sub_complaint)

Enter your complaint: ticket collector is charging extra charges 

Predicted Complaint Category: Security Issues
Predicted Sub-Complaint: Overcharging & Non-Availability
