In [None]:
# ==============================================================================
# SECTION 1: SETUP AND INSTALLATIONS
# ==============================================================================
import pandas as pd
import numpy as np
import re
import nltk
import os

from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

print("✅ All libraries imported.")

# Download necessary NLTK data (only needs to be run once):
# the stopword list, WordNet, and the Open Multilingual Wordnet ('omw-1.4').
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
print("✅ NLTK components downloaded.")

In [None]:
# ==============================================================================
# SECTION 2: DATA LOADING AND PREPARATION
# ==============================================================================
# To access files from Google Drive, you must first mount it.
# This will open an authentication pop-up. Follow the steps to grant access.
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# Full path to the CSV file on Google Drive.
# The path starts with '/content/drive/My Drive/' and then follows the
# folder structure inside the drive.
DATA_PATH = '/content/drive/My Drive/Dataset/consumer-complaints.csv'

# Only these two columns are needed for the classification task.
REQUIRED_COLUMNS = ['product', 'consumer_complaint_narrative']

# Upper bound on the number of complaints kept for the experiment.
SAMPLE_SIZE = 10000

# Load the dataset. `usecols` makes pandas parse just the required columns
# instead of the whole file; rows with a missing value in either column are
# dropped.
try:
    df = pd.read_csv(DATA_PATH, usecols=REQUIRED_COLUMNS)[REQUIRED_COLUMNS].dropna()
except FileNotFoundError:
    print(f"Error: The file at '{DATA_PATH}' was not found. Please check the path and try again.")
    # Re-raise rather than calling exit(): the failure then surfaces as an
    # ordinary exception instead of terminating the interpreter session.
    raise
else:
    print("Dataset loaded successfully.")

# Keep only the two product types that form the two classes of the binary
# classification problem.
df_useful = df[df['product'].isin(['Debt collection', 'Mortgage'])]

# Take a reproducible random sample (random_state=42) to keep processing
# fast. The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement).
sample_size = min(SAMPLE_SIZE, len(df_useful))
df_useful = df_useful.sample(n=sample_size, random_state=42)

# Rename the narrative column to 'text' and build the target column.
# The target is called 'sentiment' because the rest of the notebook uses that
# name, but it encodes the product: 1 = 'Debt collection', 0 = 'Mortgage'.
df_useful = df_useful.rename(columns={'consumer_complaint_narrative': 'text'})
df_useful['sentiment'] = np.where(df_useful['product'] == 'Debt collection', 1, 0)

# Keep only the text and target columns. The explicit copy makes the result
# an independent frame, so columns can be added to it later without acting on
# a selection derived from another frame.
df_useful = df_useful[['text', 'sentiment']].copy()

# The preprocessing and feature-extraction cells read the prepared data under
# the name `df_sample`; bind it to the same frame so those cells can run.
df_sample = df_useful

# Show the first rows to confirm the structure and the new 'sentiment' column.
print("\n--- Processed Dataset Head ---")
print(df_useful.head())

# Check the distribution of the target to see whether the classes are
# balanced or whether class-imbalance handling might be needed.
print("\n--- Data Distribution ---")
print(df_useful['sentiment'].value_counts())

In [None]:
# ==============================================================================
# SECTION 3: TEXT PREPROCESSING
# ==============================================================================
# Shared NLTK helpers: a WordNet lemmatizer and the English stopword set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Turn a raw text string into a space-separated string of lemmas.

    The text is stripped of anything matching an HTML-tag pattern, then of
    every character that is not an ASCII letter or whitespace, lower-cased and
    split on whitespace. Tokens that are stopwords or have two characters or
    fewer are dropped; the rest are lemmatized and re-joined with spaces.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)

    kept_lemmas = []
    for token in letters_only.lower().split():
        # Skip very short tokens and stopwords before lemmatizing.
        if len(token) <= 2 or token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

print("\nPreprocessing data... (This may take a minute)")
# Clean every complaint text and store the result in a new column.
cleaned_column = df_sample['text'].apply(preprocess_text)
df_sample['processed_text'] = cleaned_column
print("✅ Preprocessing complete.")

# Show the first record before and after cleaning as a sanity check.
first_original = df_sample['text'].iloc[0]
first_processed = df_sample['processed_text'].iloc[0]
print("\n--- Original vs. Processed Text ---")
print("Original:", first_original)
print("Processed:", first_processed)

In [None]:
# ==============================================================================
# SECTION 4: RULE-BASED COMPONENT
# ==============================================================================
# List of strong negative keywords and phrases indicating a complaint/issue.
# Matching is by lower-cased substring, so 'fraud' also fires on 'fraudulent'.
NEGATIVE_KEYWORDS = [
    'fraud', 'incorrect', 'scam', 'unauthorized', 'stolen', 'never received',
    'not resolved', 'false information', 'dispute', 'complaint', 'inaccurate',
    'wrong', 'thief', 'damage', 'violation'
]

def predict_with_rules(text):
    """
    Applies a set of predefined keyword rules to a raw text.

    The text is lower-cased and runs of whitespace (spaces, tabs, newlines)
    are collapsed to a single space before matching, so multi-word phrases
    such as 'never received' are still found when the words are separated by
    a line break or several spaces.

    Returns 1 if any entry of NEGATIVE_KEYWORDS, or the phrases
    'not accurate' / 'not correct', occurs in the text; otherwise -1
    ('unknown'). Non-string input (e.g. None or NaN) cannot trigger a rule
    and also yields -1.
    """
    if not isinstance(text, str):
        return -1

    # Collapse whitespace so phrase rules are insensitive to line wrapping.
    normalized = " ".join(text.lower().split())

    if any(keyword in normalized for keyword in NEGATIVE_KEYWORDS):
        return 1
    if "not accurate" in normalized or "not correct" in normalized:
        return 1
    return -1 # Use -1 to signify 'unknown'

print("\n✅ Rule-based component defined.")

In [None]:
# ==============================================================================
# SECTION 5: FEATURE EXTRACTION & DATA SPLITTING
# ==============================================================================
# Features and target taken from the prepared frame:
#   X_raw       - original text (input to the rule-based component)
#   X_processed - cleaned text (input to the ML models)
#   y           - class label
X_raw, X_processed, y = (
    df_sample[column] for column in ('text', 'processed_text', 'sentiment')
)

# A single call splits all three aligned series 75/25, stratified on y, so
# the raw text, processed text and labels stay row-for-row consistent.
(X_train_raw, X_test_raw,
 X_train_processed, X_test_processed,
 y_train, y_test) = train_test_split(
    X_raw, X_processed, y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"\nTraining data shape: {X_train_processed.shape}")
print(f"Testing data shape: {X_test_processed.shape}")

# The TF-IDF vocabulary (capped at 5000 features) is fitted on the training
# split only; the test split is transformed with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_processed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_processed)

print("✅ TF-IDF vectorization complete.")

In [None]:
# ==============================================================================
# SECTION 6: TRAINING MACHINE LEARNING MODELS
# ==============================================================================
print("\nTraining ML models...")

# The three classifiers compared in this notebook.
lr_model = LogisticRegression(random_state=42, max_iter=1000)  # 1. Logistic Regression
nb_model = MultinomialNB()                                     # 2. Multinomial Naive Bayes
svm_model = LinearSVC(random_state=42)                         # 3. Linear SVM

# Fit each one on the same TF-IDF training matrix, reporting as we go.
for classifier, done_message in (
    (lr_model, "✅ Logistic Regression trained."),
    (nb_model, "✅ Naive Bayes trained."),
    (svm_model, "✅ Linear SVM trained."),
):
    classifier.fit(X_train_tfidf, y_train)
    print(done_message)

In [None]:
# ==============================================================================
# SECTION 7: IMPLEMENTING AND TESTING THE HYBRID MODEL
# ==============================================================================
print("\nImplementing the hybrid model logic...")

def predict_hybrid(raw_texts, tfidf_vectors, ml_model):
    """
    Combines rule-based predictions with an ML model.

    Each raw text is first passed to predict_with_rules. Where a rule fires
    (result other than -1) that result is the prediction. All remaining rows
    are sent to `ml_model.predict` together, in one batched call, using the
    matching rows of `tfidf_vectors`.

    Parameters:
        raw_texts: iterable of raw text strings, in the same row order as
            `tfidf_vectors`.
        tfidf_vectors: feature matrix supporting row selection with a list of
            integer positions (e.g. the matrix produced by the TF-IDF
            vectorizer).
        ml_model: fitted estimator exposing `predict`.

    Returns:
        np.ndarray with one prediction per input text.
    """
    # 1. Apply the rule-based system to every raw text.
    hybrid_predictions = [predict_with_rules(text) for text in raw_texts]

    # 2. Positions where no rule fired fall back to the ML model.
    undecided = [
        position
        for position, rule_pred in enumerate(hybrid_predictions)
        if rule_pred == -1
    ]

    # Predict all undecided rows at once instead of one call per row; skip
    # the call entirely when every row was decided by a rule.
    if undecided:
        ml_predictions = ml_model.predict(tfidf_vectors[undecided])
        for position, ml_pred in zip(undecided, ml_predictions):
            hybrid_predictions[position] = ml_pred

    return np.array(hybrid_predictions)

# Test-set predictions of the three standalone models, in the order
# Logistic Regression, Naive Bayes, SVM.
lr_preds, nb_preds, svm_preds = (
    fitted_model.predict(X_test_tfidf)
    for fitted_model in (lr_model, nb_model, svm_model)
)

# Hybrid predictions: keyword rules on the raw text, SVM as the ML component.
hybrid_preds = predict_hybrid(X_test_raw, X_test_tfidf, svm_model)

print("✅ All model predictions are ready.")

In [None]:
# ==============================================================================
# SECTION 8: EVALUATION AND COMPARISON
# ==============================================================================
# Display names for labels 0 and 1 in the reports. In the data-preparation
# cell, label 1 is assigned to 'Debt collection' rows and 0 to 'Mortgage' rows.
target_names = ['Class 0 (Positive)', 'Class 1 (Negative)']

print("\n\n--- MODEL PERFORMANCE COMPARISON ---")

# Per-model classification report (precision / recall / F1 per class).
for heading, predictions in (
    ("\n--- 1. Logistic Regression ---", lr_preds),
    ("\n--- 2. Naive Bayes ---", nb_preds),
    ("\n--- 3. SVM (Standalone) ---", svm_preds),
    ("\n--- 4. HYBRID MODEL (Rules + SVM) --- 🏆", hybrid_preds),
):
    print(heading)
    print(classification_report(y_test, predictions, target_names=target_names))

# Side-by-side accuracy of all four approaches; labels are padded so the
# numbers line up in one column.
print("\n--- Summary of Accuracy Scores ---")
for label, predictions in (
    ("Logistic Regression Accuracy: ", lr_preds),
    ("Naive Bayes Accuracy:         ", nb_preds),
    ("SVM Accuracy:                 ", svm_preds),
    ("Hybrid Model Accuracy:        ", hybrid_preds),
):
    print(f"{label}{accuracy_score(y_test, predictions):.4f}")

In [None]:
# ==============================================================================
# SECTION 9: PREDICTION ON NEW, UNSEEN DATA
# ==============================================================================

# Ensure all your models (lr_model, nb_model, svm_model) and the
# tfidf_vectorizer are already trained and available in the environment.

def predict_new_review(text):
    """
    Print the prediction of every trained model for one new text string.

    The text is cleaned with preprocess_text and vectorized with the already
    fitted `tfidf_vectorizer`; Logistic Regression, Naive Bayes and SVM then
    predict from that vector. The hybrid prediction uses the keyword rules on
    the original text and falls back to the SVM result when no rule fires.
    Nothing is returned; results are printed.
    """
    print(f"--- Analyzing Review: '{text}' ---")

    # Same cleaning as at training time, then transform only (never re-fit
    # the vectorizer on new data).
    features = tfidf_vectorizer.transform([preprocess_text(text)])

    # Predictions of the standalone ML models.
    lr_pred = lr_model.predict(features)[0]
    nb_pred = nb_model.predict(features)[0]
    svm_pred = svm_model.predict(features)[0]

    # Hybrid: rules are checked on the original raw text; -1 means no rule
    # fired, in which case the SVM prediction computed above is reused.
    rule_result = predict_with_rules(text)
    hybrid_pred = svm_pred if rule_result == -1 else rule_result

    # Readable names for the numeric labels used in this notebook.
    sentiment_map = {1: 'Negative 😡', 0: 'Positive ✅'}

    print(f"Logistic Regression Prediction: {sentiment_map[lr_pred]}")
    print(f"Naive Bayes Prediction:         {sentiment_map[nb_pred]}")
    print(f"SVM Prediction:                 {sentiment_map[svm_pred]}")
    print(f"Hybrid Model Prediction:      {sentiment_map[hybrid_pred]}")
    print("-" * 40)


# ==============================================================================
# NOW, LET'S TEST IT WITH YOUR OWN SENTENCES!
# ==============================================================================

# Example 1: A clearly negative complaint
new_complaint_1 = "There is an incorrect and fraudulent charge on my credit report from a company I have never heard of. This is damaging my score."

# Example 2: A text that should be classified as positive in our context (about a mortgage)
new_complaint_2 = "My application for the mortgage went through smoothly and the agent was very helpful."

# Example 3: A tricky case without strong negative keywords for the rule-based system
new_complaint_3 = "The information listed on my account is not up to date and it's causing a lot of problems with my financial planning."

# Run every example through all models, in order.
for sample_text in (new_complaint_1, new_complaint_2, new_complaint_3):
    predict_new_review(sample_text)