In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# --- 1. Data Loading and Preprocessing ---

# Make sure the NLTK stop word corpus is available; fetch it only when the
# lookup fails so repeated runs do not re-download it.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Load your generated dataset
# IMPORTANT: After generating the data, save it as 'training_disputes.csv' and upload it to your Colab session.
try:
    df = pd.read_csv('training_disputes.csv')
except FileNotFoundError:
    print("Error: 'training_disputes.csv' not found. Please upload the generated training data.")
    # Raise SystemExit rather than calling exit(): exit() is a helper the
    # `site` module provides for interactive shells, so it is not guaranteed
    # to exist or to halt execution in every environment (e.g. notebook
    # kernels), and it reports a success status. Without a hard stop the
    # code below would fail with a confusing NameError on `df`.
    raise SystemExit(1)

print("Dataset loaded successfully. Shape:", df.shape)
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Text preprocessing helpers, built once and reused by every preprocess_text call
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a raw description and reduce it to space-separated stemmed tokens.

    Args:
        text: The raw description. Non-string values are converted with
            ``str()``; missing values (``None``, NaN, ``pd.NA``) are treated
            as empty text.

    Returns:
        The lowercased, stemmed tokens joined by single spaces, with English
        stop words removed. An empty string when nothing usable remains.
    """
    # A missing description would otherwise be stringified to "nan"/"None"
    # and end up as a bogus token in the vocabulary.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    # Every maximal run of word characters is a token; anything else
    # (punctuation, whitespace) only acts as a separator.
    words = (word.lower() for word in re.findall(r'\w+', str(text)))
    return " ".join(stemmer.stem(word) for word in words if word not in stop_words)

# Derive the cleaned text column from the raw 'description' column
df['processed_description'] = df['description'].map(preprocess_text)

# Show the first record before and after cleaning as a quick sanity check
print("\nPreprocessing complete. Example:")
first_raw = df['description'].iloc[0]
print(f"Original: {first_raw}")
first_clean = df['processed_description'].iloc[0]
print(f"Processed: {first_clean}")


# --- 2. Feature Engineering and Label Encoding ---

# These engineered columns are read straight from the loaded dataset;
# the list records which of them are fed to the model alongside the text.
engineered_features = [
    'is_verified_duplicate',
    'is_verified_failed',
    'contains_fraud_keyword',
    'contains_refund_keyword',
    'contains_duplicate_keyword',
]

# Fit the encoder on 'true_category', then store the integer class ids
label_encoder = LabelEncoder().fit(df['true_category'])
df['category_encoded'] = label_encoder.transform(df['true_category'])

# Model inputs (text + engineered columns) and the encoded target
X_text = df['processed_description']
X_engineered = df.loc[:, engineered_features]
y = df['category_encoded']


# --- 3. Model Training ---

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions.
(
    X_train_text, X_test_text,
    X_train_eng, X_test_eng,
    y_train, y_test,
) = train_test_split(
    X_text, X_engineered, y,
    test_size=0.2, random_state=42, stratify=y,
)

# Learn the TF-IDF vocabulary (capped at 500 terms) from the training text
# only, then reuse it unchanged for the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)


def _combine_features(tfidf_matrix, engineered_frame):
    """Return one dense frame: TF-IDF columns followed by the engineered columns."""
    dense_tfidf = pd.DataFrame(tfidf_matrix.toarray())
    # Drop the original row labels so both parts line up by position.
    positional_eng = engineered_frame.reset_index(drop=True)
    combined = pd.concat([dense_tfidf, positional_eng], axis=1)
    # TF-IDF columns are labelled with integers; cast all labels to str so
    # the frame has uniformly typed column names.
    combined.columns = combined.columns.astype(str)
    return combined


X_train_combined = _combine_features(X_train_tfidf, X_train_eng)
X_test_combined = _combine_features(X_test_tfidf, X_test_eng)


# Fit the Logistic Regression classifier on the combined training features
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_combined, y_train)
print("\nModel training complete.")


# --- 4. Model Evaluation ---

# Predict class ids and per-class probabilities for the held-out test set
y_pred = model.predict(X_test_combined)
y_pred_proba = model.predict_proba(X_test_combined)  # not used further in this section

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# Pass the full list of encoded class ids explicitly. Without `labels`, the
# report derives them from y_test/y_pred and raises when a rare class is
# absent from both, because the count no longer matches `target_names`.
class_ids = list(range(len(label_encoder.classes_)))
report = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)


# --- 5. Saving Model and Artifacts ---

# Persist everything needed to reproduce predictions later: the classifier,
# the fitted vectorizer, the label encoder, and the engineered feature names.
artifacts = (
    ('dispute_classifier_model.pkl', model),
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('label_encoder.pkl', label_encoder),
    ('engineered_features.pkl', engineered_features),
)
for artifact_path, artifact in artifacts:
    joblib.dump(artifact, artifact_path)

print("\nModel, TF-IDF vectorizer, label encoder, and feature list have been saved successfully.")
print("Files created: dispute_classifier_model.pkl, tfidf_vectorizer.pkl, label_encoder.pkl, engineered_features.pkl")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset loaded successfully. Shape: (967, 8)

First 5 rows of the dataset:
  dispute_id                                        description  \
0      D1049  Was charged again for the same order, looks li...   
1      D1754  emi card was charged twice this month (not sur...   
2      D1789  Chargebacckr equested as I on't rrecognize thi...   
3      D1733  EMI card was charged twice this month (not sur...   
4      D1403  failed transaction but money taken out of my s...   

   is_verified_duplicate  is_verified_failed  contains_fraud_keyword  \
0                      1                   0                       0   
1                      0                   0                       0   
2                      0                   0                       0   
3                      0                   0                       0   
4                      0                   1                       0   

   contains_refund_keyword  contains_duplicate_keyword       true_category  
0           