In [9]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [10]:
# Constants
STOPWORDS = set(stopwords.words('english'))

In [11]:
# Function to load dataset
def load_data(file_path):
    return pd.read_csv(file_path, encoding='ISO-8859-1')

In [12]:
# Function for text preprocessing
def preprocess_claims(claims):
    lemmatizer = WordNetLemmatizer()
    preprocessed_claims = []
    for claim in claims:
        # Remove non-alphanumeric characters
        claim = re.sub(r'[^a-zA-Z0-9]', ' ', claim)
        # Lowercase
        claim = claim.lower()
        # Tokenization and Lemmatization
        claim = ' '.join(lemmatizer.lemmatize(word) for word in claim.split() if word not in STOPWORDS)
        preprocessed_claims.append(claim)
        
    return preprocessed_claims

In [13]:
# Function to train and evaluate models
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'Logistic Regression': LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42)
    }
    
    results = {}
    
    for model_name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)
        
        # Predict on training and testing sets
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Store the results
        results[model_name] = {
            'train_accuracy': accuracy_score(y_train, y_train_pred),
            'test_accuracy': accuracy_score(y_test, y_test_pred),
            'train_confusion_matrix': confusion_matrix(y_train, y_train_pred),
            'test_confusion_matrix': confusion_matrix(y_test, y_test_pred),
            'train_classification_report': classification_report(y_train, y_train_pred, output_dict=True),
            'test_classification_report': classification_report(y_test, y_test_pred, output_dict=True)
        }
        # saving the models
        joblib.dump(model, f'{model_name.lower().replace(" ", "_")}_model.pkl')
    return results

In [14]:
# Main function to execute the workflow
def main(file_path):
    # Load data
    df = load_data(file_path)

In [15]:
# Main function to execute the workflow
def main(file_path):
    # Load data
    df = load_data(file_path)
    
    # Preprocess claims
    df['claims'] = preprocess_claims(df['claims'])
    
    # Encode speaker labels
    label_encoder = LabelEncoder()
    df['encoded_speaker'] = label_encoder.fit_transform(df['speaker'])
    
    # Vectorization
    tfidf_vectorizer = TfidfVectorizer()
    X_claims = tfidf_vectorizer.fit_transform(df['claims']).toarray()
    
    # Combine TF-IDF matrix and encoded speaker matrix
    X_speaker = df['encoded_speaker'].values.reshape(-1, 1)
    X_combined = np.hstack((X_claims, X_speaker))
    y = df['label']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
    
    # Train and evaluate models
    results = train_and_evaluate_models(X_train, X_test, y_train, y_test)

    # saving the vectorizer
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    
    for model_name, metrics in results.items():
        print(f"Model: {model_name}")
        print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
        print(f"Testing Accuracy: {metrics['test_accuracy']:.4f}")
        print("Training Confusion Matrix:")
        print(metrics['train_confusion_matrix'])
        print("Testing Confusion Matrix:")
        print(metrics['test_confusion_matrix'])
        print("Training Classification Report:")
        print(metrics['train_classification_report'])
        print("Testing Classification Report:")
        print(metrics['test_classification_report'])
        print("-" * 50)

if __name__ == "__main__":
    #path to your dataset
    file_path = 'sonadataset.csv'
    main(file_path)

Model: Random Forest
Training Accuracy: 0.9909
Testing Accuracy: 0.9330
Training Confusion Matrix:
[[  95    0]
 [  19 1973]]
Testing Confusion Matrix:
[[  1  24]
 [ 11 486]]
Training Classification Report:
{'0': {'precision': 0.8333333333333334, 'recall': 1.0, 'f1-score': 0.9090909090909091, 'support': 95.0}, '1': {'precision': 1.0, 'recall': 0.9904618473895582, 'f1-score': 0.9952080706179067, 'support': 1992.0}, 'accuracy': 0.9908960229995208, 'macro avg': {'precision': 0.9166666666666667, 'recall': 0.9952309236947792, 'f1-score': 0.9521494898544078, 'support': 2087.0}, 'weighted avg': {'precision': 0.9924133524996006, 'recall': 0.9908960229995208, 'f1-score': 0.991288027328465, 'support': 2087.0}}
Testing Classification Report:
{'0': {'precision': 0.08333333333333333, 'recall': 0.04, 'f1-score': 0.05405405405405406, 'support': 25.0}, '1': {'precision': 0.9529411764705882, 'recall': 0.9778672032193159, 'f1-score': 0.9652432969215492, 'support': 497.0}, 'accuracy': 0.9329501915708812,