In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from transformers import DistilBertTokenizer, DistilBertModel, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.data import Dataset
import tensorflow as tf
import matplotlib.pyplot as plt
import shap
import torch
import pickle

In [None]:
# Read the dataset from 'df.csv' into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='df.csv')

# Print a five-row preview so the load can be checked by eye
print(df.head(n=5))

In [None]:
# Pretrained 'distilbert-base-uncased' encoder (PyTorch) and its matching
# tokenizer; both are read as module-level names by the helper functions below.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The tokenizer is asked for PyTorch tensors (``return_tensors='pt'``) with
    truncation and padding enabled and a 512-token ceiling.

    Args:
        text: The value to tokenize (one cell of the 'Content' column).

    Returns:
        Whatever ``tokenizer(...)`` returns for these arguments.
    """
    return tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

# Tokenize every row of 'Content'; each row's tokenizer output is stored in a new column
df['Tokenized_Content'] = df['Content'].map(tokenize_text)

# Perform inference using the model
def get_embeddings(tokenized_text):
    """Return one pooled embedding vector for a single tokenized input.

    The module-level ``model`` is called with the tokenizer output unpacked as
    keyword arguments, with gradient tracking disabled. The resulting
    ``last_hidden_state`` is averaged over dim 1, squeezed, and converted to a
    NumPy array.

    Note: ``model`` is looked up at call time. A later cell rebinds ``model``
    to the TensorFlow classifier, so this function has to run before that cell.

    Args:
        tokenized_text: Mapping of tensors as produced by ``tokenize_text``.

    Returns:
        numpy.ndarray: The mean-pooled hidden state.
    """
    with torch.no_grad():
        hidden_states = model(**tokenized_text).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Run the encoder over every tokenized row and keep the pooled vector per row
df['Embeddings'] = df['Tokenized_Content'].map(get_embeddings)

# Persist the full DataFrame (tokenizer outputs and embeddings included) as a pickle
with open('df_with_embeddings.pkl', 'wb') as pkl_file:
    pickle.dump(df, pkl_file)

# Also write a CSV copy; the two tensor-valued columns are dropped first,
# so this file does not contain the embeddings themselves
(
    df
    .drop(columns=['Tokenized_Content', 'Embeddings'])
    .to_csv('df_with_embeddings.csv', index=False)
)

# Print a preview of the DataFrame, now including the new columns
print(df.head(n=5))

In [None]:
# Random-forest baseline trained on the pooled DistilBERT embeddings
print("RandomForestClassifier with Oversampling:")

# Two oversamplers are instantiated here; only `smote` is applied below
smote = SMOTE(random_state=42)
ros = RandomOverSampler(random_state=42)

# Stack the per-row embedding vectors into one 2-D feature matrix; the target
# is the 'outcome' column
X = np.vstack(df['Embeddings'])
y = df['outcome']

# Hold out 20% of the rows, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training split only, so the test split stays untouched
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit a 100-tree random forest on the resampled training data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Score the forest on the held-out split
y_pred_rf = rf_model.predict(X_test)
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_rf))
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred_rf))

In [None]:
# Feature Importance using SHAP
# Build a tree explainer around the fitted random forest and compute SHAP
# values for the first 100 rows of the held-out split only
# (presumably to keep the computation short — confirm).
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test[:100])

# Visualize feature importance
# Each "feature" here is one dimension of the embedding matrix X, so the plot
# ranks embedding dimensions rather than words.
# NOTE(review): the title is set through the pyplot state machine *before*
# summary_plot draws; whether it ends up on the rendered figure depends on how
# shap manages the current figure — confirm in the output.
plt.title("Feature Importance for RandomForest")
shap.summary_plot(shap_values, X_test[:100])

In [None]:
# DistilBERT Model Training
# Fine-tunes a fresh DistilBERT classifier on every fold of a stratified
# 5-fold split and collects each fold's Keras History for later plotting.
print("Training DistilBERT Model:")
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
history_list = []
fold = 1

# TFDistilBertForSequenceClassification consumes token ids (input_ids plus
# attention_mask), not the mean-pooled float vectors stored in X. Each fold is
# therefore tokenized from the raw 'Content' text. X is still used to drive
# the stratified split and to keep train_embeddings / val_embeddings available
# to later cells.
all_texts = df['Content'].tolist()


def _build_fold_dataset(row_positions, labels, shuffle=False):
    """Tokenize the selected rows and pair them with labels in a batched tf.data.Dataset.

    Args:
        row_positions: Positional row indices (as yielded by ``kfold.split``).
        labels: pandas Series of labels for those rows, in the same order.
        shuffle: Shuffle the examples before batching (training only).

    Returns:
        A dataset of ``(encoding dict, label)`` batches of size 16.
    """
    encodings = tokenizer(
        [all_texts[pos] for pos in row_positions],
        return_tensors='tf',
        truncation=True,
        padding=True,
        max_length=512,
    )
    dataset = Dataset.from_tensor_slices((dict(encodings), labels.to_numpy()))
    if shuffle:
        dataset = dataset.shuffle(len(row_positions))
    return dataset.batch(16)


for train_index, val_index in kfold.split(X, y):
    print(f"Starting fold {fold}...")

    # Split the data for this fold. kfold.split yields positional indices, so
    # the Series is indexed with .iloc (plain y[...] would look rows up by label).
    train_embeddings, val_embeddings = X[train_index], X[val_index]
    train_labels, val_labels = y.iloc[train_index], y.iloc[val_index]

    # Create TensorFlow datasets for training and validation.
    # The validation set is left unshuffled so predictions line up with val_index.
    train_dataset = _build_fold_dataset(train_index, train_labels, shuffle=True)
    val_dataset = _build_fold_dataset(val_index, val_labels)

    # Load a fresh DistilBERT sequence classifier for this fold.
    # This rebinds `model`, replacing the PyTorch encoder loaded earlier.
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(np.unique(y)))

    # Compile the model.
    # NOTE(review): the sparse loss assumes 'outcome' already holds integer
    # class ids in [0, num_labels) — confirm, or encode the labels first.
    optimizer = Adam(learning_rate=5e-5)
    loss = SparseCategoricalCrossentropy(from_logits=True)
    metric = SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # Set up early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    # Train the model
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=5,
        callbacks=[early_stopping]
    )

    history_list.append(history)
    fold += 1

In [None]:
# Overlay every fold's training and validation accuracy curves; a widening gap
# between the solid and dashed line of a fold points to overfitting
fig, ax = plt.subplots(figsize=(10, 6))
for fold_number, fold_history in enumerate(history_list, start=1):
    curves = fold_history.history
    ax.plot(curves['accuracy'], label=f'Fold {fold_number} Train Accuracy')
    ax.plot(curves['val_accuracy'], linestyle='--', label=f'Fold {fold_number} Validation Accuracy')
ax.set_title('Training vs. Validation Accuracy Across Folds')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()


In [None]:
# Feature importance using attention scores
# Extract attention scores from the model (example-based visualization)
def visualize_attention(model, tokenizer, text):
    """Print how much last-layer attention each token of ``text`` receives.

    The text is encoded, the model is run with ``output_attentions=True``, and
    the last layer's attention for the single batch item is averaged over
    heads and over query positions. That leaves one weight per attended-to
    token. Per the Hugging Face documentation each attention tensor has shape
    (batch, heads, seq_len, seq_len).

    Tokens are recovered from the encoded ids, so special tokens such as
    [CLS] and [SEP] are listed and every weight lines up with its own token.

    Args:
        model: Callable model returning an object with an ``attentions`` sequence.
        tokenizer: Tokenizer providing ``__call__`` and ``convert_ids_to_tokens``.
        text: The sentence to inspect.

    Returns:
        list[tuple[str, float]]: ``(token, weight)`` pairs in sequence order,
        the same pairs that are printed.
    """
    # Truncate to the encoder's 512-token limit so long inputs do not fail
    inputs = tokenizer(text, return_tensors="tf", truncation=True, max_length=512)
    outputs = model(inputs, output_attentions=True)

    # Last layer, first (only) batch item -> (heads, query_pos, key_pos)
    attention = np.asarray(outputs.attentions[-1][0])
    # Collapse heads and query positions: one weight per key position
    attention_weights = attention.mean(axis=(0, 1))

    token_ids = np.asarray(inputs["input_ids"])[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    print("Attention Weights:")
    token_weights = []
    for token, weight in zip(tokens, attention_weights):
        print(f"{token}: {weight:.4f}")
        token_weights.append((token, float(weight)))
    return token_weights

# Run the attention printout on one illustrative sentence
example_text = "Example negotiation message to visualize feature importance."
visualize_attention(model=model, tokenizer=tokenizer, text=example_text)

# Write the model currently bound to `model` and the tokenizer into the same
# output directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("distilbert_negotiation_model")

In [None]:
# Analyze the results
# Predict on the validation dataset left over from the last training fold and
# take the arg-max over the logits as the predicted class id
predictions = model.predict(val_dataset).logits
predicted_labels = tf.argmax(predictions, axis=1).numpy()

# Create a DataFrame to compare predictions with actual outcomes.
# 'message' holds the original text of the validation rows: a 2-D embedding
# matrix cannot be used as a single DataFrame column (pandas raises
# ValueError), and the text is what a reader needs to interpret a prediction.
# val_index is positional and the validation dataset is not shuffled, so all
# three columns share the same row order.
results_df = pd.DataFrame({
    'message': df['Content'].iloc[val_index].to_numpy(),
    'actual_outcome': np.asarray(val_labels),
    'predicted_outcome': predicted_labels
})

In [None]:
# Write the prediction comparison table to CSV (row index omitted) for later analysis
results_df.to_csv(path_or_buf="negotiation_results.csv", index=False)

# Confirm on stdout where the file went
print("Results saved to 'negotiation_results.csv'.")