In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

In [None]:
import pandas as pd

# Load the deals training set.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this file exists.
file_path = '/Users/aaryanshah/Oncampus-Job/NLP_Gal/data/TrainingSet 1(Deals).csv'
data = pd.read_csv(file_path)

# DataFrame.info() prints its summary and returns None, so it is called as a
# statement instead of being packed into the displayed tuple (which rendered
# as "(<text repr>, None)"). The preview is the cell's last expression so the
# notebook shows it as a rich table.
data.info()
data.head()


In [None]:
# Keep only rows where all three text fields are present. The explicit .copy()
# makes data_clean an independent frame, so the column assignments made on it
# (here and in later cells) do not raise SettingWithCopyWarning or depend on
# whether dropna() handed back a view of `data`.
data_clean = data.dropna(
    subset=['Target Business Description', 'M&A Headline', 'Deal Synopsis']
).copy()

# Concatenate the three text fields, separated by single spaces, into one
# model input column.
data_clean['Combined Text'] = (
    data_clean['Target Business Description'] + " "
    + data_clean['M&A Headline'] + " "
    + data_clean['Deal Synopsis']
)

# Preview the combined text next to the six human rating columns.
data_clean[['Combined Text', 'Singh', 'Arora', 'Neilsen', 'RH', 'Uche', 'Edrick']].head()


In [None]:
import numpy as np

# Consensus rating across the six human raters for one deal.
def leave_one_out_avg(row):
    """Return the consensus rating for one row of the deals frame.

    The original scheme removed one rater at a time, averaged the rest, and
    then averaged those leave-one-out means. For n ratings that is
    mean_i((S - r_i) / (n - 1)) = S / n, i.e. exactly the plain mean, so the
    plain mean is computed directly. This avoids the extra floating-point
    rounding of the two-stage average, which could push a value that is
    mathematically on a .5 boundary to the wrong side of it.

    Missing ratings are skipped rather than turning the whole result into NaN.

    Parameters
    ----------
    row : pandas.Series
        A row containing the columns 'Singh', 'Arora', 'Neilsen', 'RH',
        'Uche' and 'Edrick'.

    Returns
    -------
    float
        Mean of the available ratings, or NaN when no rating is available.
    """
    ratings = pd.to_numeric(
        row[['Singh', 'Arora', 'Neilsen', 'RH', 'Uche', 'Edrick']]
    ).to_numpy(dtype=float)
    available = ratings[~np.isnan(ratings)]
    if available.size == 0:
        return np.nan
    return np.mean(available)

# Compute the consensus rating row by row (axis=1 hands each row to the
# function as a Series) and store it as a new column.
adjusted_ratings = data_clean.apply(leave_one_out_avg, axis=1)
data_clean['Adjusted Rating'] = adjusted_ratings

# Quick visual check of the new column next to the text it belongs to.
data_clean[['Combined Text', 'Adjusted Rating']].head()

In [None]:
# Bucket a continuous adjusted rating into one of the five capability categories.
def map_rating_to_category(rating):
    """Map a numeric rating to an integer category from 1 to 5.

    Categories and their exclusive upper bounds:
        1  Maintain Core Capabilities                        rating < 1.5
        2  Support Core Capabilities                         rating < 2.5
        3  Adjacent Capabilities                             rating < 3.5
        4  Near-Emergent Capabilities                        rating < 4.5
        5  Emergent Capabilities/Unrelated Diversification   everything else

    A NaN rating fails every `<` comparison and therefore falls through to 5.
    """
    upper_bounds = (1.5, 2.5, 3.5, 4.5)
    for category, bound in enumerate(upper_bounds, start=1):
        if rating < bound:
            return category
    return 5

# Turn each continuous adjusted rating into its discrete category label.
data_clean['Category'] = data_clean['Adjusted Rating'].map(map_rating_to_category)

# Number of rows per category.
category_distribution = data_clean['Category'].value_counts()

# Show a preview of the labelled rows together with the class counts.
(
    data_clean[['Combined Text', 'Adjusted Rating', 'Category']].head(),
    category_distribution,
)


- Category 1: 1111 entries
- Category 2: 592 entries
- Category 3: 504 entries
- Category 4: 202 entries
- Category 5: 123 entries

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Target labels.
y = data_clean['Category']

# Split the raw text first. Same test_size and random_state as before, so the
# same rows end up in each split.
text_train, text_test, y_train, y_test = train_test_split(
    data_clean['Combined Text'], y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary and IDF weights on the training rows only, then
# apply that fitted transform to the held-out rows. Fitting on the full corpus
# (as was done previously) lets test-set statistics leak into the features.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Shapes of the resulting feature matrices and label vectors.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a 100-tree random forest (seeded for repeatability) on the training features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows.
rf_predictions = rf_model.predict(X_test)
rf_report = classification_report(y_test, rf_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Overall accuracy followed by the per-class precision/recall/F1 table.
print('accuracy:', rf_accuracy)
print('Classification Report:\n', rf_report)


In [None]:
pip install torch transformers


In [None]:
# from sklearn.model_selection import train_test_split

# # Splitting indices of the dataset into training and testing sets
# train_indices, test_indices = train_test_split(data_clean.index, test_size=0.2, random_state=42)

# # Using the indices to access the text directly from the DataFrame for BERT
# train_texts = data_clean.loc[train_indices, 'Combined Text'].tolist()
# test_texts = data_clean.loc[test_indices, 'Combined Text'].tolist()

# # Proceed with tokenization and dataset preparation as before
# train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
# test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# # Your label data also needs to be accessed via indices
# train_labels = data_clean.loc[train_indices, 'Category'].tolist()
# test_labels = data_clean.loc[test_indices, 'Category'].tolist()

# # Then, create your dataset for training and testing
# train_dataset = DealsDataset(train_encodings, train_labels)
# test_dataset = DealsDataset(test_encodings, test_labels)

In [None]:
pip install accelerate -U

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import Trainer

class DealsDataset(Dataset):
    """Torch dataset pairing pre-computed tokenizer encodings with labels.

    `encodings` is a mapping from field name to a per-sample indexable
    collection; `labels` is an indexable collection with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor and attach
        # the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pre-trained uncased BERT tokenizer used for every text in this cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    """Tokenize `text`, truncating and padding every sequence to 128 tokens."""
    encoded = tokenizer(text, max_length=128, truncation=True, padding="max_length")
    return encoded

from sklearn.model_selection import train_test_split

# Hold out 20% of data_clean's row labels; random_state pins the shuffle.
train_indices, test_indices = train_test_split(data_clean.index, test_size=0.2, random_state=42)

# Pull the category labels for each split by row label.
train_labels = data_clean.loc[train_indices, 'Category'].tolist()
test_labels = data_clean.loc[test_indices, 'Category'].tolist()

# Pull the raw combined text for each split the same way.
train_texts = data_clean.loc[train_indices, 'Combined Text'].tolist()
test_texts = data_clean.loc[test_indices, 'Combined Text'].tolist()

# Tokenize both splits with identical settings: truncate to at most 128
# tokens and pad each split to its longest sequence.
train_encodings = tokenizer(train_texts, max_length=128, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, max_length=128, padding=True, truncation=True)

# Wrap encodings and labels so the Trainer can index them.
train_dataset = DealsDataset(train_encodings, train_labels)
test_dataset = DealsDataset(test_encodings, test_labels)

# Hyperparameters and output locations for the baseline fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    logging_dir='./logs',            # where logs are written
    logging_steps=10,                # log every 10 optimisation steps
    num_train_epochs=3,              # passes over the training set
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    warmup_steps=500,                # learning-rate warmup steps
    weight_decay=0.01,               # weight decay strength
)

# Sequence-classification head on top of pre-trained BERT. The 'Category'
# labels are the integers 1-5, so six outputs are requested to keep index 5
# valid; output 0 never appears as a target.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# The Trainer ties together the 🤗 Transformers model, its arguments and both datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model.
trainer.train()


In [None]:
pip install transformers[torch]

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Summary metrics from the Trainer on its eval_dataset.
results = trainer.evaluate()
print(results)

# For more detailed metrics, get the raw outputs on the test set and take the
# highest-scoring class index per sample.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Compare predicted class indices against the stored labels.
true_labels = test_dataset.labels
accuracy = accuracy_score(true_labels, pred_labels)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1.
report = classification_report(true_labels, pred_labels)
print(report)


### Using Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Split the raw text first (same test_size and random_state as the other
# baselines, so the same rows are held out).
text_train_count, text_test_count, y_train_count, y_test_count = train_test_split(
    data_clean['Combined Text'], y, test_size=0.2, random_state=42)

# Learn the bag-of-words vocabulary from the training rows only and reuse it
# for the held-out rows. Fitting on the full corpus (as was done previously)
# lets the test set influence which 5000 terms are kept.
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_count = count_vectorizer.fit_transform(text_train_count)
X_test_count = count_vectorizer.transform(text_test_count)

# Train a logistic regression classifier on the count features.
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_count, y_train_count)

# Predict on the held-out rows.
count_predictions = logreg_model.predict(X_test_count)

# Evaluate.
count_accuracy = accuracy_score(y_test_count, count_predictions)
count_report = classification_report(y_test_count, count_predictions)

print('accuracy:', count_accuracy)
print('Classification Report:\n', count_report)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Target labels.
y = data_clean['Category']

# Split the raw text first (same test_size and random_state as before, so the
# same rows are held out).
text_train_tfidf, text_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    data_clean['Combined Text'], y, test_size=0.2, random_state=42)

# Fit TF-IDF on the training rows only; the held-out rows are transformed with
# the fitted vocabulary and IDF weights so no test statistics leak into the
# features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(text_train_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(text_test_tfidf)

# Candidate classifiers. The forest is seeded so reruns give the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC()
}

# Train and evaluate each classifier. The loop variables are deliberately not
# called `model` / `predictions`, so they do not rebind the notebook-level
# BERT `model` or earlier prediction results.
for name, clf in models.items():
    clf.fit(X_train_tfidf, y_train_tfidf)
    tfidf_predictions = clf.predict(X_test_tfidf)
    print(f"{name} with TF-IDF:")
    print("Accuracy:", accuracy_score(y_test_tfidf, tfidf_predictions))
    print(classification_report(y_test_tfidf, tfidf_predictions))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Target labels.
y = data_clean['Category']

# Split the raw text first (same test_size and random_state as before, so the
# same rows are held out).
text_train_count, text_test_count, y_train_count, y_test_count = train_test_split(
    data_clean['Combined Text'], y, test_size=0.2, random_state=42)

# Bag-of-words counts (not TF-IDF). The vocabulary is learned from the
# training rows only, and the results are stored under *_count names so they
# no longer overwrite the TF-IDF matrices.
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_count = count_vectorizer.fit_transform(text_train_count)
X_test_count = count_vectorizer.transform(text_test_count)

# Candidate classifiers. The forest is seeded so reruns give the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC()
}

# Train and evaluate each classifier. The loop variables are deliberately not
# called `model` / `predictions`, so they do not rebind the notebook-level
# BERT `model` or earlier prediction results.
for name, clf in models.items():
    clf.fit(X_train_count, y_train_count)
    count_predictions = clf.predict(X_test_count)
    print(f"{name} with Count Vectorizer:")
    print("Accuracy:", accuracy_score(y_test_count, count_predictions))
    print(classification_report(y_test_count, count_predictions))

In [None]:
# This cell was entirely commented out, yet later cells read `bert_embeddings`;
# on a fresh kernel they fail with NameError. It is restored here, with two
# changes: texts are encoded in batches instead of one giant forward pass, and
# the encoder/tokenizer get their own names so the fine-tuning `model` and
# `tokenizer` objects are not overwritten.
from transformers import BertModel, BertTokenizer
import torch

embedding_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, batch_size=32):
    """Return one [CLS] embedding per text as a 2-D NumPy array.

    Parameters
    ----------
    texts : list of str
        Texts to embed.
    batch_size : int, default 32
        Number of texts encoded per forward pass; bounds peak memory use.

    Returns
    -------
    numpy.ndarray
        Array with one row per input text, taken from position 0 of the
        encoder's last hidden state. Empty input yields an array with zero rows.
    """
    embedding_model.eval()
    batches = []
    with torch.no_grad():  # inference only; no gradients needed
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = embedding_tokenizer(
                batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = embedding_model(**inputs)
            # Position 0 of the last hidden state is the [CLS] token output.
            batches.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    if not batches:
        return np.empty((0, embedding_model.config.hidden_size))
    return np.concatenate(batches, axis=0)

# Embed every combined text; row order matches data_clean.
texts = data_clean['Combined Text'].tolist()
bert_embeddings = get_bert_embeddings(texts)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Requires `bert_embeddings` (one row per row of data_clean) to exist already;
# the target is data_clean['Category'].
X_train, X_test, y_train, y_test = train_test_split(
    bert_embeddings,
    data_clean['Category'],
    test_size=0.2,
    random_state=42,
)

# Logistic regression on the embedding features.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
predictions = log_reg.predict(X_test)

# Accuracy and per-class report on the held-out rows.
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Requires `bert_embeddings` (one row per row of data_clean) to exist already;
# the target is data_clean['Category'].
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, data_clean['Category'], test_size=0.2, random_state=42)

# Random forest on the embedding features. The forest is seeded (like the
# split) so the reported scores are reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

# Accuracy and per-class report on the held-out rows.
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))



In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Requires `bert_embeddings` (one row per row of data_clean) to exist already;
# the target is data_clean['Category'].
X_train, X_test, y_train, y_test = train_test_split(
    bert_embeddings,
    data_clean['Category'],
    test_size=0.2,
    random_state=42,
)

# Support vector classifier with default settings on the embedding features.
svm = SVC()
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

# Accuracy and per-class report on the held-out rows.
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
from gensim.models import Word2Vec

# Whitespace-tokenize every combined text, then train 100-dimensional
# Word2Vec vectors on those token lists (min_count=1 keeps every token).
tokenized_texts = list(map(str.split, data_clean['Combined Text']))
w2v_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_w2v_embeddings(texts, model=None):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed; each is split on whitespace.
    model : optional
        Object exposing `.wv` (supporting `in`, indexing by word, and a
        `vector_size` attribute). Defaults to the notebook-level `w2v_model`.

    Returns
    -------
    numpy.ndarray
        One row per text. A text with no in-vocabulary word gets a zero
        vector. The row width is read from the model instead of being
        hard-coded, so changing `vector_size` at training time cannot produce
        ragged rows. Empty input yields an array of shape (0, vector_size).
    """
    if model is None:
        model = w2v_model
    word_index = model.wv
    dim = word_index.vector_size
    embeddings = []
    for text in texts:
        word_vectors = [word_index[word] for word in text.split() if word in word_index]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(dim))
    if not embeddings:
        return np.empty((0, dim))
    return np.array(embeddings)

# Embed every combined text; row order follows data_clean.
w2v_embeddings = get_w2v_embeddings(list(data_clean['Combined Text']))

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import get_scheduler
import torch
from torch.utils.data import Dataset

class DealsDataset(Dataset):
    """Index-addressable view over tokenizer encodings and their labels."""

    def __init__(self, encodings, labels):
        # `encodings` maps field name -> per-sample values; `labels` holds one
        # entry per sample.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the tensors for sample `idx`, then add its label under 'labels'.
        item = dict(
            (key, torch.tensor(val[idx])) for key, val in self.encodings.items()
        )
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Pre-trained uncased BERT tokenizer used for every text in this cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    """Tokenize `text`, truncating and padding every sequence to 128 tokens."""
    encoded = tokenizer(text, max_length=128, truncation=True, padding="max_length")
    return encoded

from sklearn.model_selection import train_test_split

# Hold out 20% of data_clean's row labels; random_state pins the shuffle.
train_indices, test_indices = train_test_split(data_clean.index, test_size=0.2, random_state=42)

# Pull the category labels for each split by row label.
train_labels = data_clean.loc[train_indices, 'Category'].tolist()
test_labels = data_clean.loc[test_indices, 'Category'].tolist()

# Pull the raw combined text for each split the same way.
train_texts = data_clean.loc[train_indices, 'Combined Text'].tolist()
test_texts = data_clean.loc[test_indices, 'Combined Text'].tolist()

# Tokenize both splits with identical settings: truncate to at most 128
# tokens and pad each split to its longest sequence.
train_encodings = tokenizer(train_texts, max_length=128, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, max_length=128, padding=True, truncation=True)

# Wrap encodings and labels so the Trainer can index them.
train_dataset = DealsDataset(train_encodings, train_labels)
test_dataset = DealsDataset(test_encodings, test_labels)

# Tuned fine-tuning configuration: more epochs, larger batches, shorter warmup,
# stronger weight decay, and periodic evaluation/checkpointing.
# NOTE(review): eval_dataset below is the test split and
# load_best_model_at_end=True, so checkpoint selection is driven by the same
# rows that are later used for reporting.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=5,               # increased epochs
    per_device_train_batch_size=16,   # adjusted batch size
    per_device_eval_batch_size=16,
    learning_rate=5e-5,               # adjusted learning rate
    warmup_steps=100,                 # adjusted warmup steps
    weight_decay=0.05,                # increased weight decay for regularization
    evaluation_strategy="steps",      # evaluate during training...
    eval_steps=50,                    # ...every 50 steps
    save_strategy="steps",            # checkpoint on the same schedule
    save_steps=50,
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
)

# The 'Category' labels are the integers 1-5, so six outputs are requested to
# keep index 5 valid; output 0 never appears as a target.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Trainer wired to the tuned arguments and both datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model.
trainer.train()

import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Summary metrics from the Trainer on its eval_dataset.
results = trainer.evaluate()
print(results)

# For more detailed metrics, get the raw outputs on the test set and take the
# highest-scoring class index per sample.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Compare predicted class indices against the stored labels.
true_labels = test_dataset.labels
accuracy = accuracy_score(true_labels, pred_labels)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1.
report = classification_report(true_labels, pred_labels)
print(report)

In [None]:
import re
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Text normalisation applied before tokenization.
def preprocess_text(text):
    """Strip HTML-style tags and punctuation from `text` and lowercase it.

    Anything between '<' and '>' is removed first; then every character that
    is neither a word character nor whitespace is removed (underscores and
    digits are kept); finally the result is lowercased.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_tags)
    return without_punctuation.lower()

# Normalise every combined text and keep the result in its own column.
data_clean['Processed Text'] = data_clean['Combined Text'].map(preprocess_text)

# Torch dataset that tokenizes its texts once, at construction time.
class DealsDataset(Dataset):
    """Dataset built from raw texts, labels and a tokenizer callable.

    The tokenizer is called once on all `texts` with truncation, padding and
    `max_length=max_len`; its result must map field names to per-sample
    indexable values.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensors for each encoding field of sample `idx`, plus its label.
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Pre-trained uncased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Reuse the existing train/test row labels (train_indices / test_indices) to
# pull category labels and preprocessed text for each split.
train_labels = data_clean.loc[train_indices, 'Category'].tolist()
test_labels = data_clean.loc[test_indices, 'Category'].tolist()
train_texts = data_clean.loc[train_indices, 'Processed Text'].tolist()
test_texts = data_clean.loc[test_indices, 'Processed Text'].tolist()

# Each dataset tokenizes its own texts on construction.
train_dataset = DealsDataset(train_texts, train_labels, tokenizer)
test_dataset = DealsDataset(test_texts, test_labels, tokenizer)

# Training configuration for the run on preprocessed text.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,  # Adjust epochs
    per_device_train_batch_size=16,  # Adjust batch size
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=3e-5,
    evaluation_strategy='steps',
    eval_steps=50,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="steps",
    save_steps=50
)

# The 'Category' labels are the integers 1..5 and are fed to the model as-is,
# so the classifier needs an output for every index up to the largest label.
# Sizing the head by the number of distinct categories (5) leaves label 5
# outside the valid range 0..4 and the loss computation fails; size it by
# max label + 1 instead (output 0 is simply never a target).
num_labels = int(data_clean['Category'].max()) + 1
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()
print(results)


In [None]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Summary metrics from the Trainer on the test dataset.
evaluation_results = trainer.evaluate(test_dataset)
print("Evaluation results:", evaluation_results)

# Raw model outputs -> highest-scoring class index per sample.
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Overall accuracy against the true category labels.
accuracy = accuracy_score(test_labels, predicted_labels)
print("Accuracy:", accuracy)

# Per-class report. The label ids are listed explicitly so each display name
# is tied to its category id (1..5). Without `labels`, the names are matched
# to whatever distinct values happen to occur in the true/predicted labels,
# and the call raises if that is not exactly five values.
category_ids = [1, 2, 3, 4, 5]
report = classification_report(
    test_labels,
    predicted_labels,
    labels=category_ids,
    target_names=["Class1", "Class2", "Class3", "Class4", "Class5"],
)
print("Classification Report:\n", report)


In [None]:
import pandas as pd

# Load the cluster assignments.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this file exists.
cluster_file_path = '/Users/aaryanshah/Oncampus-Job/NLP_Gal/clustering/Updated_Clusters _including_CVC_results.csv'
clusters_data = pd.read_csv(cluster_file_path)

# Show the first rows, then the column/dtype summary. DataFrame.info() prints
# its report itself and returns None, so wrapping it in print() only added a
# stray "None" line to the output.
print(clusters_data.head())
clusters_data.info()
