In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, AutoTokenizer
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
import re
import torch
# from datasets import Dataset
import os
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Download the NLTK data packages this notebook relies on: the 'punkt' and
# 'punkt_tab' tokenizer data and the 'stopwords' corpus (presumably what
# word_tokenize and stopwords.words need at call time).
# A real download is only needed the first time these run in an environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def extract_non_blank_answers(text):
    """Keep only the answered Question/Answer pairs of a note and tidy the text.

    The note is scanned for ``Question: ... Answer: ...`` pairs. Pairs whose
    answer is blank or a single character (for example ``.``) are dropped,
    and the remaining questions and answers are joined with single spaces.
    When the note contains no such pair, the whole note is kept and only
    cleaned.

    Args:
        text (str): Raw note text.

    Returns:
        str: The retained text with runs of underscores removed and every
            run of whitespace collapsed to one space. The result is an empty
            string when pairs were found but every answer was blank.
    """
    # Regular expression to extract Question-Answer pairs. DOTALL lets a
    # question or an answer span several lines.
    pattern = r'Question:\s*(.*?)\s*Answer:\s*(.*?)\s*(?=Question:|$)'
    matches = re.findall(pattern, text, re.DOTALL)

    if matches:
        # The pattern swallows the whitespace around both captured groups, so
        # the pieces have to be re-joined with explicit spaces; otherwise the
        # last word of one piece fuses with the first word of the next.
        kept_pairs = [
            f"{question} {answer}"
            for question, answer in matches
            if len(answer.strip()) > 1  # skip blank / single-character answers
        ]
        return_string = ' '.join(kept_pairs)
    else:
        return_string = text

    # Remove unnecessary underscores *before* normalising whitespace, so the
    # gap left by a removed "____" blank is collapsed as well.
    return_string = re.sub(r'_+', '', return_string)

    # Remove extra whitespace and newlines
    return re.sub(r'\s+', ' ', return_string).strip()

# Preprocessing
def preprocess(text):
    """Lower-case and tokenize ``text``, keeping only informative words.

    A token is kept when it is longer than two characters, is not an English
    stop word, and does not consist solely of punctuation characters.

    Args:
        text (str): Text to clean.

    Returns:
        str: The kept tokens joined by single spaces, or the placeholder
            ``'[NO_CONTENT]'`` when no token survives the filters.
    """
    # Load the stop-word list once per call instead of once per token; a set
    # also makes each membership test constant-time.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    words = [
        word for word in word_tokenize(text.lower())
        if len(word) > 2  # Minimum length
        and word not in stop_words
        # Compare character by character: `word in string.punctuation` would
        # be a substring test and lets tokens such as "..." or "---" through.
        and not all(char in punctuation_chars for char in word)
    ]
    return ' '.join(words) if words else '[NO_CONTENT]'

"""
  Extracts top n keywords from input text using TF-IDF.

  Args:
      text (str): Input text to analyze.
      n (int): Number of top keywords to return (default=15).

  Returns:
      list: Top n keywords with highest TF-IDF scores.
    """
def get_top_keywords(text, n=15):
    """Return up to ``n`` keywords of ``text`` ranked by TF-IDF score.

    The text is cleaned with ``preprocess`` and then vectorized on its own,
    i.e. the vectorizer is fitted on a single document.

    Args:
        text (str): Input text to analyze.
        n (int): Maximum number of keywords to return (default=15).

    Returns:
        list[str]: Keywords ordered from highest to lowest score. Fewer than
            ``n`` items are returned when the text has fewer distinct terms,
            and an empty list when the vectorizer finds no usable term.
    """
    processed_text = preprocess(text)
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([processed_text])
    except ValueError:
        # scikit-learn raises ValueError when the document yields an empty
        # vocabulary; one such note should not abort a whole batch run.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    top_indices = scores.argsort()[::-1][:n]  # highest score first

    # Convert NumPy string scalars to plain str so the list displays cleanly.
    return [str(feature_names[i]) for i in top_indices]


def truncate_text(text, max_length=512):
    """Clip ``text`` to at most ``max_length`` tokens and return it as text.

    The notebook-level ``tokenizer`` encodes the text with truncation
    enabled; the token ids it kept are then decoded back into a string with
    special tokens skipped.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # A single text was encoded, so the first entry holds its token ids.
    kept_ids = encoded['input_ids'][0]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def process_batch(batch, batch_size=8):
    """Score sentiment and extract keywords for every row of ``batch``.

    Rows are handled ``batch_size`` at a time. Each note in the
    'Refined MSC Notes' column is truncated with ``truncate_text``, the
    truncated notes of a chunk are passed together to ``sentiment_analyzer``,
    and keywords are taken from each truncated note with
    ``get_top_keywords``.

    Args:
        batch (pd.DataFrame): Frame providing the 'Match ID 18Char',
            'Completion Date' and 'Refined MSC Notes' columns.
        batch_size (int): Number of rows sent to the analyzer per call.

    Returns:
        pd.DataFrame: One row per input row with the columns Match_ID,
            Completion_Date, Cleaned_MSC_Notes, Truncated_text, Sentiment,
            Sentiment_score and Keywords.
    """
    records = []
    for start in range(0, len(batch), batch_size):
        chunk = batch.iloc[start:start + batch_size]
        notes = chunk['Refined MSC Notes'].tolist()

        # Truncate and analyze
        clipped = [truncate_text(note) for note in notes]
        predictions = sentiment_analyzer(clipped)

        # Get keywords
        keyword_lists = [get_top_keywords(note) for note in clipped]

        # The per-chunk lists are positionally aligned with the chunk's rows.
        records.extend(
            {
                'Match_ID': row['Match ID 18Char'],
                'Completion_Date': row['Completion Date'],
                # Add other desired fields here
                'Cleaned_MSC_Notes': row['Refined MSC Notes'],
                'Truncated_text': clipped[position],
                'Sentiment': predictions[position]['label'],
                'Sentiment_score': predictions[position]['score'],
                'Keywords': keyword_lists[position],
            }
            for position, (_, row) in enumerate(chunk.iterrows())
        )

    return pd.DataFrame(records)


In [None]:

# Rows lacking either of these fields are dropped before analysis.
columns = ['Match Support Contact Notes', 'Completion Date']

# Load the workbook, drop incomplete rows and derive the cleaned notes column.
df = (
    pd.read_excel('Training-Restated.xlsx')
    .dropna(subset=columns)
    .assign(**{
        'Refined MSC Notes': lambda frame: frame['Match Support Contact Notes'].apply(
            extract_non_blank_answers
        ),
    })
)

# The tokenizer and the sentiment pipeline are built from the same checkpoint.
# `tokenizer` and `sentiment_analyzer` are read as notebook-level names by
# truncate_text and process_batch.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=tokenizer,
    # Use the first GPU when one is available, otherwise run on the CPU.
    device=0 if torch.cuda.is_available() else -1,
)

# Run the full analysis and preview the first rows.
result_df = process_batch(df)
result_df.head()

Device set to use cpu
