In [None]:
import os
import json

In [None]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Root folder of the SGPGI dataset. The SGPGI_DATA_DIR environment variable
# overrides the default so the notebook is not tied to one machine's layout.
root_directory = os.environ.get('SGPGI_DATA_DIR', '/lockbox/sgpgi_ds')

In [None]:
def extract_and_parse_jsonl(root_directory):
    """Recursively load every record from the ``.jsonl`` files under a directory.

    Directories and files are visited in sorted order so that repeated runs
    return the records in the same order. Files are decoded as UTF-8, matching
    the ``.txt`` reader used elsewhere in this notebook. Blank lines are
    skipped instead of being reported as JSON errors.

    Args:
        root_directory: Directory that is walked recursively.

    Returns:
        A list with one parsed JSON value per non-blank, well-formed line.
        Malformed lines and unreadable files are reported with ``print`` and
        skipped; they never raise.
    """
    parsed_json_data = []
    for subdir, dirs, files in os.walk(root_directory):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.jsonl'):
                continue
            file_path = os.path.join(subdir, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line_number, line in enumerate(f, start=1):
                        if not line.strip():
                            # An empty line (e.g. a trailing newline) is not a record.
                            continue
                        try:
                            parsed_json_data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            print(f"Error parsing JSON in file '{file_path}' (line {line_number}): {e}")
            except Exception as e:
                # Best effort: an unreadable file must not abort the whole load.
                print(f"Error processing file '{file_path}': {e}")
    return parsed_json_data


In [None]:
# Parse every .jsonl line found under root_directory into one in-memory list.
texts = extract_and_parse_jsonl(root_directory)

In [None]:
# Report only how many records were loaded. Printing the records themselves
# would write the full dataset content into the saved notebook output.
print(f"Loaded {len(texts)} records from {root_directory}")

In [None]:
# Keep the 'text' field of each record. A JSON line may also be a bare list,
# string or number, so records that are not dicts, or whose 'text' is not a
# string, are skipped rather than raising later in the pipeline.
extracted_texts = [
    entry['text']
    for entry in texts
    if isinstance(entry, dict) and isinstance(entry.get('text'), str)
]

# Show a short, truncated preview instead of every document in full.
print(f"{len(extracted_texts)} of {len(texts)} records have a 'text' field")
for text in extracted_texts[:3]:
    print(text[:200])

In [None]:
# Sanity check of the container type; extracted_texts is built by a list comprehension.
type(extracted_texts)

In [None]:
import re


In [None]:
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [None]:
# Whitespace-normalise each extracted document; extracted_texts itself is left untouched.
cleaned_texts = [clean_text(text) for text in extracted_texts]

In [None]:
# Preview only the first few cleaned documents; displaying the whole list
# would store every document in the notebook output.
cleaned_texts[:5]

In [None]:
# Concatenate all cleaned documents into one space-separated corpus string.
combined_string = ' '.join(cleaned_texts)

In [None]:
# Preview the start of the corpus only; the full string is the whole dataset.
combined_string[:1000]

In [None]:
import os
import numpy as np
from nltk.tokenize import word_tokenize

def read_and_normalize_text_files(root_directory):
    """Read every ``.txt`` file below a directory as a single-line string.

    Args:
        root_directory: Directory that is walked recursively.

    Returns:
        A list with one string per readable ``.txt`` file, decoded as UTF-8
        and with each newline replaced by a space. Files that cannot be read
        are reported with ``print`` and skipped.
    """
    file_texts = []
    for subdir, _, files in os.walk(root_directory):
        txt_paths = (os.path.join(subdir, name) for name in files if name.endswith('.txt'))
        for file_path in txt_paths:
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    contents = handle.read()
            except Exception as e:
                print(f"Error reading file '{file_path}': {e}")
            else:
                file_texts.append(contents.replace('\n', ' '))
    return file_texts

In [None]:
# Folder holding the LLaMA-3 generated .txt files. The LLAMA3_DATA_DIR
# environment variable overrides the default location.
candidate_datapath_llama3 = os.environ.get('LLAMA3_DATA_DIR', '/lockbox/llama3_20240509/llama3')

In [None]:
# One newline-flattened string per .txt file found under the LLaMA-3 folder.
syn_text_llama3 = read_and_normalize_text_files(candidate_datapath_llama3)

In [None]:
# Preview the first few files only, to keep the notebook output small.
syn_text_llama3[:5]

In [None]:
# Concatenate all LLaMA-3 files into one space-separated corpus string.
combined_string_1 = ' '.join(syn_text_llama3)

In [None]:
# Preview the start of the corpus only, to keep the notebook output small.
combined_string_1[:1000]

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, bigrams, trigrams, FreqDist
import string
# word_tokenize needs the Punkt sentence tokenizer data. Older NLTK releases
# load it from 'punkt'; newer releases load it from 'punkt_tab', so fetch both
# to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
def preprocess(text):
    """Lower-case, strip ASCII punctuation, tokenise and drop English stop words.

    Args:
        text: Raw text to tokenise.

    Returns:
        The NLTK word tokens of the lower-cased, punctuation-free text,
        excluding every token found in NLTK's English stop-word list.
    """
    # Deletion table: every character in string.punctuation is removed.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    lowered = text.lower().translate(punctuation_stripper)

    candidates = word_tokenize(lowered)

    english_stop_words = set(stopwords.words('english'))
    return [token for token in candidates if token not in english_stop_words]

In [None]:
# Naming used by the following cells: "*_combined" comes from combined_string
# (the .jsonl corpus) and "*_annotated" from combined_string_1 (the LLaMA-3 .txt corpus).
tokens_combined = preprocess(combined_string)
tokens_annotated = preprocess(combined_string_1)

In [None]:
def find_top_ngrams(tokens, n=10):
    """Return the most frequent unigrams, bigrams and trigrams of a token list.

    Args:
        tokens: Sequence of tokens.
        n: How many entries to keep per n-gram order.

    Returns:
        A 3-tuple ``(top_unigrams, top_bigrams, top_trigrams)``; each item is
        the ``FreqDist.most_common(n)`` list of ``(ngram, count)`` pairs.
    """
    ngram_sequences = (tokens, bigrams(tokens), trigrams(tokens))
    top_unigrams, top_bigrams, top_trigrams = (
        FreqDist(sequence).most_common(n) for sequence in ngram_sequences
    )
    return top_unigrams, top_bigrams, top_trigrams

In [None]:
# Top-10 n-grams of each corpus, as lists of (ngram, raw count) pairs.
top_unigrams_combined, top_bigrams_combined, top_trigrams_combined = find_top_ngrams(tokens_combined, 10)
top_unigrams_annotated, top_bigrams_annotated, top_trigrams_annotated = find_top_ngrams(tokens_annotated, 10)

In [None]:
def print_top_ngrams(title, unigrams, bigrams, trigrams):
    """Print a titled listing of three n-gram rankings.

    The headings always read "Top 10", whatever the length of the lists.

    Args:
        title: Text shown in the ``--- title ---`` banner.
        unigrams: Unigram ranking, printed as is.
        bigrams: Bigram ranking, printed as is.
        trigrams: Trigram ranking, printed as is.
    """
    print(f"--- {title} ---")
    sections = (
        ("Top 10 Unigrams:", unigrams),
        ("Top 10 Bigrams:", bigrams),
        ("Top 10 Trigrams:", trigrams),
    )
    for heading, ranked in sections:
        print(heading, ranked)
    # Leaves blank lines between consecutive listings.
    print("\n")

# Print the raw-count rankings of both corpora.
print_top_ngrams("Combined String", top_unigrams_combined, top_bigrams_combined, top_trigrams_combined)
print_top_ngrams("Annotated String", top_unigrams_annotated, top_bigrams_annotated, top_trigrams_annotated)

In [None]:
def normalize_freqs(ngrams):
    """Convert ``(ngram, count)`` pairs into percentage shares of the list total.

    The denominator is the sum of the counts in ``ngrams`` itself, so the
    shares of the given list add up to 100.

    Args:
        ngrams: List of ``(ngram, count)`` pairs.

    Returns:
        A list of ``(ngram, percentage)`` pairs in the same order.
    """
    total = 0
    for _, freq in ngrams:
        total += freq

    shares = []
    for ngram, freq in ngrams:
        shares.append((ngram, freq * 100 / total))
    return shares

# Percentage shares within each top-10 list (relative to the list's own total).
top_unigrams_combined_norm = normalize_freqs(top_unigrams_combined)
top_bigrams_combined_norm = normalize_freqs(top_bigrams_combined)
top_trigrams_combined_norm = normalize_freqs(top_trigrams_combined)

top_unigrams_annotated_norm = normalize_freqs(top_unigrams_annotated)
top_bigrams_annotated_norm = normalize_freqs(top_bigrams_annotated)
top_trigrams_annotated_norm = normalize_freqs(top_trigrams_annotated)


In [None]:
def aggregate_ngram_freqs(top_unigrams, top_bigrams, top_trigrams):
    """Lay three n-gram rankings out on one shared label axis.

    Labels are ordered unigrams, then bigrams, then trigrams. Each returned
    frequency list has one slot per label: it holds the frequency at the
    positions of its own n-gram order and 0 everywhere else, so any series can
    be plotted directly against ``ngrams_labels``.

    Args:
        top_unigrams: List of ``(token, freq)`` pairs.
        top_bigrams: List of ``((a, b), freq)`` pairs.
        top_trigrams: List of ``((a, b, c), freq)`` pairs.

    Returns:
        ``(ngrams_labels, unigram_freqs, bigram_freqs, trigram_freqs)``, four
        lists of equal length.
    """
    unigram_labels = [label for label, _ in top_unigrams]
    bigram_labels = [f"{a} {b}" for (a, b), _ in top_bigrams]
    trigram_labels = [f"{a} {b} {c}" for (a, b, c), _ in top_trigrams]
    ngrams_labels = unigram_labels + bigram_labels + trigram_labels

    n_uni, n_bi, n_tri = len(unigram_labels), len(bigram_labels), len(trigram_labels)

    # Zeros go on both sides so every value lines up with its own label;
    # padding only at the end would place bigram and trigram values under the
    # unigram labels.
    unigram_freqs = [freq for _, freq in top_unigrams] + [0] * (n_bi + n_tri)
    bigram_freqs = [0] * n_uni + [freq for _, freq in top_bigrams] + [0] * n_tri
    trigram_freqs = [0] * (n_uni + n_bi) + [freq for _, freq in top_trigrams]
    return ngrams_labels, unigram_freqs, bigram_freqs, trigram_freqs

# Shared-axis layout of the normalised rankings for each corpus.
# NOTE(review): none of these eight names is read again in the visible part of
# the notebook; confirm whether this cell is still needed.
labels_combined, unigrams_combined, bigrams_combined, trigrams_combined = aggregate_ngram_freqs(top_unigrams_combined_norm, top_bigrams_combined_norm, top_trigrams_combined_norm)
labels_annotated, unigrams_annotated, bigrams_annotated, trigrams_annotated = aggregate_ngram_freqs(top_unigrams_annotated_norm, top_bigrams_annotated_norm, top_trigrams_annotated_norm)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def normalize_ngrams(ngram_freq, total):
    """Express each count as a percentage of ``total``.

    Args:
        ngram_freq: List of ``(ngram, count)`` pairs.
        total: Denominator shared by all entries.

    Returns:
        A list of ``(ngram, percentage)`` pairs in the same order.
    """
    percentages = []
    for ngram, count in ngram_freq:
        percentages.append((ngram, count / total * 100))
    return percentages

def plot_normalized_ngrams(unigrams, bigrams, trigrams, title, output_filename):
    """Draw one bar chart of unigram, bigram and trigram shares and save it as PNG.

    Each n-gram order is normalised by the sum of its own counts, so the bars
    of one colour add up to 100 %.

    Args:
        unigrams: List of ``(token, count)`` pairs.
        bigrams: List of ``(token_tuple, count)`` pairs.
        trigrams: List of ``(token_tuple, count)`` pairs.
        title: Figure title.
        output_filename: Path the PNG (300 dpi) is written to.
    """
    unigrams_norm = normalize_ngrams(unigrams, sum(count for _, count in unigrams))
    bigrams_norm = normalize_ngrams(bigrams, sum(count for _, count in bigrams))
    trigrams_norm = normalize_ngrams(trigrams, sum(count for _, count in trigrams))

    unigram_values = [value for _, value in unigrams_norm]
    bigram_values = [value for _, value in bigrams_norm]
    trigram_values = [value for _, value in trigrams_norm]

    # Tick labels: unigrams as is, multi-token n-grams joined with spaces.
    all_labels = (
        [label for label, _ in unigrams_norm]
        + [" ".join(pair) for pair, _ in bigrams_norm]
        + [" ".join(trio) for trio, _ in trigrams_norm]
    )
    indices = np.arange(len(all_labels))

    # x positions: unigrams first, then bigrams, then trigrams.
    bigram_start = len(unigram_values)
    trigram_start = bigram_start + len(bigram_values)

    bar_width = 0.3
    fig, ax = plt.subplots(figsize=(14, 8))
    series = (
        (indices[:bigram_start], unigram_values, "Unigrams", "blue"),
        (indices[bigram_start:trigram_start], bigram_values, "Bigrams", "green"),
        (indices[trigram_start:], trigram_values, "Trigrams", "red"),
    )
    for positions, heights, legend_label, colour in series:
        ax.bar(positions, heights, width=bar_width, label=legend_label, color=colour)

    ax.set_xticks(indices)
    ax.set_xticklabels(all_labels, rotation=45, ha="right")

    ax.set_ylabel('Frequency (%)')
    ax.set_xlabel('N-grams')
    ax.set_title(title)
    ax.legend()
    plt.tight_layout()
    fig.savefig(output_filename, format='png', dpi=300)
    plt.show()

# Plot the raw-count rankings; plot_normalized_ngrams does its own normalisation.
# Each call also writes a PNG into the current working directory.
plot_normalized_ngrams(top_unigrams_combined, top_bigrams_combined, top_trigrams_combined, "SGPGI Dataset Normalized N-grams", "SGPGIngramsplot.png")
plot_normalized_ngrams(top_unigrams_annotated, top_bigrams_annotated, top_trigrams_annotated, "Synthetic SGPGI Dataset Normalized N-grams", "SGPGIsynngramplot.png")


In [None]:
import numpy as np

def compute_statistics(text):
    """Compute character, word and word-length statistics for a text.

    Words are the whitespace-separated pieces returned by ``str.split()``.
    The standard error is ``np.std`` (NumPy's default, population standard
    deviation) divided by the square root of the word count.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys 'Total Characters', 'Total Words',
        'Mean Word Length', 'SE Word Length', 'Median Word Length',
        'Min Word Length' and 'Max Word Length'. When the text contains no
        words, the five word-length statistics are NaN.
    """
    words = text.split()
    word_lengths = [len(word) for word in words]

    if word_lengths:
        mean_word_length = np.mean(word_lengths)
        std_error_word_length = np.std(word_lengths) / np.sqrt(len(word_lengths))
        median_word_length = np.median(word_lengths)
        min_word_length = np.min(word_lengths)
        max_word_length = np.max(word_lengths)
    else:
        # np.min / np.max raise ValueError on an empty sequence, so report
        # "undefined" explicitly when there is nothing to measure.
        mean_word_length = std_error_word_length = median_word_length = float('nan')
        min_word_length = max_word_length = float('nan')

    return {
        'Total Characters': len(text),
        'Total Words': len(words),
        'Mean Word Length': mean_word_length,
        'SE Word Length': std_error_word_length,
        'Median Word Length': median_word_length,
        'Min Word Length': min_word_length,
        'Max Word Length': max_word_length
    }

combined_stats = compute_statistics(combined_string)
annotated_stats = compute_statistics(combined_string_1)

# One heading followed by one "key: value" line per statistic, for each corpus.
for heading, stats in (
    ("Statistics for Combined String:", combined_stats),
    ("\nStatistics for Annotated String:", annotated_stats),
):
    print(heading)
    for key, value in stats.items():
        print(f"{key}: {value}")


In [None]:
import re
from collections import Counter
from nltk import bigrams, trigrams, word_tokenize
from nltk.corpus import stopwords

# English stop-word set built once here; clean_and_tokenize reads it as a global.
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(text):
    """Lower-case, keep only ASCII letters, digits and whitespace, then tokenise.

    Tokens are dropped when they are in the module-level ``stop_words`` set or
    when they contain the substring 'phi' or 'type'. The substring test also
    removes ordinary words that merely contain those letters (for example
    'neutrophil' or 'phenotype').

    Args:
        text: Raw text fragment.

    Returns:
        The list of remaining tokens, in order.
    """
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    kept = []
    for word in word_tokenize(normalized):
        if word in stop_words:
            continue
        if 'phi' in word or 'type' in word:
            continue
        kept.append(word)
    return kept

def extract_context_without_phi(text, window_size=5):
    """Rank the words found around ``<PHI ...>`` / ``</PHI>`` tags by frequency.

    The text is cut at every tag match. For each tag, two slices are tokenised
    with ``clean_and_tokenize`` and appended to one running word list: the text
    between the previous tag and this one, and up to ``window_size * 10``
    characters following this tag. Text after the last tag is appended too.

    NOTE(review): the look-ahead slice starts at the same offset as the next
    "before" slice (or the final tail), so the words right after each tag are
    added twice. The look-ahead is measured in characters, so it can end in the
    middle of a word and can extend into a following tag. Bigrams and trigrams
    are built over the concatenated list and may therefore span slice
    boundaries. Confirm whether this is the intended definition of "context".

    Args:
        text: Text that may contain PHI tags.
        window_size: Scales the look-ahead length (``window_size * 10`` characters).

    Returns:
        ``(top_unigrams, top_bigrams, top_trigrams)``; each is the
        ``Counter.most_common(10)`` list of ``(ngram, count)`` pairs.
    """
    # Matches opening and closing tags alike.
    phi_regex = re.compile(r'</?PHI[^>]*>')

    phi_positions = [(m.start(), m.end()) for m in phi_regex.finditer(text)]

    words = []
    prev_end = 0
    for start, end in phi_positions:
        # Everything between the previous tag (or the start of text) and this tag.
        context_before = clean_and_tokenize(text[prev_end:start])
        # Character-based look-ahead after the tag, clamped to the end of text.
        context_after = clean_and_tokenize(text[end:min(len(text), end + window_size * 10)])

        words.extend(context_before)
        words.extend(context_after)

        prev_end = end

    # Remaining text after the last tag (the whole text when there are no tags).
    if prev_end < len(text):
        words.extend(clean_and_tokenize(text[prev_end:]))

    top_unigrams = Counter(words).most_common(10)
    top_bigrams = Counter(bigrams(words)).most_common(10)
    top_trigrams = Counter(trigrams(words)).most_common(10)

    return top_unigrams, top_bigrams, top_trigrams

# Follow the naming used by the rest of the notebook: "*_combined" is derived
# from combined_string and "*_annotated" from combined_string_1. With the two
# inputs the other way round, the labels printed below and the titles of the
# PHI n-gram plots would be attached to the wrong corpus.
# These assignments rebind the names set earlier by find_top_ngrams.
top_unigrams_combined, top_bigrams_combined, top_trigrams_combined = extract_context_without_phi(combined_string)
top_unigrams_annotated, top_bigrams_annotated, top_trigrams_annotated = extract_context_without_phi(combined_string_1)

# Print results for annotated text
print("\nTop 10 Surrounding Unigrams (Annotated):")
for unigram, count in top_unigrams_annotated:
    print(f"{unigram}: {count}")

print("\nTop 10 Surrounding Bigrams (Annotated):")
for bigram, count in top_bigrams_annotated:
    print(f"{bigram}: {count}")

print("\nTop 10 Surrounding Trigrams (Annotated):")
for trigram, count in top_trigrams_annotated:
    print(f"{trigram}: {count}")

# Print results for combined text
print("\nTop 10 Surrounding Unigrams (Combined):")
for unigram, count in top_unigrams_combined:
    print(f"{unigram}: {count}")

print("\nTop 10 Surrounding Bigrams (Combined):")
for bigram, count in top_bigrams_combined:
    print(f"{bigram}: {count}")

print("\nTop 10 Surrounding Trigrams (Combined):")
for trigram, count in top_trigrams_combined:
    print(f"{trigram}: {count}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_normalized_ngrams(unigrams, bigrams, trigrams, title, filename):
    """Draw one bar chart of unigram, bigram and trigram shares and save it as PNG.

    Each n-gram order is normalised by the sum of its own counts. Bigram and
    trigram tick labels are shown as comma-separated tokens. An empty n-gram
    list contributes no bars instead of raising.

    This definition rebinds the name ``plot_normalized_ngrams`` that an earlier
    cell of this notebook also defines.

    Args:
        unigrams: List of ``(token, count)`` pairs.
        bigrams: List of ``((a, b), count)`` pairs.
        trigrams: List of ``((a, b, c), count)`` pairs.
        title: Figure title.
        filename: Path the PNG is written to.
    """
    def _labels_and_percentages(ngrams):
        """Split (ngram, count) pairs into labels and percentage shares."""
        labels = [ngram for ngram, _ in ngrams]
        counts = [count for _, count in ngrams]
        total = sum(counts)
        # A zero total yields zero-height bars rather than a ZeroDivisionError.
        percentages = [count / total * 100 if total else 0.0 for count in counts]
        return labels, percentages

    unigram_labels, unigram_percentages = _labels_and_percentages(unigrams)
    bigram_labels, bigram_percentages = _labels_and_percentages(bigrams)
    trigram_labels, trigram_percentages = _labels_and_percentages(trigrams)

    # x positions: unigrams first, then bigrams, then trigrams.
    unigram_x_positions = np.arange(len(unigram_labels))
    bigram_x_positions = np.arange(len(bigram_labels)) + len(unigram_labels)
    trigram_x_positions = np.arange(len(trigram_labels)) + len(unigram_labels) + len(bigram_labels)

    fig, ax = plt.subplots(figsize=(14, 8))
    ax.bar(unigram_x_positions, unigram_percentages, width=0.4, label='Unigrams')
    ax.bar(bigram_x_positions, bigram_percentages, width=0.4, label='Bigrams')
    ax.bar(trigram_x_positions, trigram_percentages, width=0.4, label='Trigrams')

    all_positions = np.concatenate([unigram_x_positions, bigram_x_positions, trigram_x_positions])
    all_labels = (
        list(unigram_labels)
        + [f'{a}, {b}' for a, b in bigram_labels]
        + [f'{a}, {b}, {c}' for a, b, c in trigram_labels]
    )

    ax.set_xticks(all_positions)
    ax.set_xticklabels(all_labels, rotation=45, ha='right')

    ax.set_xlabel('N-Grams')
    ax.set_ylabel('Frequency (%)')
    ax.set_title(title)
    ax.legend()
    ax.grid(axis='y')

    plt.tight_layout()
    # Save through the figure object rather than pyplot's implicit current figure.
    fig.savefig(filename, format='png')
    plt.show()

# Plot whatever was most recently assigned to top_*_combined / top_*_annotated.
# NOTE(review): the titles assume "*_combined" holds the SGPGI results and
# "*_annotated" the synthetic ones; verify against the cell that assigns them.
plot_normalized_ngrams(top_unigrams_combined, top_bigrams_combined, top_trigrams_combined, "SGPGI Dataset PHI N-Grams", "sgpgiPHI_ngrams.png")
plot_normalized_ngrams(top_unigrams_annotated, top_bigrams_annotated, top_trigrams_annotated, "Synthetic SGPGI Dataset PHI N-Grams", "sgpgiSYNPHI_ngrams.png")


In [None]:
def jaccard_distance(set1, set2):
    """Return the Jaccard distance ``1 - |A ∩ B| / |A ∪ B|`` between two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in ``[0, 1]``: 0 for identical sets, 1 for disjoint non-empty
        sets. Two empty sets are treated as identical and give 0.0.
    """
    union = set1.union(set2)
    if not union:
        # Both sets are empty: the ratio is undefined, so avoid dividing by zero.
        return 0.0
    intersection = set1.intersection(set2)
    return 1 - len(intersection) / len(union)

In [None]:
# Vocabulary (unique tokens) of each corpus, from the preprocess() output.
set_combined = set(tokens_combined)
set_annotated = set(tokens_annotated)

In [None]:
# Vocabulary-level Jaccard distance: it compares the sets of distinct tokens
# and ignores how often each token occurs.
distance = jaccard_distance(set_combined, set_annotated)
print(f"Jaccard Distance: {distance}")

In [None]:
import sys
# Put this directory first on the import path so `bert_score` is imported from
# there ahead of any installed copy.
# NOTE(review): machine-specific absolute path; presumably a local checkout of bert_score. Confirm.
sys.path.insert(0, '/home/lokesh/ds_comparison/bert_score')

In [None]:
from bert_score import score

In [None]:
# bert_score.score takes the candidates first and the references second. The
# LLaMA-3 text (combined_string_1) is the candidate under evaluation and the
# .jsonl corpus (combined_string) is the reference; passing them the other way
# round swaps precision and recall. Keyword arguments make the roles explicit.
# NOTE(review): each corpus is passed as ONE string. BERT-style encoders have a
# bounded input length, so presumably only the beginning of each corpus is
# actually compared. Confirm, and consider scoring per document instead.
P, R, F1 = score(cands=[combined_string_1], refs=[combined_string], lang='en', model_type="dmis-lab/biobert-v1.1")
print("Precision:", P)
print("Recall:", R)
print("F1 Score:", F1)