In [41]:
!pip install --upgrade boto3
!pip install textstat
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [44]:
import json
import boto3
from typing import Dict, List
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import spacy
from collections import Counter
import textstat
from typing import List
from scipy.stats import chisquare
from collections import Counter

endpoint_name = "jumpstart-dft-hf-llm-mistral-7b-ins-20231202-203714"

def query_endpoint(payload):
    client = boto3.client("runtime.sagemaker")
    try:
        response = client.invoke_endpoint(
            EndpointName=endpoint_name, 
            InferenceComponentName='jumpstart-dft-hf-llm-mistral-7b-ins-20231202-2-20231202-2037370',
            
            # InferenceComponentName='jumpstart-dft-hf-text2text-flan-t5-20231202-235-20231202-235546',
            # InferenceComponentName='jumpstart-dft-meta-textgeneration-l-20231202-2-20231202-2332220',
            # InferenceComponentName='jumpstart-dft-hf-llm-falcon-7b-inst-20231202-2-20231202-2046440',
            ContentType="application/json", 
            Body=json.dumps(payload).encode("utf-8")
        )
        response_body = response["Body"].read().decode("utf8")
        return json.loads(response_body)
    except Exception as e:
        print(f"Error querying endpoint: {e}")
        return None

def format_instructions(name: str, base_content: str) -> str:
    if name:
        return f"""
Summarize the following content in the style of {name}:
{base_content}
    """
    else:
        return f"""
Summarize the following content:
{base_content}
    """

def calculate_lexical_diversity(text):
    tokens = word_tokenize(text)
    types = set(tokens)
    return len(types) / len(tokens) if tokens else 0

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def calculate_pos_distribution(text):
    doc = nlp(text)
    pos_counts = Counter(token.pos_ for token in doc)
    total = sum(pos_counts.values())
    pos_distribution = {pos: count / total for pos, count in pos_counts.items()}
    return pos_distribution

def calculate_ner_distribution(text):
    doc = nlp(text)
    entity_counts = Counter(ent.label_ for ent in doc.ents)
    total_entities = sum(entity_counts.values())
    entity_distribution = {ent_type: count / total_entities for ent_type, count in entity_counts.items()}
    return entity_distribution

def calculate_sentence_lengths(text):
    doc = nlp(text)
    sentence_lengths = [len(sentence) for sentence in doc.sents]
    return sentence_lengths

def calculate_readability_score(text):
    return textstat.flesch_reading_ease(text)

def analyze_responses(name: str, responses: List[str]) -> None:
    word_counts = [len(response.split()) for response in responses]
    average_word_count = sum(word_counts) / len(word_counts) if word_counts else 0

    lexical_diversities = [calculate_lexical_diversity(response) for response in responses]
    average_lexical_diversity = sum(lexical_diversities) / len(lexical_diversities) if lexical_diversities else 0

    print(f"\nAnalysis for {name}:")
    print(f"Average Word Count: {average_word_count:.2f}")
    print(f"Average Lexical Diversity (TTR): {average_lexical_diversity:.2f}")

    for idx, response in enumerate(responses):
        pos_dist = calculate_pos_distribution(response)
        ner_dist = calculate_ner_distribution(response)
        sentence_lengths = calculate_sentence_lengths(response)
        readability_score = calculate_readability_score(response)

        print(f"Completion {idx + 1}: Word Count = {word_counts[idx]}, Lexical Diversity = {lexical_diversities[idx]:.2f}")
        print(f" POS Distribution: {pos_dist}")
        print(f" NER Distribution: {ner_dist}")
        print(f" Average Sentence Length: {sum(sentence_lengths)/len(sentence_lengths) if sentence_lengths else 0:.2f} words")
        print(f" Readability Score: {readability_score:.2f}\n")
    print("\n")

base_content = """
Once upon a time in a small, picturesque village, there were two friends named Emily and Jack. Emily, an aspiring artist, had always been fascinated by the vibrant colors and textures of nature. She spent her days capturing the essence of the village's beauty through her paintings. Jack, on the other hand, was a young inventor with a passion for creating gadgets that could help the villagers in their daily lives. Their friendship was a harmonious blend of art and science, each inspiring the other with their unique talents.

One sunny afternoon, Emily and Jack decided to collaborate on a project that would combine their skills. Emily had an idea to create a series of murals around the village, depicting its history and culture. Jack, excited by the idea, suggested integrating interactive elements into the murals using his inventions. Together, they planned to bring the village's stories to life, making them accessible and engaging for everyone, especially the children.

Their project was met with enthusiasm and support from the villagers. Emily's vibrant murals, filled with colors and intricate details, adorned the walls of the village, while Jack's inventions added motion and sound to the scenes. The murals became a sensation, attracting visitors from nearby towns. The collaboration between Emily and Jack not only beautified the village but also strengthened the bond within the community, as they all came together to celebrate their shared history and culture.
"""

names = [None, "Ernest Hemingway", "Joan Didion", "William Shakespeare", "Jeff Spicoli", "Hermione Granger"]

def process_response(response):
    # Check if response is only whitespace (including newlines)
    if response.strip():
        return response
    else:
        # Return None if the response is only whitespace
        return None

responses = {}
control_responses = []  # Separate list for control group responses

for name in names:
    print(f"\nGenerating responses for: {'Control Group' if name is None else name}")
    prompt = format_instructions(name, base_content)
    print(f"> Input:\n{prompt}\n")
    responses[name] = []
    for i in range(3):
        while True:
            payload = {
                "inputs": prompt,
                "parameters": {"max_new_tokens": 1000, "do_sample": True, "temperature": 0.2}
            }
            response = query_endpoint(payload)
            if response and isinstance(response, list) and isinstance(response[0], dict):
                generated_text = response[0].get('generated_text', '')
                processed_text = process_response(generated_text)
                if processed_text is not None:
                    responses[name].append(processed_text)
                    if name is None:
                        control_responses.append(processed_text)  # Add to control group list
                    print(f"Output {i + 1}:\n{processed_text}\n")
                    break
                else:
                    print("Response is only whitespace. Triggering a new inference.")
            else:
                print(f"Output {i + 1}: No valid response\n")
                break

# Analysis
for name, response_texts in responses.items():
    analyze_responses(name, response_texts)

# responses now contains three generated texts for each style, with analysis

# Aggregate POS counts for the control group
control_pos_counts = Counter()
for response in control_responses:
    control_pos_counts.update(calculate_pos_distribution(response))

# Function to perform Chi-Squared test
def perform_chi_squared_test(observed_counts, expected_counts):
    # Align observed and expected counts, filling missing values with zeros
    total_expected = sum(expected_counts.values())
    aligned_observed = [observed_counts.get(tag, 0) for tag in expected_counts]
    aligned_expected = [expected_counts[tag] for tag in expected_counts]

    # Perform Chi-Squared test
    chi_squared_stat, p_value = chisquare(aligned_observed, f_exp=aligned_expected)
    return chi_squared_stat, p_value

# Perform test for each name
for name, texts in responses.items():
    for text in texts:
        observed_counts = calculate_pos_distribution(text)
        chi_squared_stat, p_value = perform_chi_squared_test(observed_counts, control_pos_counts)
        print(f"Chi-Squared Test for {name}: Statistic = {chi_squared_stat}, P-Value = {p_value}")


[nltk_data] Downloading package punkt to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Generating responses for: Control Group
> Input:

Summarize the following content:

Once upon a time in a small, picturesque village, there were two friends named Emily and Jack. Emily, an aspiring artist, had always been fascinated by the vibrant colors and textures of nature. She spent her days capturing the essence of the village's beauty through her paintings. Jack, on the other hand, was a young inventor with a passion for creating gadgets that could help the villagers in their daily lives. Their friendship was a harmonious blend of art and science, each inspiring the other with their unique talents.

One sunny afternoon, Emily and Jack decided to collaborate on a project that would combine their skills. Emily had an idea to create a series of murals around the village, depicting its history and culture. Jack, excited by the idea, suggested integrating interactive elements into the murals using his inventions. Together, they planned to bring the village's stories to life, making 

ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
2.0000000000000004