In [None]:
import pandas as pd
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "Qwen/Qwen3-4B"          # or any 8-B variant, e.g. -AWQ, -FP8, GGUF etc.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model     = AutoModelForCausalLM.from_pretrained(
               model_id,
               device_map="auto",           # GPU if you have one
               torch_dtype="auto"           # fp16/bf16 automatically
           )
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)

### Text Generation

In [71]:
import os
import re

nlp_results = pd.DataFrame(columns=['Author', 'Title', 'y_train', 'y_pred', 'y_test'])

def remove_prefix(original, full):
    if full.startswith(original):
        return full[len(original):].lstrip()  # Remove leading spaces/newlines
    else:
        raise ValueError("The first text is not a prefix of the second.")

def split_text_by_sentences(text, word_limit):
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    train_text = ''
    word_count = 0

    for sentence in sentences:
        sentence_words = sentence.split()
        if word_count + len(sentence_words) > word_limit:
            break
        train_text += sentence + ' '
        word_count += len(sentence_words)

    # Now get the next 500 words after the train_text
    remaining_text = text[len(train_text):].strip()
    next_words = ' '.join(remaining_text.split()[:500])

    return train_text.strip(), next_words.strip()
base_path = "./"
authors = ["Mark_Twain", "Paul Graham", "Paul Krugman"]

for author in authors:
    folder_path = os.path.join(base_path, author)
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-16') as f:
                text = f.read()
                train_text, test_text = split_text_by_sentences(text, 500)
                # Do whatever you want with train_text and test_text
                print(f'Processed {filename} from {author}')
                pred = generator(
                    train_text,
                    # generation control
                    max_new_tokens=500,      # ← exactly the number you asked for
                    temperature=0.8,         # creativity (0–2); lower is safer
                    top_p=0.9,               # nucleus sampling
                    do_sample=True,          # sampling instead of greedy
                    eos_token_id=tokenizer.eos_token_id,   # stop at end-of-text if it appears
                )
                y_pred = pred[0]["generated_text"]
                result = remove_prefix(train_text, y_pred)
                new_row = pd.DataFrame([{'Author': author, 'Title': filename, 'y_pred': result, 'y_train': train_text, 'y_test': test_text}])
                nlp_results = pd.concat([nlp_results, new_row], ignore_index=True)

                print(nlp_results.shape)
                print(nlp_results)

Processed THE MAN THAT CORRUPTED HADLEYBURG.txt from Mark_Twain
(1, 5)
       Author                                  Title  \
0  Mark_Twain  THE MAN THAT CORRUPTED HADLEYBURG.txt   

                                             y_train  \
0  THE MAN THAT CORRUPTED HADLEYBURG\n\nAND OTHER...   

                                              y_pred  \
0  He made a\ntrip to Hadleyburg and found it the...   

                                              y_test  
0  joy. He began to form a plan at once, saying t...  
Processed To the Person Sitting in Darkness.txt from Mark_Twain
(2, 5)
       Author                                  Title  \
0  Mark_Twain  THE MAN THAT CORRUPTED HADLEYBURG.txt   
1  Mark_Twain  To the Person Sitting in Darkness.txt   

                                             y_train  \
0  THE MAN THAT CORRUPTED HADLEYBURG\n\nAND OTHER...   
1  TO THE PERSON SITTING IN DARKNESS\n\n\n       ...   

                                              y_pred  \
0  He made a\nt

In [17]:
import os
import re

nlp_results = pd.DataFrame(columns=['Author', 'Title', 'y_train', 'y_pred', 'y_test'])

def remove_prefix(original, full):
    if full.startswith(original):
        return full[len(original):].lstrip()  # Remove leading spaces/newlines
    else:
        raise ValueError("The first text is not a prefix of the second.")

def split_text_by_sentences(text, word_limit):
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    train_text = ''
    word_count = 0

    for sentence in sentences:
        sentence_words = sentence.split()
        if word_count + len(sentence_words) > word_limit:
            break
        train_text += sentence + ' '
        word_count += len(sentence_words)

    # Now get the next 500 words after the train_text
    remaining_text = text[len(train_text):].strip()
    next_words = ' '.join(remaining_text.split()[:500])

    return train_text.strip(), next_words.strip()
base_path = "./"
authors = ["Robin Hanson"]

for author in authors:
    folder_path = os.path.join(base_path, author)
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-16') as f:
                text = f.read()
                train_text, test_text = split_text_by_sentences(text, 500)
                # Do whatever you want with train_text and test_text
                print(f'Processed {filename} from {author}')
                pred = generator(
                        train_text,
                        # generation control
                        max_new_tokens=400,      # ← exactly the number you asked for
                        temperature=0.8,         # creativity (0–2); lower is safer
                        top_p=0.9,               # nucleus sampling
                        do_sample=True,          # sampling instead of greedy
                        eos_token_id=tokenizer.eos_token_id,   # stop at end-of-text if it appears
                    )
                y_pred = pred[0]["generated_text"]
                result = remove_prefix(train_text, y_pred)
                new_row = pd.DataFrame([{'Author': author, 'Title': filename, 'y_pred': result, 'y_train': train_text, 'y_test': test_text}])
                nlp_results = pd.concat([nlp_results, new_row], ignore_index=True)
    
                print(nlp_results.shape)
                print(nlp_results)

Processed Elite-Only Financial Markets.txt from Robin Hanson
(1, 5)
         Author                             Title  \
0  Robin Hanson  Elite-Only Financial Markets.txt   

                                             y_train  \
0  Prediction markets are financial markets, but ...   

                                              y_pred  \
0  But the way they are structured, they have no ...   

                                              y_test  
0  More accurate stock prices better achieve this...  
Processed Thinkers Must Be Heretics.txt from Robin Hanson
(2, 5)
         Author                             Title  \
0  Robin Hanson  Elite-Only Financial Markets.txt   
1  Robin Hanson     Thinkers Must Be Heretics.txt   

                                             y_train  \
0  Prediction markets are financial markets, but ...   
1  When we form opinions on topics, the depth of ...   

                                              y_pred  \
0  But the way they are structured, the

In [27]:
result = pd.concat([df, hanson], axis=0)
result

Unnamed: 0,Author,Title,y_train,y_pred,y_test
0,Mark_Twain,THE MAN THAT CORRUPTED HADLEYBURG.txt,THE MAN THAT CORRUPTED HADLEYBURG\n\nAND OTHER...,He made a\ntrip to Hadleyburg and found it the...,"joy. He began to form a plan at once, saying t..."
1,Mark_Twain,To the Person Sitting in Darkness.txt,TO THE PERSON SITTING IN DARKNESS\n\n\n ...,And\nthere is more of it. The People who Sit i...,Game. It shows that these new players of it ar...
2,Mark_Twain,Life on the Mississippi.txt,Produced by David Widger. Earliest PG text edi...,A New Plan.--A Little Tact.--The Mayor is\nHir...,rd. CHAPTER XXIII. Old French Settlements.--We...
3,Mark_Twain,KING LEOPOLD’S SOLILOQUY.txt,KING LEOPOLD’S SOLILOQUY\n\n\n\n\n ...,"They\nare saying that I have a secret, a secre...",hen uttered against a king. Miscreants—they ar...
4,Mark_Twain,A Horse's Tale.txt,A Horse’s Tale\n\n\n ...,* * * * *\n\n\nA Horse’s Tale\n\nCHAP. I.\nSOL...,rian will correct these defects.” The cats in ...
...,...,...,...,...,...
137,Robin Hanson,Prestige in US Today.txt,Lauren A. Rivera’s Pedigree: How Elite Student...,"So, this is a system that is self-reinforcing,...",It seems that while these firms do sell concre...
138,Robin Hanson,"AI Risk, Again.txt",Large language models like ChatGPT have recent...,(The future world could be a world of many AIs...,"Of course the owners of such future ventures, ..."
139,Robin Hanson,New Tax Career Agent Test.txt,"If that taxpayer approved, the taxes that he o...",If the worker who gets the TCA has a higher ex...,Bids should give direct estimates of worker va...
140,Robin Hanson,A Perfect Storm of Inflexibility.txt,Most biological species specialize for particu...,"But the problem is that, in peace time, this m...","In addition to these two considerations, longe..."


In [49]:
df = result.copy()
def word_count(text):
    return len(str(text).split())

# Calculate word count for each column
df['y_train_word_count'] = df['y_train'].apply(word_count)
df['y_test_word_count'] = df['y_test'].apply(word_count)

# Sum the word counts
df['total_word_count'] = df['y_train_word_count'] + df['y_test_word_count']

# Filter the DataFrame
filtered_df = df[df['total_word_count'] >= 900]
filtered_df

Unnamed: 0,Author,Title,y_train,y_pred,y_test,y_train_word_count,y_test_word_count,total_word_count
0,Mark_Twain,THE MAN THAT CORRUPTED HADLEYBURG.txt,THE MAN THAT CORRUPTED HADLEYBURG\n\nAND OTHER...,He made a\ntrip to Hadleyburg and found it the...,"joy. He began to form a plan at once, saying t...",474,500,974
1,Mark_Twain,To the Person Sitting in Darkness.txt,TO THE PERSON SITTING IN DARKNESS\n\n\n ...,And\nthere is more of it. The People who Sit i...,Game. It shows that these new players of it ar...,500,500,1000
2,Mark_Twain,Life on the Mississippi.txt,Produced by David Widger. Earliest PG text edi...,A New Plan.--A Little Tact.--The Mayor is\nHir...,rd. CHAPTER XXIII. Old French Settlements.--We...,500,500,1000
4,Mark_Twain,A Horse's Tale.txt,A Horse’s Tale\n\n\n ...,* * * * *\n\n\nA Horse’s Tale\n\nCHAP. I.\nSOL...,rian will correct these defects.” The cats in ...,496,500,996
5,Mark_Twain,1601 Conversation as it was by the Social Fire...,1601\n\nConversation as it was by the Social F...,The first edition of it was published\nin 1880...,601. The piece is a supposititious conversatio...,470,500,970
...,...,...,...,...,...,...,...,...
137,Robin Hanson,Prestige in US Today.txt,Lauren A. Rivera’s Pedigree: How Elite Student...,"So, this is a system that is self-reinforcing,...",It seems that while these firms do sell concre...,487,500,987
138,Robin Hanson,"AI Risk, Again.txt",Large language models like ChatGPT have recent...,(The future world could be a world of many AIs...,"Of course the owners of such future ventures, ...",481,500,981
139,Robin Hanson,New Tax Career Agent Test.txt,"If that taxpayer approved, the taxes that he o...",If the worker who gets the TCA has a higher ex...,Bids should give direct estimates of worker va...,482,500,982
140,Robin Hanson,A Perfect Storm of Inflexibility.txt,Most biological species specialize for particu...,"But the problem is that, in peace time, this m...","In addition to these two considerations, longe...",479,500,979


In [51]:
filtered_df.to_csv("final_dataset.csv", index=None)

In [75]:
nlp_results.to_csv("txt_results.csv", index=None)

### N-grams

In [59]:
df = pd.read_csv("final_dataset.csv")

In [63]:
df.head()

Unnamed: 0,Author,Title,y_train,y_pred,y_test,y_train_word_count,y_test_word_count,total_word_count
0,Mark_Twain,THE MAN THAT CORRUPTED HADLEYBURG.txt,THE MAN THAT CORRUPTED HADLEYBURG\n\nAND OTHER...,He made a\ntrip to Hadleyburg and found it the...,"joy. He began to form a plan at once, saying t...",474,500,974
1,Mark_Twain,To the Person Sitting in Darkness.txt,TO THE PERSON SITTING IN DARKNESS\n\n\n ...,And\nthere is more of it. The People who Sit i...,Game. It shows that these new players of it ar...,500,500,1000
2,Mark_Twain,Life on the Mississippi.txt,Produced by David Widger. Earliest PG text edi...,A New Plan.--A Little Tact.--The Mayor is\nHir...,rd. CHAPTER XXIII. Old French Settlements.--We...,500,500,1000
3,Mark_Twain,A Horse's Tale.txt,A Horse’s Tale\n\n\n ...,* * * * *\n\n\nA Horse’s Tale\n\nCHAP. I.\nSOL...,rian will correct these defects.” The cats in ...,496,500,996
4,Mark_Twain,1601 Conversation as it was by the Social Fire...,1601\n\nConversation as it was by the Social F...,The first edition of it was published\nin 1880...,601. The piece is a supposititious conversatio...,470,500,970


In [71]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

# Function to get n-grams as sets
def extract_ngrams(text, n):
    tokens = word_tokenize(text.lower())
    return set(ngrams(tokens, n))

# Function to compare n-grams
def count_common_ngrams(row, n, col1, col2):
    pred_ngrams = extract_ngrams(row[col1], n)
    test_ngrams = extract_ngrams(row[col2], n)
    return len(pred_ngrams & test_ngrams)

# Apply on DataFrame
df['common_unigrams_y_pred_test'] = df.apply(lambda row: count_common_ngrams(row, 1, "y_pred", "y_test"), axis=1)
df['common_bigrams_y_pred_test'] = df.apply(lambda row: count_common_ngrams(row, 2, "y_pred", "y_test"), axis=1)
df['common_trigrams_y_pred_test'] = df.apply(lambda row: count_common_ngrams(row, 3, "y_pred", "y_test"), axis=1)

df['common_unigrams_y_pred_train'] = df.apply(lambda row: count_common_ngrams(row, 1, "y_pred", "y_train"), axis=1)
df['common_bigrams_y_pred_train'] = df.apply(lambda row: count_common_ngrams(row, 2, "y_pred", "y_train"), axis=1)
df['common_trigrams_y_pred_train'] = df.apply(lambda row: count_common_ngrams(row, 3, "y_pred", "y_train"), axis=1)

df['common_unigrams_y_test_train'] = df.apply(lambda row: count_common_ngrams(row, 1, "y_test", "y_train"), axis=1)
df['common_bigrams_y_test_train'] = df.apply(lambda row: count_common_ngrams(row, 2, "y_test", "y_train"), axis=1)
df['common_trigrams_y_test_train'] = df.apply(lambda row: count_common_ngrams(row, 3, "y_test", "y_train"), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Konstantinos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [77]:
df.drop(columns=["y_train_word_count","y_test_word_count", "total_word_count"], inplace=True)

In natural language processing (NLP), n-grams are contiguous sequences of n items (typically words) extracted from a given text. They are widely used for tasks involving text comparison, language modeling, and evaluation of generated content.

Unigrams are single words. Comparing unigrams between predicted and reference texts helps evaluate basic word-level agreement, which is useful for assessing vocabulary overlap.

Bigrams are two-word sequences. Matching bigrams capture basic word ordering and local syntactic structure.

Trigrams, three-word sequences, provide deeper insight into fluency and phrase-level coherence.

In this project, we count the number of shared unigrams, bigrams, and trigrams between predicted texts (y_pred) and ground-truth references (y_test) for each row in the dataset. This allows us to quantify not just the lexical similarity but also how well the predicted text preserves short- and medium-range word sequences from the original.

In [90]:
columns_to_average = [
    'common_unigrams_y_pred_test',
    'common_unigrams_y_pred_train',
    'common_unigrams_y_test_train',
    'common_bigrams_y_pred_test',
    'common_bigrams_y_pred_train',
    'common_bigrams_y_test_train',
    'common_trigrams_y_pred_test',
    'common_trigrams_y_pred_train',
    'common_trigrams_y_test_train'
]

# Print mean of each column
for col in columns_to_average:
    mean_val = df[col].mean()
    print(f"Mean of {col}: {mean_val:.2f}")


Mean of common_unigrams_y_pred_test: 51.43
Mean of common_unigrams_y_pred_train: 57.65
Mean of common_unigrams_y_test_train: 92.05
Mean of common_bigrams_y_pred_test: 29.80
Mean of common_bigrams_y_pred_train: 38.87
Mean of common_bigrams_y_test_train: 47.82
Mean of common_trigrams_y_pred_test: 6.61
Mean of common_trigrams_y_pred_train: 12.70
Mean of common_trigrams_y_test_train: 11.32


The comparison of shared n-grams among the predicted texts (y_pred), the ground-truth test texts (y_test), and the training texts (y_train) provides insight into how well the model captures both the reference and learned linguistic patterns.

y_pred vs y_test:
The mean common n-gram counts between y_pred and y_test decrease as the n-gram size increases (51.43 unigrams → 29.80 bigrams → 6.61 trigrams). This pattern is expected, as longer n-grams are more sensitive to word order and exact phrasing. The relatively low trigram overlap indicates that while the model captures many of the correct words, it may struggle with producing fluent or precise phrasing.

y_pred vs y_train:
The model shows higher overlap with the training data (e.g., 57.65 unigrams, 38.87 bigrams, 12.70 trigrams), suggesting a degree of memorization or strong influence from the training corpus. This may be especially noticeable if the training data contains repeated or template-like structures.

y_test vs y_train:
The highest overlap is observed here, particularly at the unigram level (92.05), implying that the test set shares a significant amount of vocabulary and phrase patterns with the training set. This lexical similarity could contribute to the model’s ability to generalize, but also raises considerations about potential data leakage or insufficient domain separation.

Overall, the decreasing trend across n-gram sizes is typical and reflects increasing strictness of the match. The relatively low trigram overlap with y_test underscores the difficulty in generating coherent and exact sequences, while the higher similarity with y_train highlights the model’s reliance on seen data.

🔹 Unigrams:
Highest overlap: y_test vs y_train (92.05), indicating strong vocabulary similarity between the test and training sets.

The model (y_pred) shares more unigrams with the training set (57.65) than with the test set (51.43), suggesting it draws heavily from the training vocabulary.

Overall, the unigram results show the model is reasonably effective at using the correct words, but not as strongly as the overlap found within the data itself.

🔹 Bigrams:
Bigram overlap follows a similar trend but decreases across all pairs due to increased word-order sensitivity.

y_pred vs y_train (38.87) > y_test vs y_train (47.82) > y_pred vs y_test (29.80), implying the model better captures short local sequences from training than it reproduces the exact phrasing in the test set.

🔹 Trigrams:
The sharpest drop is seen here, with the lowest value for y_pred vs y_test (6.61).

y_pred vs y_train (12.70) is still the highest, indicating the model is most confident in reproducing learned multi-word phrases rather than generating new coherent ones.

y_test vs y_train (11.32) being higher than y_pred vs y_test confirms that the test set contains many longer sequences also found in training, but the model struggles to regenerate them precisely.

✅ Summary
The model performs best when replicating content from the training data, particularly in longer n-grams.

Exact matches with the test set degrade as n-gram size increases, highlighting difficulties with fluency and precise word ordering.

The high overlap between y_test and y_train across all n-gram levels suggests that the test set is not fully independent, which may affect evaluation and generalization insights.

### All Metrics

In [27]:
import pandas as pd
import numpy as np
import re
import string
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer
from nltk.util import ngrams
from nltk.corpus import stopwords
import spacy.cli
import nltk
nltk.download("stopwords")
spacy.cli.download("en_core_web_sm")

df = pd.read_csv("txt_results.csv")
df["y_pred"] = df["y_pred"].astype(str)
df["y_test"] = df["y_test"].astype(str)

nlp = spacy.load("en_core_web_sm")

stop_words = set(stopwords.words('english'))
# transformer = SentenceTransformer('all-MiniLM-L6-v2')

def get_ngrams(text, n):
    tokens = text.split()
    return set(ngrams(tokens, n))

def pos_similarity(doc1, doc2):
    pos1 = [token.pos_ for token in doc1]
    pos2 = [token.pos_ for token in doc2]
    return len(set(pos1) & set(pos2)) / max(len(set(pos1 + pos2)), 1)

def tense_similarity(doc1, doc2, tense):
    tags = {
        "Past": ["VBD", "VBN"],
        "Present": ["VBP", "VBZ", "VBG"],
        "Future": ["MD"]
    }
    doc1_tags = [token.tag_ for token in doc1 if token.tag_ in tags[tense]]
    doc2_tags = [token.tag_ for token in doc2 if token.tag_ in tags[tense]]
    return min(len(doc1_tags), len(doc2_tags)) / max(len(doc1_tags + doc2_tags), 1)

def stopword_similarity(doc1, doc2):
    sw1 = [token.text for token in doc1 if token.text.lower() in stop_words]
    sw2 = [token.text for token in doc2 if token.text.lower() in stop_words]
    return len(set(sw1) & set(sw2)) / max(len(set(sw1 + sw2)), 1)

def punctuation_count(text, mark):
    return text.count(mark)

def tfidf_score(s1, s2):
    vect = TfidfVectorizer().fit([s1, s2])
    tfidf = vect.transform([s1, s2])
    return cosine_similarity(tfidf[0], tfidf[1])[0][0]

# def transformer_similarity(s1, s2):
#     emb = transformer.encode([s1, s2])
#     return cosine_similarity([emb[0]], [emb[1]])[0][0]

def compare_texts(row):
    pred, truth = row['y_pred'], row['y_test']
    doc1, doc2 = nlp(pred), nlp(truth)
    
    return pd.Series({
        "Text Similarity": doc1.similarity(doc2),
        "POS Similarity": pos_similarity(doc1, doc2),
        "TfIdf Score": tfidf_score(pred, truth),
        "Bigram Overlap": len(get_ngrams(pred, 2) & get_ngrams(truth, 2)),
        "Trigram Overlap": len(get_ngrams(pred, 3) & get_ngrams(truth, 3)),
        # "Transformer Similarity": transformer_similarity(pred, truth),
        "Sentence Length Similarity": min(len(pred), len(truth)) / max(len(pred), len(truth)),
        "Sentence Length Variance": abs(len(pred) - len(truth)),
        "Verb Tense Similarity (Past)": tense_similarity(doc1, doc2, "Past"),
        "Verb Tense Similarity (Present)": tense_similarity(doc1, doc2, "Present"),
        "Verb Tense Similarity (Future)": tense_similarity(doc1, doc2, "Future"),
        "Verb Tense Analysis Total": sum([
            tense_similarity(doc1, doc2, "Past"),
            tense_similarity(doc1, doc2, "Present"),
            tense_similarity(doc1, doc2, "Future")
        ]),
        "Stop Words Usage Similarity": stopword_similarity(doc1, doc2),
        "Question Mark Count": int('?' in pred) == int('?' in truth),
        "Punctuation Count": sum(pred.count(p) for p in string.punctuation),
        "Comma Count": punctuation_count(pred, ','),
        "Em Dash Count": punctuation_count(pred, '—'),
    })

# Assuming df has 'y_pred' and 'y_test'
import nltk
nltk.download('stopwords')
nltk.download('punkt')

results = df.apply(compare_texts, axis=1)
final_df = pd.concat([df, results], axis=1)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Konstantinos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Konstantinos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Konstantinos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  "Text Similarity": doc1.similarity(doc2),
