In [44]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr
from sklearn.metrics import jaccard_score
from collections import Counter
from scipy.spatial.distance import dice
import nltk
import re
nltk.download('stopwords')   # For stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [45]:
# Synopses
synopsis1 = "Based on real events, this film follows the investigation of the Zodiac killer, a serial murderer who taunted police with cryptic messages. The story centers on the efforts of a cartoonist, a journalist, and detectives to uncover the killer's identity."
synopsis2 = "Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his modus operandi. The film explores themes of morality, justice, and the human condition through its intense and dark narrative."
synopsis3 = "A sequel to the original 'Blade Runner,' this film is set in a dystopian future where bioengineered humans, known as replicants, are used for various purposes. A young blade runner, K, uncovers a long-buried secret that leads him on a quest to find former blade runner Rick Deckard, who has been missing for thirty years."

In [46]:
# Preprocessing
synopses = [synopsis1, synopsis2, synopsis3]

# Load stopwords
stop_words = set(stopwords.words('english'))

# Initializers for Stemmer and Lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_comment(text):
    if pd.isna(text):
        return text
    # Lowercase normalization
    text = text.lower()
    # Remove URLstk
    # Remove punctuation and non-ASCII
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and perform /lemmatization
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Rejoin words into one string
    return ' '.join(cleaned_tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tariq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
synopses

["Based on real events, this film follows the investigation of the Zodiac killer, a serial murderer who taunted police with cryptic messages. The story centers on the efforts of a cartoonist, a journalist, and detectives to uncover the killer's identity.",
 'Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his modus operandi. The film explores themes of morality, justice, and the human condition through its intense and dark narrative.',
 "A sequel to the original 'Blade Runner,' this film is set in a dystopian future where bioengineered humans, known as replicants, are used for various purposes. A young blade runner, K, uncovers a long-buried secret that leads him on a quest to find former blade runner Rick Deckard, who has been missing for thirty years."]

In [48]:
synopses = [clean_comment(synopsis) for synopsis in synopses]

In [49]:
synopses

['based real event film follows investigation zodiac killer serial murderer taunted police cryptic message story center effort cartoonist journalist detective uncover killer identity',
 'two detective rookie veteran hunt serial killer us seven deadly sin modus operandi film explores theme morality justice human condition intense dark narrative',
 'sequel original blade runner film set dystopian future bioengineered human known replicants used various purpose young blade runner k uncovers longburied secret lead quest find former blade runner rick deckard missing thirty year']

In [50]:
# Vectorize the synopses
vectorizer = CountVectorizer().fit_transform(synopses)
vectors = vectorizer.toarray()

In [51]:
# Cosine Similarity
cosine_sim = cosine_similarity(vectors)
cosine_df = pd.DataFrame(cosine_sim, index=["Zodiac", "Se7en", "Blade Runner 2049"], columns=["Zodiac", "Se7en", "Blade Runner 2049"])

In [52]:
# Euclidean Distance
euclidean_dist = euclidean(vectors[0], vectors[1]), euclidean(vectors[0], vectors[2]), euclidean(vectors[1], vectors[2])

In [53]:
# Pearson Correlation
pearson_corr_1_2, _ = pearsonr(vectors[0], vectors[1])
pearson_corr_1_3, _ = pearsonr(vectors[0], vectors[2])
pearson_corr_2_3, _ = pearsonr(vectors[1], vectors[2])
pearson_corr = (pearson_corr_1_2, pearson_corr_1_3, pearson_corr_2_3)

In [54]:
# Dice Coefficient
dice_coeff_1_2 = 1 - dice(vectors[0], vectors[1])
dice_coeff_1_3 = 1 - dice(vectors[0], vectors[2])
dice_coeff_2_3 = 1 - dice(vectors[1], vectors[2])
dice_coeff = (dice_coeff_1_2, dice_coeff_1_3, dice_coeff_2_3)

In [57]:
# Jaccard Similarity
jaccard_sim_1_2 = jaccard_score(vectors[0], vectors[1], average='macro')
jaccard_sim_1_3 = jaccard_score(vectors[0], vectors[2], average='macro')
jaccard_sim_2_3 = jaccard_score(vectors[1], vectors[2], average='macro')
jaccard_sim = (jaccard_sim_1_2, jaccard_sim_1_3, jaccard_sim_2_3)

In [58]:
# Display results
results = {
    "Metric": ["Cosine Similarity", "Euclidean Distance", "Pearson Correlation", "Jaccard Similarity", "Dice Coefficient"],
    "Zodiac vs Se7en": [cosine_df.at["Zodiac", "Se7en"], euclidean_dist[0], pearson_corr[0], jaccard_sim[0], dice_coeff[0]],
    "Zodiac vs Blade Runner 2049": [cosine_df.at["Zodiac", "Blade Runner 2049"], euclidean_dist[1], pearson_corr[1], jaccard_sim[1], dice_coeff[1]],
    "Se7en vs Blade Runner 2049": [cosine_df.at["Se7en", "Blade Runner 2049"], euclidean_dist[2], pearson_corr[2], jaccard_sim[2], dice_coeff[2]]
}

results_df = pd.DataFrame(results)

results_df


Unnamed: 0,Metric,Zodiac vs Se7en,Zodiac vs Blade Runner 2049,Se7en vs Blade Runner 2049
0,Cosine Similarity,0.208514,0.030151,0.062869
1,Euclidean Distance,6.164414,8.185353,7.937254
2,Pearson Correlation,-0.180144,-0.450538,-0.431423
3,Jaccard Similarity,0.161956,0.073617,0.106492
4,Dice Coefficient,0.217391,0.036364,0.072727
