In [84]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr
from sklearn.metrics import jaccard_score
from collections import Counter
from scipy.spatial.distance import dice
import nltk
import re
nltk.download('stopwords')   # For stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tariq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
# Synopses
synopsis1 = "Based on true events, this film follows the investigation of the Zodiac killer, a serial murderer who mocks the police with cryptic messages. The story focuses on a cartoonist, a journalist, and detectives as they strive to reveal the killer's identity."
synopsis2 = "Two detectives, a novice and a seasoned veteran, pursue a serial killer who uses the seven deadly sins as his signature. The narrative centers on their efforts to uncover the murderer's identity, exploring themes of morality, justice, and the human condition."
synopsis3 = "A sequel to the original 'Blade Runner,' this film is set in a dystopian future where bioengineered humans, known as replicants, are used for various purposes. A young blade runner, K, uncovers a long-buried secret that leads him on a quest to find former blade runner Rick Deckard, who has been missing for thirty years."

In [86]:
# Preprocessing
synopses = [synopsis1, synopsis2, synopsis3]

# Load stopwords
stop_words = set(stopwords.words('english'))

# Initializers for Stemmer and Lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_comment(text):
    if pd.isna(text):
        return text
    # Lowercase normalization
    text = text.lower()
    # Remove URLstk
    # Remove punctuation and non-ASCII
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and perform /lemmatization
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Rejoin words into one string
    return ' '.join(cleaned_tokens)

In [87]:
synopses

["Based on true events, this film follows the investigation of the Zodiac killer, a serial murderer who mocks the police with cryptic messages. The story focuses on a cartoonist, a journalist, and detectives as they strive to reveal the killer's identity.",
 "Two detectives, a novice and a seasoned veteran, pursue a serial killer who uses the seven deadly sins as his signature. The narrative centers on their efforts to uncover the murderer's identity, exploring themes of morality, justice, and the human condition.",
 "A sequel to the original 'Blade Runner,' this film is set in a dystopian future where bioengineered humans, known as replicants, are used for various purposes. A young blade runner, K, uncovers a long-buried secret that leads him on a quest to find former blade runner Rick Deckard, who has been missing for thirty years."]

In [88]:
synopses = [clean_comment(synopsis) for synopsis in synopses]

In [89]:
synopses

['based true event film follows investigation zodiac killer serial murderer mock police cryptic message story focus cartoonist journalist detective strive reveal killer identity',
 'two detective novice seasoned veteran pursue serial killer us seven deadly sin signature narrative center effort uncover murderer identity exploring theme morality justice human condition',
 'sequel original blade runner film set dystopian future bioengineered human known replicants used various purpose young blade runner k uncovers longburied secret lead quest find former blade runner rick deckard missing thirty year']

In [90]:
# Vectorize the synopses using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses).toarray()

In [91]:
# Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_df = pd.DataFrame(cosine_sim, index=["Zodiac", "Se7en", "Blade Runner 2049"], columns=["Zodiac", "Se7en", "Blade Runner 2049"])

In [92]:
# Euclidean Distance
euclidean_dist = (euclidean(tfidf_matrix[0], tfidf_matrix[1]), 
                  euclidean(tfidf_matrix[0], tfidf_matrix[2]), 
                  euclidean(tfidf_matrix[1], tfidf_matrix[2]))

In [93]:
# Pearson Correlation
pearson_corr_1_2, _ = pearsonr(tfidf_matrix[0], tfidf_matrix[1])
pearson_corr_1_3, _ = pearsonr(tfidf_matrix[0], tfidf_matrix[2])
pearson_corr_2_3, _ = pearsonr(tfidf_matrix[1], tfidf_matrix[2])
pearson_corr = (pearson_corr_1_2, pearson_corr_1_3, pearson_corr_2_3)

In [94]:
# Dice Coefficient
dice_coeff_1_2 = 1 - dice(tfidf_matrix[0], tfidf_matrix[1])
dice_coeff_1_3 = 1 - dice(tfidf_matrix[0], tfidf_matrix[2])
dice_coeff_2_3 = 1 - dice(tfidf_matrix[1], tfidf_matrix[2])
dice_coeff = (dice_coeff_1_2, dice_coeff_1_3, dice_coeff_2_3)

In [95]:
# Jaccard Similarity
jaccard_sim_1_2 = jaccard_score(tfidf_matrix[0] > 0, tfidf_matrix[1] > 0, average='macro')
jaccard_sim_1_3 = jaccard_score(tfidf_matrix[0] > 0, tfidf_matrix[2] > 0, average='macro')
jaccard_sim_2_3 = jaccard_score(tfidf_matrix[1] > 0, tfidf_matrix[2] > 0, average='macro')
jaccard_sim = (jaccard_sim_1_2, jaccard_sim_1_3, jaccard_sim_2_3)


In [96]:
# Display results
results = {
    "Metric": ["Cosine Similarity", "Euclidean Distance", "Pearson Correlation", "Jaccard Similarity", "Dice Coefficient"],
    "Zodiac vs Se7en": [cosine_df.at["Zodiac", "Se7en"], euclidean_dist[0], pearson_corr[0], jaccard_sim[0], dice_coeff[0]],
    "Zodiac vs Blade Runner 2049": [cosine_df.at["Zodiac", "Blade Runner 2049"], euclidean_dist[1], pearson_corr[1], jaccard_sim[1], dice_coeff[1]],
    "Se7en vs Blade Runner 2049": [cosine_df.at["Se7en", "Blade Runner 2049"], euclidean_dist[2], pearson_corr[2], jaccard_sim[2], dice_coeff[2]]
}

results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Metric,Zodiac vs Se7en,Zodiac vs Blade Runner 2049,Se7en vs Blade Runner 2049
0,Cosine Similarity,0.158982,0.01912,0.018574
1,Euclidean Distance,1.296933,1.400629,1.401018
2,Pearson Correlation,-0.271905,-0.457123,-0.511892
3,Jaccard Similarity,0.265873,0.151995,0.129018
4,Dice Coefficient,0.033116,0.004056,0.003803
