In [24]:
# Install the notebook's third-party dependencies.
# `%pip` installs into the environment of the running kernel, whereas `!pip` runs
# whichever `pip` happens to be first on PATH. nltk is listed because the notebook
# imports it (stopwords, word_tokenize, WordNetLemmatizer) but never installed it.
%pip install pandas scikit-learn sentence-transformers transformers nltk

Defaulting to user installation because normal site-packages is not writeable


In [25]:
# `%pip` (not `!pip`) so the package lands in the running kernel's environment.
# NOTE(review): presumably tf-keras is here so `transformers` can import its
# TensorFlow/Keras code paths -- confirm it is still required.
%pip install tf-keras

Defaulting to user installation because normal site-packages is not writeable


In [26]:
# Pinned gensim release used for the Dictionary / LdaModel steps below.
# `%pip` (not `!pip`) so the package lands in the running kernel's environment.
%pip install gensim==4.3.0

Defaulting to user installation because normal site-packages is not writeable


In [27]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util

In [28]:
# Read the comics catalogue from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Marvel_Comics.csv')

# Preview the first five rows as the cell's output
df.head(n=5)


Unnamed: 0,comic_name,active_years,issue_title,publish_date,issue_description,penciler,writer,cover_artist,Imprint,Format,Rating,Price
0,A Year of Marvels: April Infinite Comic (2016),(2016),A Year of Marvels: April Infinite Comic (2016) #1,"April 01, 2016",The Infinite Comic that will have everyone tal...,Yves Bigerel,Yves Bigerel,Jamal Campbell,Marvel Universe,Infinite Comic,Rated T+,Free
1,A Year of Marvels: August Infinite Comic (2016),(2016),A Year of Marvels: August Infinite Comic (2016...,"August 10, 2016","It’s August, and Nick Fury is just in time to ...",Jamal Campbell,"Chris Sims, Chad Bowers",,Marvel Universe,Infinite Comic,,Free
2,A Year of Marvels: February Infinite Comic (2016),(2016),A Year of Marvels: February Infinite Comic (20...,"February 10, 2016",Join us in a brand new Marvel comics adventure...,"Danilo S. Beyruth, M Mast",Ryan North,,Marvel Universe,Infinite Comic,Rated T+,Free
3,A Year of Marvels: July Infinite Comic (2016),(2016),A Year of Marvels: July Infinite Comic (2016) #1,"June 29, 2016",Celebrating the Fourth of July is complicated ...,Juanan Ramirez,Chuck Wendig,Jamal Campbell,Marvel Universe,Infinite Comic,,Free
4,A Year of Marvels: June Infinite Comic (2016),(2016),A Year of Marvels: June Infinite Comic (2016) #1,"June 15, 2016",Sam Alexander’s finding it hard to cope with t...,Diego Olortegui,Paul Allor,Jamal Campbell,Marvel Universe,Infinite Comic,,Free


In [29]:
import nltk

# Fetch the NLTK data used by preprocess(): tokenizer models, the stop-word list
# and WordNet (for the lemmatizer).
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up in recent NLTK
# releases; nltk.download() does not raise by default when a package cannot be
# fetched, so requesting it alongside the legacy 'punkt' package is harmless.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Preprocessing function
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Turn a piece of text into a list of lemmatized content tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that are
    not purely alphabetic or that are English stop words are dropped, and the
    remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The string to process. Any non-string value (for example the NaN
            that pandas uses for a missing cell) is treated as "no text".

    Returns:
        list[str]: The lemmatized tokens in their original order; ``[]`` for
        non-string input.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return []

    # The stop-word set and lemmatizer do not depend on `text`, so they are built
    # once and reused instead of being recreated for every row.
    stop_words, lemmatizer = _preprocess_resources()

    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        # The stop-word test is applied to the raw token, before lemmatization.
        if token.isalpha() and token not in stop_words
    ]

# Preprocess all descriptions: run preprocess() on every 'issue_description' value
# and store the returned token list in a new 'tokens' column (one list per row).
df['tokens'] = df['issue_description'].apply(preprocess)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joyal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joyal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joyal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
# Vocabulary built from the token lists of all issues
dictionary = corpora.Dictionary(df['tokens'])
# Bag-of-words representation of each issue, one entry per row of df
corpus = list(map(dictionary.doc2bow, df['tokens']))

In [31]:
# Fit a 5-topic LDA model on the bag-of-words corpus, making 15 passes over it.
# random_state pins the model's random initialisation so that the learned topics --
# and the df['topic'] labels derived from them -- are the same on every
# Restart & Run All instead of changing from run to run.
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

In [32]:
# Show every topic with a 1-based label followed by its weighted top words
for topic_id, topic_terms in lda_model.print_topics(-1):
    topic_number = topic_id + 1
    print(f"Topic {topic_number}: {topic_terms}")

def assign_topic(text):
    """Return the id of the most probable LDA topic for a list of tokens.

    Uses the notebook-level ``dictionary`` to build the bag-of-words vector and the
    notebook-level ``lda_model`` to score it; the topic with the highest
    probability wins (the first one listed, in case of a tie).
    """
    topic_distribution = lda_model.get_document_topics(dictionary.doc2bow(text))
    best_pair = max(topic_distribution, key=lambda pair: pair[1])
    return best_pair[0]

# Label every comic with its dominant topic id by applying assign_topic() to its
# token list; recommend_comics() later filters on this 'topic' column.
df['topic'] = df['tokens'].apply(assign_topic)

Topic 1: 0.136*"none" + 0.023*"punisher" + 0.018*"frank" + 0.018*"fury" + 0.014*"nick" + 0.012*"castle" + 0.008*"war" + 0.007*"soldier" + 0.007*"winter" + 0.006*"doc"
Topic 2: 0.008*"must" + 0.007*"find" + 0.006*"save" + 0.006*"battle" + 0.006*"new" + 0.005*"power" + 0.005*"take" + 0.005*"strange" + 0.005*"help" + 0.004*"man"
Topic 3: 0.015*"story" + 0.014*"marvel" + 0.011*"new" + 0.010*"comic" + 0.010*"series" + 0.009*"hulk" + 0.009*"issue" + 0.007*"artist" + 0.007*"first" + 0.007*"tale"
Topic 4: 0.013*"mutant" + 0.012*"one" + 0.012*"new" + 0.010*"wolverine" + 0.008*"part" + 0.007*"life" + 0.006*"team" + 0.006*"find" + 0.006*"world" + 0.006*"get"
Topic 5: 0.021*"marvel" + 0.019*"avenger" + 0.013*"thor" + 0.012*"hero" + 0.012*"universe" + 0.011*"man" + 0.009*"new" + 0.009*"four" + 0.009*"war" + 0.009*"world"


In [33]:
# Sentence-embedding model shared by every similarity lookup in this notebook
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed all issue descriptions in a single call and keep the result as a tensor
description_embeddings = model.encode(
    df['issue_description'].to_list(),
    convert_to_tensor=True,
)



In [34]:
import re

def extract_characters(text):
    """Return every capitalised word found in ``text``, in order of appearance.

    A match is a whole word made of one uppercase ASCII letter followed by one or
    more lowercase ASCII letters. This is a plain capitalisation heuristic, so it
    also picks up ordinary words that merely start a sentence.
    """
    capitalised_word = re.compile(r'\b[A-Z][a-z]+\b')
    return capitalised_word.findall(text)

# Candidate character names per issue: capitalised words from the title followed by
# those from the description (adding two Series of lists concatenates row by row).
df['characters'] = df['issue_title'].apply(extract_characters) + df['issue_description'].apply(extract_characters)
# De-duplicate while keeping first-seen order. dict.fromkeys() is used instead of
# set() because the iteration order of a set of strings changes between interpreter
# runs (string hash randomisation), which made the joined text -- and therefore the
# embeddings computed from it -- differ from one run to the next.
df['characters'] = df['characters'].apply(lambda names: ', '.join(dict.fromkeys(names)))

# Encode characters
character_embeddings = model.encode(df['characters'].tolist(), convert_to_tensor=True)

In [35]:
def recommend_comics(query, df, lda_model, dictionary, description_embeddings, character_embeddings, writer_column, model, top_n=5):
    """Recommend up to ``top_n`` comics for a free-text query.

    The query is routed through three strategies, in this order:

    1. Writer: if a value of ``df[writer_column]`` occurs inside the query
       (case-insensitive), return the rows whose writer field contains that name
       or the whole query.
    2. Character/team: if the query contains a capitalised word, rank all rows by
       cosine similarity between the query and ``character_embeddings``.
    3. Description: pick the query's most probable LDA topic, restrict to rows
       with that ``topic``, and rank them by cosine similarity between the query
       and their description embeddings.

    Args:
        query: Free-text query string.
        df: Comics table; must contain ``writer_column``, ``'topic'`` and
            ``'issue_description'``.
        lda_model: Trained topic model exposing ``get_document_topics(bow)``.
        dictionary: Vocabulary exposing ``doc2bow(tokens)``.
        description_embeddings: Precomputed description embeddings, assumed to be
            row-aligned with ``df`` (row i embeds ``df.iloc[i]``). When ``None``
            or of a different length than ``df``, descriptions are encoded on
            the fly instead.
        character_embeddings: Precomputed embeddings of the ``characters`` text,
            assumed to be row-aligned with ``df``.
        writer_column: Name of the column holding writer names.
        model: Sentence-embedding model exposing ``encode``.
        top_n: Maximum number of rows to return.

    Returns:
        pandas.DataFrame: The recommended rows of ``df``. When nothing can be
        recommended the result is empty but keeps the columns of ``df``, so
        callers can still select columns from it.
    """
    # Empty result that keeps df's columns (a bare pd.DataFrame() has none, which
    # makes column selection on the result raise KeyError).
    empty_result = df.iloc[0:0]

    query_lower = query.lower().strip()
    if not query_lower:
        return empty_result

    # --- 1. Writer lookup ---------------------------------------------------
    # Missing writers become '' so that no float NaN reaches the string tests.
    writer_lower = df[writer_column].fillna('').astype(str).str.lower().str.strip()
    matched_writers = {name for name in writer_lower.unique() if name and name in query_lower}
    if matched_writers:
        # Match on the writer names found in the query (so "comics by stan lee"
        # works) and on the whole query as well. Plain substring tests are used,
        # so characters such as '(' in the query are not interpreted as regex.
        needles = matched_writers | {query_lower}
        writer_mask = writer_lower.apply(lambda name: any(needle in name for needle in needles))
        return df[writer_mask].head(top_n)

    # --- 2. Character / team lookup -------------------------------------------
    if re.search(r'\b[A-Z][a-z]+\b', query):
        query_embedding = model.encode([query], convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(query_embedding, character_embeddings)[0]
        if len(cosine_sim) == 0:
            return empty_result
        # Convert to plain ints on the host before indexing the DataFrame.
        top_positions = cosine_sim.argsort(descending=True)[:top_n].cpu().tolist()
        return df.iloc[top_positions]

    # --- 3. Description lookup (default) ----------------------------------------
    query_tokens = preprocess(query)
    bow = dictionary.doc2bow(query_tokens)
    topics = lda_model.get_document_topics(bow)
    if not topics:
        return empty_result

    # Most probable topic for the query.
    topic = max(topics, key=lambda x: x[1])[0]

    # Positional indices of the rows that belong to that topic.
    topic_positions = (df['topic'] == topic).to_numpy().nonzero()[0].tolist()
    if not topic_positions:
        return empty_result

    if description_embeddings is not None and len(description_embeddings) == len(df):
        # Reuse the precomputed embeddings instead of re-encoding every
        # description in the topic on each call.
        filtered_embeddings = description_embeddings[topic_positions]
    else:
        filtered_embeddings = model.encode(
            df['issue_description'].iloc[topic_positions].tolist(), convert_to_tensor=True
        )
    query_embedding = model.encode([query], convert_to_tensor=True)

    cosine_sim = util.pytorch_cos_sim(query_embedding, filtered_embeddings)[0]
    if len(cosine_sim) == 0:
        return empty_result

    # Rank within the topic, then map back to positions in the full DataFrame.
    ranked = cosine_sim.argsort(descending=True)[:top_n].cpu().tolist()
    return df.iloc[[topic_positions[i] for i in ranked]]



In [36]:
# List the DataFrame's column labels; 'tokens', 'topic' and 'characters' are the
# columns this notebook added on top of the ones read from the CSV.
print(df.columns)

Index(['comic_name', 'active_years', 'issue_title', 'publish_date',
       'issue_description', 'penciler', 'writer', 'cover_artist', 'Imprint',
       'Format', 'Rating', 'Price', 'tokens', 'topic', 'characters'],
      dtype='object')


In [39]:
def normalize_query(query):
    """Lower-case a query and drop the standalone word "by" from it.

    Only "by" as a whole word followed by whitespace is removed (e.g.
    "by Stan Lee" -> "stan lee"). A plain ``replace('by ', '')`` would also eat
    the tail of words that merely end in "by" ("baby groot" -> "bagroot").

    Args:
        query: Raw query string.

    Returns:
        str: The lower-cased query without the word "by", stripped of leading
        and trailing whitespace.
    """
    # NOTE(review): recommend_comics() only takes its character/team branch when
    # the query contains a capitalised word, so passing this lower-cased result to
    # it means that branch is never taken -- confirm whether that is intended.
    query_lower = query.lower()
    return re.sub(r'\bby\s+', '', query_lower).strip()

# Example usage
# The query is free text: a description, a character name, or a writer name
query = "comedy"
normalized_query = normalize_query(query)

recommended_comics = recommend_comics(
    query=normalized_query,
    df=df,
    lda_model=lda_model,
    dictionary=dictionary,
    description_embeddings=description_embeddings,
    character_embeddings=character_embeddings,
    writer_column='writer',
    model=model,
)
print(recommended_comics[['comic_name', 'issue_title', 'issue_description']])

                                 comic_name                       issue_title  \
23413     Spider-Man/Deadpool (2016 - 2019)    Spider-Man/Deadpool (2016) #42   
13733    Journey Into Mystery (1952 - 1966)   Journey Into Mystery (1952) #11   
25409   Strikeforce: Morituri (1986 - 1989)   Strikeforce: Morituri (1986) #4   
14214  Lorna the Jungle Queen (1953 - 1954)  Lorna the Jungle Queen (1953) #4   
11455         Howard the Duck (1976 - 1979)        Howard the Duck (1976) #25   

                                       issue_description  
23413              Spider-Man and Deadpool…roadtripping!  
13733  A medium plays an 'undead' prank on another me...  
25409  Fame glorifies the Strikeforce when a dramatic...  
14214  Greg Knight takes a boorish stance on animal r...  
11455  Paul takes Howard out for a night on the town ...  


In [38]:
# Second example: same pipeline with a different free-text query
query = "action"
normalized_query = normalize_query(query)

recommended_comics = recommend_comics(
    query=normalized_query,
    df=df,
    lda_model=lda_model,
    dictionary=dictionary,
    description_embeddings=description_embeddings,
    character_embeddings=character_embeddings,
    writer_column='writer',
    model=model,
)
print(recommended_comics[['comic_name', 'issue_title', 'issue_description']])

                                  comic_name  \
9523            Fantastic Four (1998 - 2012)   
26744  The Invincible Iron Man (2004 - 2007)   
1766           Avengers & X-Men: Axis (2014)   
1767           Avengers & X-Men: Axis (2014)   
30033         Uncanny Avengers (2015 - 2017)   

                              issue_title  \
9523           Fantastic Four (1998) #504   
26744  The Invincible Iron Man (2004) #12   
1766     Avengers & X-Men: Axis (2014) #3   
1767     Avengers & X-Men: Axis (2014) #2   
30033         Uncanny Avengers (2015) #12   

                                       issue_description  
9523   AUTHORITATIVE ACTION PART 2 Nick Fury has been...  
26744  When a dead-man's switch is triggered, the awe...  
1766   ACT I: THE RED SUPREMACY. The heroes of the Ma...  
1767   ACT I: THE RED SUPREMACY. The heroes of the Ma...  
30033  All-out action to save an Avenger ...FROM ULTR...  


In [23]:
# Environment check: print the installed gensim package's metadata
# (version, install location, dependencies).
!pip show gensim

Name: gensim
Version: 4.3.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: C:\Users\joyal\AppData\Roaming\Python\Python311\site-packages
Requires: Cython, FuzzyTM, numpy, scipy, smart-open
Required-by: 


In [17]:
import os

import torch

# torch.save() does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('models', exist_ok=True)

# Persist both embedding tensors so they can be reloaded without re-encoding.
torch.save(character_embeddings, 'models/character_embeddings.pt')
torch.save(description_embeddings, 'models/description_embeddings.pt')


In [18]:
import os

# Make sure the output folder exists; the save calls below write into it and this
# cell may be run without the embeddings cell having created it first.
os.makedirs('models', exist_ok=True)

# Persist the vocabulary, the topic model and the sentence-embedding model, each
# through its own save() method.
dictionary.save('models/dictionary.pkl')
lda_model.save('models/lda_model.pkl')
model.save('models/sentence_transformer_model')
