### Q1. Get most popular movies (20 points)

In [16]:
class TFIDF:
    """
    A class to compute TF-IDF scores for a given corpus.

    Methods:
        fit(corpus): Learns the IDF for each term in the corpus.
        transform(document): Computes the TF-IDF score for each term in a single document.
    """

    # Punctuation marks and digits that are treated as word separators.
    _SEPARATORS = ".,;:!?-$&@()[]'\"0123456789"
    # One translation table so tokenizing is a single pass over the text
    # instead of one str.replace() call per separator character.
    _SEPARATOR_TABLE = str.maketrans(_SEPARATORS, " " * len(_SEPARATORS))

    def __init__(self):
        # Maps term -> inverse document frequency; filled in by fit().
        self.idf = {}

    def _split(self, document: str) -> list:
        """
        Tokenize a document into a list of lowercase words.

        Punctuation and digits are replaced with spaces, the text is
        lowercased, and the result is split on whitespace.

        Parameters:
        - document: The text to tokenize. Non-string values are converted
          with str(); None and float NaN are treated as an empty document.

        Returns:
        - List of tokens in order of appearance (possibly empty).
        """
        # A missing value would otherwise be stringified into the bogus
        # tokens "none" / "nan". NaN is the only float not equal to itself.
        if document is None or (isinstance(document, float) and document != document):
            return []
        text = str(document).translate(self._SEPARATOR_TABLE)
        return text.lower().split()

    def _count_terms(self, words: list) -> dict:
        """
        Count how many times each term occurs in a list of words.

        Parameters:
        - words: List of tokens.

        Returns:
        - Dict mapping each distinct term to its number of occurrences.
        """
        term_count = {}
        for word in words:
            term_count[word] = term_count.get(word, 0) + 1
        return term_count

    def _compute_tf(self, term_count: dict, doc_len: int) -> dict:
        """
        Compute the term frequency (TF) of each term.

        Parameters:
        - term_count: Dict mapping term to its count in the document.
        - doc_len: Total number of tokens in the document.

        Returns:
        - Dict mapping each term to count / doc_len.
        """
        return {word: count / float(doc_len) for word, count in term_count.items()}

    def _compute_idf(self, doc_count: int, doc_freq: dict) -> None:
        """
        Compute the inverse document frequency (IDF) of each term.

        Stores log(doc_count / document frequency) in self.idf for every
        term in doc_freq.

        Parameters:
        - doc_count: Number of documents in the corpus.
        - doc_freq: Dict mapping term to the number of documents containing it.
        """
        # Imported here so the class works even when the surrounding
        # notebook cell has not imported math.
        import math

        for word, count in doc_freq.items():
            self.idf[word] = math.log(doc_count / float(count))

    def fit(self, corpus) -> None:
        """
        Fit the model to a corpus by learning the IDF of each term.

        Any IDF values learned by a previous call are discarded first.

        Parameters:
        - corpus: Sized iterable of documents (must support len()).
        """
        doc_count = len(corpus)
        doc_freq = {}

        for document in corpus:
            # A set, so a term counts once per document however often it repeats.
            for word in set(self._split(document)):
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # Start from a clean slate so terms from an earlier corpus do not linger.
        self.idf = {}
        self._compute_idf(doc_count, doc_freq)

    def transform(self, document: str) -> dict:
        """
        Transform a single document into TF-IDF scores.

        Parameters:
        - document: The text to score.

        Returns:
        - Dict mapping each term of the document to TF * IDF. Terms that
          were not seen during fit() get a score of 0.0.
        """
        words = self._split(document)
        tf = self._compute_tf(self._count_terms(words), len(words))
        return {word: tf_value * self.idf.get(word, 0.0) for word, tf_value in tf.items()}

### Q1. Get the 10 most popular movies (20 points)


In [17]:
import pandas as pd

# Read the TMDB movie table from disk
df = pd.read_csv('tmdb_5000_movies.csv')

# Order all movies from most to least popular
df_sorted = df.sort_values('popularity', ascending=False)

# Keep only the ten highest-ranked rows
top_10_movies = df_sorted.head(10)

# Print a two-column table: 1-based rank, then the left-aligned original title
print(f"{'Index': <6} {'Title': <50}")
rank = 0
for movie_title in top_10_movies['original_title']:
    rank += 1
    print(f"{rank: <6} {movie_title: <50}")


Index  Title                                             
1      Minions                                           
2      Interstellar                                      
3      Deadpool                                          
4      Guardians of the Galaxy                           
5      Mad Max: Fury Road                                
6      Jurassic World                                    
7      Pirates of the Caribbean: The Curse of the Black Pearl
8      Dawn of the Planet of the Apes                    
9      The Hunger Games: Mockingjay - Part 1             
10     Big Hero 6                                        


### Q2. Get keywords for each movie (20 points)


In [26]:
%run tf_idf.ipynb
import pandas as pd

# Load the dataset
df = pd.read_csv('tmdb_5000_movies.csv')

# Rank the movies by popularity inside this cell, so it does not rely on a
# `df_sorted` variable left behind by an earlier cell.
df_sorted = df.sort_values(by='popularity', ascending=False)

# Instantiate the TFIDF class
tfidf_model = TFIDF()

# Fit the model on all movie titles so the IDF reflects the whole dataset
tfidf_model.fit(df_sorted['original_title'])

# Display the top 3 keywords for each of the 10 most popular movies
for movie_number, movie_title in enumerate(df_sorted['original_title'].head(10), start=1):
    tfidf_scores = tfidf_model.transform(movie_title)
    # Terms ordered by descending TF-IDF score; titles with fewer than
    # three distinct terms simply yield fewer keywords.
    top_keywords = sorted(tfidf_scores, key=tfidf_scores.get, reverse=True)[:3]
    print(f"Movie Number: {movie_number}")
    print(f"Movie Title: {movie_title}")
    print(f"Top Keywords: {', '.join(top_keywords)}\n")


Movie Number: 1
Movie Title: Minions
Top Keywords: minions

Movie Number: 2
Movie Title: Interstellar
Top Keywords: interstellar

Movie Number: 3
Movie Title: Deadpool
Top Keywords: deadpool

Movie Number: 4
Movie Title: Guardians of the Galaxy
Top Keywords: guardians, galaxy, of

Movie Number: 5
Movie Title: Mad Max: Fury Road
Top Keywords: fury, max, mad

Movie Number: 6
Movie Title: Jurassic World
Top Keywords: jurassic, world

Movie Number: 7
Movie Title: Pirates of the Caribbean: The Curse of the Black Pearl
Top Keywords: pearl, caribbean, curse

Movie Number: 8
Movie Title: Dawn of the Planet of the Apes
Top Keywords: apes, planet, dawn

Movie Number: 9
Movie Title: The Hunger Games: Mockingjay - Part 1
Top Keywords: mockingjay, hunger, games

Movie Number: 10
Movie Title: Big Hero 6
Top Keywords: hero, big



### Q3. Movie recommendation (60 points)


In [30]:
class MovieRecommender:
    """
    Recommend movies whose overviews share top TF-IDF keywords with a query.

    Similarity between the query and a movie is the Jaccard index of their
    top `n_keywords` TF-IDF terms.
    """

    def __init__(self, titles, overviews, n_keywords):
        """
        Initialize the MovieRecommender class.

        Parameters:
        - titles: List of movie titles.
        - overviews: List of movie overviews/descriptions.
        - n_keywords: Number of keywords to consider for similarity calculation.
        """
        self.titles = titles
        self.overviews = overviews
        self.n_keywords = n_keywords
        self.tfidf_model = TFIDF()
        self.tfidf_model.fit(self.overviews)  # Fit TF-IDF model on movie overviews
        # Per-movie keyword sets are built on first use and remembered along
        # with the n_keywords value they were built for.
        self._keyword_sets = None
        self._keyword_sets_n = None

    def _top_keywords(self, text):
        """Return the set of the top `n_keywords` TF-IDF terms of `text`."""
        scores = self.tfidf_model.transform(text)
        return set(sorted(scores, key=scores.get, reverse=True)[:self.n_keywords])

    def _movie_keyword_sets(self):
        """Return one keyword set per overview, rebuilding if `n_keywords` changed."""
        if self._keyword_sets is None or self._keyword_sets_n != self.n_keywords:
            self._keyword_sets = [self._top_keywords(overview) for overview in self.overviews]
            self._keyword_sets_n = self.n_keywords
        return self._keyword_sets

    def recommend(self, input_overview, n):
        """
        Recommend movies based on input overview.

        Parameters:
        - input_overview: The input overview for which recommendations are needed.
        - n: Number of movies to recommend.

        Returns:
        - List of up to n (title, similarity) tuples, highest similarity
          first; movies with equal similarity keep their original order.
        """
        query_keywords = self._top_keywords(input_overview)

        scored_movies = []
        # The catalogue keyword sets do not depend on the query, so they are
        # computed once and reused instead of re-scoring every overview per call.
        for title, movie_keywords in zip(self.titles, self._movie_keyword_sets()):
            # Jaccard similarity: shared keywords / all distinct keywords
            shared = len(query_keywords & movie_keywords)
            total = len(query_keywords | movie_keywords)
            similarity = shared / total if total != 0 else 0
            scored_movies.append((title, similarity))

        # sorted() is stable, so ties stay in catalogue order
        return sorted(scored_movies, key=lambda pair: pair[1], reverse=True)[:n]


# Read the TMDB movie table from disk
df = pd.read_csv('tmdb_5000_movies.csv')
# How many top TF-IDF keywords are compared per overview
num_keywords = 5

# Build the recommender over every title/overview pair in the dataset
recommender_Movie = MovieRecommender(
    df['title'].tolist(),
    df['overview'].tolist(),
    num_keywords,
)

# Free-text query: ask for the five closest movies and show the raw result
query_1980 = "Galaxina is lifelike, voluptuous android"
recommendations_1980 = recommender_Movie.recommend(query_1980, 5)
print(recommendations_1980)

# Order all movies from most to least popular
df_sorted = df.sort_values('popularity', ascending=False)

# The ten most popular movies serve as further example queries
top_10_movies = df_sorted.head(10)

movie_number = 0
for movie_title, movie_overview in zip(top_10_movies['title'], top_10_movies['overview']):
    movie_number += 1
    recommendations = recommender_Movie.recommend(movie_overview, 5)

    # One header line per query movie, then its recommendations indented below
    print(f"Recommendations for Movie {movie_number} - {movie_title}:")
    for recommended_title, score in recommendations:
        print(f"   {recommended_title} - Similarity: {score}")
    print("\n")



[('galaxina', 0.25), ('the boy', 0.1111111111111111), ('the lady from shanghai', 0.1111111111111111), ('avatar', 0.0), ("pirates of the caribbean: at world's end", 0.0)]
Recommendations for Movie 1 - minions:
   minions - Similarity: 1.0
   stuart little 2 - Similarity: 0.1111111111111111
   stuart little - Similarity: 0.1111111111111111
   miracle - Similarity: 0.1111111111111111
   velvet goldmine - Similarity: 0.1111111111111111


Recommendations for Movie 2 - interstellar:
   interstellar - Similarity: 1.0
   avatar - Similarity: 0.0
   pirates of the caribbean: at world's end - Similarity: 0.0
   spectre - Similarity: 0.0
   the dark knight rises - Similarity: 0.0


Recommendations for Movie 3 - deadpool:
   deadpool - Similarity: 1.0
   shaft - Similarity: 0.1111111111111111
   3:10 to yuma - Similarity: 0.1111111111111111
   ta ra rum pum - Similarity: 0.1111111111111111
   maggie - Similarity: 0.1111111111111111


Recommendations for Movie 4 - guardians of the galaxy:
   guardi