#prerun

In [20]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel

#experimental

In [14]:
# Load the raw movie table; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv("imdb_top_1000.csv")

In [15]:
# Columns the recommender never looks at.
cols = ["Poster_Link", "Certificate", "Meta_score", "Gross"]

# Remove them first, then discard every row that still has a missing value.
data = data.drop(columns=cols).dropna()


In [16]:
# Convert 'Runtime' to an integer number of minutes (e.g. "142 min" -> 142).
# Going through astype(str) keeps this cell re-runnable: on a second run the
# column is already numeric and the bare `.str` accessor would raise.
data['Runtime'] = (
    data['Runtime']
    .astype(str)
    .str.replace(' min', '', regex=False)  # literal substring, not a regex
    .astype(int)
)

# Build one text document per movie from genre, director and the four stars.
# Column-wise concatenation replaces the row-wise apply(); the resulting
# strings are the same "Genre Director Star1 Star2 Star3 Star4" layout.
data['combined_features'] = (
    data['Genre'].astype(str) + ' '
    + data['Director'].astype(str) + ' '
    + data['Star1'].astype(str) + ' '
    + data['Star2'].astype(str) + ' '
    + data['Star3'].astype(str) + ' '
    + data['Star4'].astype(str)
)

In [19]:
# Initialize the TF-IDF Vectorizer; stop_words='english' makes it ignore
# scikit-learn's built-in list of common English words.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the combined feature strings and encode every
# movie as one row of the resulting TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

In [21]:
# Compute the cosine similarity matrix (every movie against every movie).
# The vectorizer above keeps its default L2 row normalisation, so the plain
# dot product computed by linear_kernel is already the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [22]:
# Map each movie title to its row *position* in `data`.
# Positions (not index labels) are stored because the recommendation
# functions use the value to pick a row of `cosine_sim` and to call `.iloc`,
# both of which are positional; after dropna() the labels in `data.index`
# can have gaps and would then point at the wrong rows.
# When a title occurs more than once only its first row is kept, so
# `indices[title]` is always a single integer rather than a Series.
indices = pd.Series(range(len(data)), index=data['Series_Title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` mapping.
    cosine_sim : indexable of score rows
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the matrix computed above.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of ``data['Series_Title']``, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in ``indices``.
    """
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"movie title not found: {title!r}") from None

    # A title that appears several times in the mapping comes back as a
    # Series; fall back to its first row so `idx` is always one integer.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every *other* movie by similarity. The queried movie is removed
    # explicitly instead of assuming it sorts first: a movie with an equal
    # score and a smaller position would otherwise push it into the results.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep the positions of the best matches (a negative top_n yields none).
    movie_indices = [pos for pos, _ in ranked[:max(top_n, 0)]]

    return data['Series_Title'].iloc[movie_indices]


In [23]:
def get_weighted_recommendations(ratings, cosine_sim=cosine_sim, top_n=10):
    """Recommend movies from a set of personal ratings.

    Every rated movie contributes ``similarity * rating`` to the score of
    each movie; the highest-scoring movies whose titles were not rated
    themselves are returned.

    Parameters
    ----------
    ratings : dict
        Maps a movie title (looked up in the module-level ``indices``) to a
        numeric rating. An empty mapping yields an empty result.
    cosine_sim : indexable of score rows
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the matrix computed above.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of ``data['Series_Title']``, best score first.

    Raises
    ------
    KeyError
        If a rated title is not present in ``indices``.
    """
    total_sim_scores = {}

    for movie, rating in ratings.items():
        try:
            idx = indices[movie]
        except KeyError:
            raise KeyError(f"movie title not found: {movie!r}") from None

        # A title that appears several times in the mapping comes back as a
        # Series; use its first row so one similarity row is selected.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        # Accumulate this movie's rating-weighted similarity to every movie.
        for pos, score in enumerate(cosine_sim[int(idx)]):
            total_sim_scores[pos] = total_sim_scores.get(pos, 0) + score * rating

    # Best accumulated score first; ties keep ascending position order.
    ranked = sorted(total_sim_scores.items(), key=lambda pair: pair[1], reverse=True)

    # Read the title column once instead of calling .iloc per candidate.
    titles = data['Series_Title'].tolist()

    # Skip anything the user already rated, then keep the best matches.
    recommended_movie_indices = [
        pos for pos, _ in ranked if titles[pos] not in ratings
    ][:max(top_n, 0)]

    return data['Series_Title'].iloc[recommended_movie_indices]


In [31]:
# Personal ratings, keyed by the exact movie title used in the dataset.
personal_ratings = {
    'The Shawshank Redemption': 9,
    'Inception': 10,
    'The Dark Knight': 0.5,
    'Dunkirk': 0,
    'The Prestige': 8,
}

# Score every movie against these ratings and show the best unrated matches.
recommendations = get_weighted_recommendations(personal_ratings)
print(recommendations)


155                          Batman Begins
63                   The Dark Knight Rises
21                            Interstellar
479             X-Men: Days of Future Past
202                                  Logan
357                           The Avengers
675             Back to the Future Part II
737    Captain America: The Winter Soldier
583             Captain America: Civil War
243                    Catch Me If You Can
Name: Series_Title, dtype: object


#OOP

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

class MovieRecommender:
    """Content-based movie recommender built from a CSV of movies.

    Each movie is described by a text document made of its genre, director
    and four stars. The documents are TF-IDF encoded and compared pairwise;
    recommendations are read from the resulting similarity matrix.

    Attributes set by the constructor (all ``None`` when no file is given):
        data          preprocessed DataFrame, re-indexed 0..n-1
        tfidf_matrix  TF-IDF encoding of ``data['combined_features']``
        cosine_sim    pairwise similarity matrix, one row per row of ``data``
        indices       Series mapping a movie title to its row position
    """

    # Columns removed during preprocessing when they are present.
    _UNUSED_COLUMNS = ("Poster_Link", "Certificate", "Meta_score", "Gross")
    # Columns concatenated into the text document describing each movie.
    _FEATURE_COLUMNS = ("Genre", "Director", "Star1", "Star2", "Star3", "Star4")

    def __init__(self, data_file=None):
        """Build the recommender from `data_file`, or leave it empty.

        Parameters
        ----------
        data_file : str or path-like, optional
            CSV file to load. When omitted (or falsy) every attribute is
            left as ``None``.
        """
        self.data = None
        self.tfidf_matrix = None
        self.cosine_sim = None
        self.indices = None

        if data_file:
            self.data = pd.read_csv(data_file)
            self._preprocess_data()
            self._create_tfidf_matrix()

    def _preprocess_data(self):
        """Clean ``self.data`` and add the ``combined_features`` column.

        Raises
        ------
        KeyError
            If ``Runtime`` or any of the feature columns is missing.
        """
        # Drop each unused column that exists. errors="ignore" replaces the
        # former bare `except: pass`, under which a single absent column
        # meant that none were dropped and dropna() below then discarded
        # every row with a gap in one of them.
        self.data = self.data.drop(columns=list(self._UNUSED_COLUMNS), errors="ignore")

        # Remove rows with missing values, then renumber the rows 0..n-1 so
        # index labels agree with the row positions of the similarity matrix.
        self.data = self.data.dropna().reset_index(drop=True)

        # Convert 'Runtime' to an integer number of minutes ("142 min" -> 142);
        # astype(str) also accepts a column that is already numeric.
        self.data['Runtime'] = (
            self.data['Runtime']
            .astype(str)
            .str.replace(' min', '', regex=False)  # literal substring, not a regex
            .astype(int)
        )

        # Fail here with a clear message instead of printing and letting a
        # later step stumble over the missing 'combined_features' column.
        missing = [col for col in self._FEATURE_COLUMNS if col not in self.data.columns]
        if missing:
            raise KeyError(
                f"error reading the cols (Genre, Director...): missing {missing}"
            )

        # Combine the features into one space-separated string per movie.
        first, *rest = self._FEATURE_COLUMNS
        combined = self.data[first].astype(str)
        for col in rest:
            combined = combined + ' ' + self.data[col].astype(str)
        self.data['combined_features'] = combined

    def _create_tfidf_matrix(self):
        """Compute the TF-IDF matrix, the similarity matrix and the title map."""
        # Initialize the TF-IDF Vectorizer
        tfidf = TfidfVectorizer(stop_words='english')

        # Fit and transform the data
        self.tfidf_matrix = tfidf.fit_transform(self.data['combined_features'])

        # The vectorizer keeps its default L2 row normalisation, so the plain
        # dot product from linear_kernel is the cosine similarity.
        self.cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

        # Create a mapping of movie titles to row positions
        self.indices = self._build_title_index()

    def _build_title_index(self):
        """Return a Series mapping each title to the position of its first row."""
        titles = self.data['Series_Title']
        positions = pd.Series(range(len(titles)), index=titles)
        # Deduplicate on the title (the index), not on the values, so a
        # lookup by title always yields a single integer.
        return positions[~positions.index.duplicated(keep='first')]

    def _lookup_position(self, title):
        """Return the row position of `title`; raise KeyError when unknown."""
        try:
            pos = self.indices[title]
        except KeyError:
            raise KeyError(f"movie title not found: {title!r}") from None
        # A mapping that still holds a repeated title returns a Series for
        # it; use its first row.
        if isinstance(pos, pd.Series):
            pos = pos.iloc[0]
        return int(pos)

    def get_recommendations(self, title, top_n=10):
        """Return the titles of the movies most similar to `title`.

        Parameters
        ----------
        title : str
            Movie title, looked up in ``self.indices``.
        top_n : int, default 10
            Maximum number of titles to return.

        Returns
        -------
        pandas.Series
            Up to `top_n` entries of ``self.data['Series_Title']``, most
            similar first.

        Raises
        ------
        KeyError
            If `title` is not a known movie title.
        """
        idx = self._lookup_position(title)

        # Rank every *other* movie by similarity. The queried movie is
        # removed explicitly instead of assuming it sorts first: a movie with
        # an equal score and a smaller position would otherwise push it into
        # the results.
        ranked = sorted(
            ((pos, score) for pos, score in enumerate(self.cosine_sim[idx]) if pos != idx),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Keep the positions of the best matches (a negative top_n yields none).
        movie_indices = [pos for pos, _ in ranked[:max(top_n, 0)]]

        return self.data['Series_Title'].iloc[movie_indices]

    def get_weighted_recommendations(self, ratings, top_n=10):
        """Recommend movies from a set of personal ratings.

        Every rated movie contributes ``similarity * rating`` to the score of
        each movie; the highest-scoring movies whose titles were not rated
        themselves are returned.

        Parameters
        ----------
        ratings : dict
            Maps a movie title to a numeric rating. An empty mapping yields
            an empty result.
        top_n : int, default 10
            Maximum number of titles to return.

        Returns
        -------
        pandas.Series
            Up to `top_n` entries of ``self.data['Series_Title']``, best
            score first.

        Raises
        ------
        KeyError
            If a rated title is not a known movie title.
        """
        total_sim_scores = {}

        for movie, rating in ratings.items():
            idx = self._lookup_position(movie)

            # Accumulate this movie's rating-weighted similarity to every movie.
            for pos, score in enumerate(self.cosine_sim[idx]):
                total_sim_scores[pos] = total_sim_scores.get(pos, 0) + score * rating

        # Best accumulated score first; ties keep ascending position order.
        ranked = sorted(total_sim_scores.items(), key=lambda pair: pair[1], reverse=True)

        # Read the title column once instead of calling .iloc per candidate.
        titles = self.data['Series_Title'].tolist()

        # Skip anything the user already rated, then keep the best matches.
        recommended_movie_indices = [
            pos for pos, _ in ranked if titles[pos] not in ratings
        ][:max(top_n, 0)]

        return self.data['Series_Title'].iloc[recommended_movie_indices]

    def save(self, file_name):
        """Pickle this recommender (data and matrices included) to `file_name`."""
        with open(file_name, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, file_name):
        """Return the object pickled in `file_name`.

        SECURITY: unpickling executes code embedded in the file. Only load
        files written by `save` from a source you trust.
        """
        with open(file_name, 'rb') as file:
            return pickle.load(file)


In [2]:
# Create an instance of MovieRecommender with the data file; the constructor
# reads the CSV, preprocesses it and computes the similarity matrix.
recommender = MovieRecommender('imdb_top_1000.csv')

# Save the recommender object to a file (the whole object is pickled)
recommender.save('recommender.pkl')



In [10]:
# Restore the pickled recommender written by the previous cell.
loaded_recommender = MovieRecommender.load('recommender.pkl')

# Personal ratings, keyed by the exact movie title used in the dataset.
personal_ratings = {
    'The Shawshank Redemption': 8,
    'Inception': 10,
    'The Dark Knight': 9,
    'Dunkirk': 0,
    'The Prestige': 10,
    'Batman Begins': 8,
    'The Dark Knight Rises': 10,
    'Interstellar': 10,
    'Ford v Ferrari': 8,
    'American Psycho': 7,
    'The Machinist': 5,
}

# Score the catalogue against these ratings and keep the titles as a plain list.
recommendations = list(
    loaded_recommender.get_weighted_recommendations(personal_ratings)
)

# One recommended title per line.
for title in recommendations:
    print(title)

3:10 to Yuma
The Big Short
Empire of the Sun
The Fighter
Memento
The Man Who Would Be King
Back to the Future Part II
Back to the Future
Children of Men
Catch Me If You Can
