<a href="https://colab.research.google.com/github/Durgasai26/Machine-learning/blob/main/Movie_Recommendation1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
movies = pd.read_csv('Movies.csv')
ratings = pd.read_csv('ratings.csv')


In [9]:
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [10]:
movies['overview'] = movies['overview'].apply(lambda x: preprocess_text(str(x)))
movies['genre'] = movies['genre'].apply(lambda x: preprocess_text(str(x)))

In [11]:
movies['tags'] = movies['overview'] + ' ' + movies['genre']

In [12]:
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])

In [15]:
tfidf_matrix

<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 214149 stored elements in Compressed Sparse Row format>

In [13]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [14]:
cosine_sim

array([[1.        , 0.01127679, 0.03260298, ..., 0.06085897, 0.04897576,
        0.01611524],
       [0.01127679, 1.        , 0.01465445, ..., 0.        , 0.        ,
        0.        ],
       [0.03260298, 0.01465445, 1.        , ..., 0.        , 0.02033855,
        0.01532365],
       ...,
       [0.06085897, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.04897576, 0.        , 0.02033855, ..., 0.        , 1.        ,
        0.        ],
       [0.01611524, 0.        , 0.01532365, ..., 0.        , 0.        ,
        1.        ]])

In [16]:
movie_indices = pd.Series(movies.index, index=movies['title']).to_dict()

In [17]:
movie_indices

{'The Shawshank Redemption': 0,
 'Dilwale Dulhania Le Jayenge': 1,
 'The Godfather': 2,
 "Schindler's List": 3,
 'The Godfather: Part II': 4,
 'Impossible Things': 5,
 'Spirited Away': 6,
 'Your Eyes Tell': 7,
 'Dou kyu sei – Classmates': 8,
 'Your Name.': 9,
 '12 Angry Men': 797,
 "Gabriel's Inferno": 11,
 'Parasite': 12,
 'The Green Mile': 13,
 "Gabriel's Inferno: Part II": 14,
 'The Dark Knight': 15,
 'The Good, the Bad and the Ugly': 16,
 'Pulp Fiction': 17,
 'The Lord of the Rings: The Return of the King': 18,
 "Gabriel's Inferno: Part III": 19,
 'Forrest Gump': 20,
 'Cinema Paradiso': 21,
 'Seven Samurai': 22,
 'GoodFellas': 23,
 'Violet Evergarden: The Movie': 24,
 'Life Is Beautiful': 25,
 'Once Upon a Time in America': 26,
 'Harakiri': 27,
 'Psycho': 9760,
 'Josee, the Tiger and the Fish': 29,
 "A Dog's Will": 30,
 'Grave of the Fireflies': 31,
 "One Flew Over the Cuckoo's Nest": 32,
 'Fight Club': 33,
 'Evangelion: 3.0+1.0 Thrice Upon a Time': 34,
 'Spider-Man: Into the Spide

In [18]:
# --- Content-Based Filtering ---
def content_based_recommend(movie_title):
    if movie_title not in movie_indices:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []

    idx = movie_indices[movie_title]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    recommended_movies = []
    for i in range(1, 6):
        movie_idx = similarity_scores[i][0]
        recommended_movies.append(movies['title'].iloc[movie_idx])

    return recommended_movies


In [19]:

user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)

item_cosine_sim = cosine_similarity(user_movie_matrix.T)

In [20]:
item_cosine_sim

array([[1.        , 0.41056206, 0.2969169 , ..., 0.        , 0.        ,
        0.        ],
       [0.41056206, 1.        , 0.28243799, ..., 0.        , 0.        ,
        0.        ],
       [0.2969169 , 0.28243799, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [22]:
def collaborative_recommend(movie_title):
    if movie_title not in movie_indices:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []

    movie_id = movies[movies['title'] == movie_title].index[0]
    similarity_scores = list(enumerate(item_cosine_sim[movie_id]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    recommended_movies = []
    for i in range(1, 6):
        movie_idx = similarity_scores[i][0]
        recommended_movies.append(movies['title'].iloc[movie_idx])

    return recommended_movies

In [23]:
def hybrid_recommend(movie_title):
    if movie_title not in movie_indices:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []

    content_recommendations = content_based_recommend(movie_title)

    collaborative_recommendations = collaborative_recommend(movie_title)

    combined_recommendations = list(set(content_recommendations + collaborative_recommendations))

    return combined_recommendations[:5]

In [24]:
def calculate_metrics(recommended_movies, actual_relevant_movies):
    # Convert the lists to sets
    recommended_set = set(recommended_movies)
    actual_set = set(actual_relevant_movies)

    # True positives: Movies that are both recommended and relevant
    true_positives = len(recommended_set.intersection(actual_set))

    # Precision
    precision = true_positives / len(recommended_set) if len(recommended_set) > 0 else 0

    # Recall
    recall = true_positives / len(actual_set) if len(actual_set) > 0 else 0

    # F1-Score
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1


In [25]:
relevant_movies = ratings[ratings['rating'] > 3].merge(movies, left_on='movieId', right_on='id')['title'].tolist()

In [45]:
movie_name = "The Godfather"
# movie_name = "Thor"

In [46]:
# Content-Based Recommendation
content_recommendations = content_based_recommend(movie_name)
content_precision, content_recall, content_f1 = calculate_metrics(content_recommendations, relevant_movies)

In [47]:
# Collaborative Filtering Recommendation
collaborative_recommendations = collaborative_recommend(movie_name)
collaborative_precision, collaborative_recall, collaborative_f1 = calculate_metrics(collaborative_recommendations, relevant_movies)

In [48]:
# Hybrid Recommendation
hybrid_recommendations = hybrid_recommend(movie_name)
hybrid_precision, hybrid_recall, hybrid_f1 = calculate_metrics(hybrid_recommendations, relevant_movies)


In [49]:
print(f"Content-Based Recommendations for '{movie_name}':")
print(content_recommendations)
print(f"Content-Based Filtering Metrics for '{movie_name}':")
print(f"Precision: {content_precision:.2f}, Recall: {content_recall:.2f}, F1-Score: {content_f1:.2f}")


Content-Based Recommendations for 'The Godfather':
['The Godfather: Part II', 'The Godfather: Part III', 'Extremely Wicked, Shockingly Evil and Vile', 'Blood Ties', 'Proud Mary']
Content-Based Filtering Metrics for 'The Godfather':
Precision: 0.20, Recall: 0.00, F1-Score: 0.00


In [50]:
print(f"\nCollaborative Filtering Recommendations for '{movie_name}':")
print(collaborative_recommendations)
print(f"Collaborative Filtering Metrics for '{movie_name}':")
print(f"Precision: {collaborative_precision:.2f}, Recall: {collaborative_recall:.2f}, F1-Score: {collaborative_f1:.2f}")


Collaborative Filtering Recommendations for 'The Godfather':
['The Way', 'Manhattan', 'Shelter', 'Gone Mom: The Disappearance of Jennifer Dulos', 'The Godfather: Part II']
Collaborative Filtering Metrics for 'The Godfather':
Precision: 0.00, Recall: 0.00, F1-Score: 0.00


In [51]:
print(f"\nHybrid Recommendations for '{movie_name}':")
print(hybrid_recommendations)
print(f"Hybrid Filtering Metrics for '{movie_name}':")
print(f"Precision: {hybrid_precision:.2f}, Recall: {hybrid_recall:.2f}, F1-Score: {hybrid_f1:.2f}")



Hybrid Recommendations for 'The Godfather':
['Shelter', 'The Godfather: Part II', 'Blood Ties', 'Extremely Wicked, Shockingly Evil and Vile', 'The Godfather: Part III']
Hybrid Filtering Metrics for 'The Godfather':
Precision: 0.20, Recall: 0.00, F1-Score: 0.00
