In [1]:
import pandas as pd
movies = pd.read_csv('data/cleaned_movies.csv')
new_df = pd.read_csv('data/new_df_movies.csv')

In [2]:
import nltk
from nltk.stem import PorterStemmer

In [3]:
ps = PorterStemmer()

def stems(text):
    """Return `text` with each whitespace-separated token replaced by its stem.

    The string is tokenised with `str.split()`, every token is passed through
    the module-level Porter stemmer `ps`, and the stemmed tokens are joined
    back together with single spaces.
    """
    return " ".join([ps.stem(token) for token in text.split()])

In [4]:
# Stem every word of each row's tag string, overwriting the 'tags' column in place.
# NOTE(review): re-running this cell stems the already-stemmed text a second time;
# reload new_df first if the cell has to be executed again.
new_df['tags'] = new_df['tags'].apply(stems)

In [5]:
new_df.iloc[0]['tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi scienc fiction avatar cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: English stop words are dropped and the vocabulary is capped at 5000 terms.
cv = CountVectorizer(stop_words='english', max_features=5000)

In [7]:
# Fit the vectorizer on the stemmed tags and materialise the term counts as a dense
# array: one row per row of new_df, one column per vocabulary term.
vector = cv.fit_transform(new_df['tags']).toarray()

In [8]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
vector.shape

(4809, 5000)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Pairwise cosine similarity between all rows of `vector`; entry [i, j] scores how
# alike the tag-count vectors of rows i and j are (the matrix is square and symmetric).
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.08119979, 0.08492078, ..., 0.04279605, 0.        ,
        0.        ],
       [0.08119979, 1.        , 0.08964215, ..., 0.0225877 , 0.        ,
        0.02577696],
       [0.08492078, 0.08964215, 1.        , ..., 0.02362278, 0.        ,
        0.        ],
       ...,
       [0.04279605, 0.0225877 , 0.02362278, ..., 1.        , 0.03818018,
        0.04075696],
       [0.        , 0.        , 0.        , ..., 0.03818018, 1.        ,
        0.08714204],
       [0.        , 0.02577696, 0.        , ..., 0.04075696, 0.08714204,
        1.        ]])

In [12]:
similarity.shape

(4809, 4809)

In [13]:
print(new_df['original_title'].unique()[:20])


["['Avatar']" '["PiratesoftheCaribbean:AtWorld\'sEnd"]' "['Spectre']"
 "['TheDarkKnightRises']" "['JohnCarter']" "['Spider-Man3']" "['Tangled']"
 "['Avengers:AgeofUltron']" "['HarryPotterandtheHalf-BloodPrince']"
 "['BatmanvSuperman:DawnofJustice']" "['SupermanReturns']"
 "['QuantumofSolace']" '["PiratesoftheCaribbean:DeadMan\'sChest"]'
 "['TheLoneRanger']" "['ManofSteel']"
 "['TheChroniclesofNarnia:PrinceCaspian']" "['TheAvengers']"
 "['PiratesoftheCaribbean:OnStrangerTides']" "['MeninBlack3']"
 "['TheHobbit:TheBattleoftheFiveArmies']"]


In [14]:
import ast

# Keep original_title as is (for tags), but extract clean title for search
def extract_title(title_list_string):
    """Return the first title from a stringified list, stripped and lower-cased.

    Parameters
    ----------
    title_list_string : str
        Text of a Python list literal holding the title, e.g. "['Avatar']".

    Returns
    -------
    str
        The first list element with surrounding whitespace removed and
        converted to lower case, or '' when the value cannot be parsed
        (not a valid literal, empty list, non-string element, missing value).
    """
    try:
        return ast.literal_eval(title_list_string)[0].strip().lower()
    except Exception:
        # Deliberate best-effort parsing: any malformed cell maps to ''.
        # `Exception` (not a bare `except`) is caught so that KeyboardInterrupt
        # and SystemExit still propagate while the column is being processed.
        return ''

new_df['original_title_clean'] = new_df['original_title'].apply(extract_title)


In [15]:
def recommend(movie_name):
    """Print the five movies whose tag vectors are most similar to `movie_name`.

    The lookup is case-insensitive and runs against
    `new_df['original_title_clean']`. The titles displayed in this notebook
    are stored without internal spaces (e.g. 'thedarkknightrises'), so when
    the name as typed is not found, the lookup is retried with all whitespace
    removed; "The Dark Knight Rises" and "thedarkknightrises" therefore both
    resolve to the same row.

    Parameters
    ----------
    movie_name : str
        Title of the movie to find recommendations for.

    Returns
    -------
    None
        The recommendations, or a not-found message, are printed.
    """
    titles = new_df['original_title_clean']
    movie = movie_name.strip().lower()

    # Exact match first, so every query that worked before still resolves the same way.
    matches = (titles == movie).values
    if not matches.any():
        # Fall back to the space-free spelling used by the stored titles.
        movie = "".join(movie.split())
        matches = (titles == movie).values
    if not matches.any():
        print(f"❌ Movie '{movie_name}' not found in dataset.")
        return

    # Positional row number of the first match: `similarity` and `.iloc` are both
    # positional, so this stays correct even if new_df's index is not 0..n-1.
    position = int(matches.argmax())
    distances = similarity[position]

    # Rank all rows by similarity (highest first) and drop the queried row
    # explicitly instead of assuming it always sorts into first place.
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
    movies_list = [pair for pair in ranked if pair[0] != position][:5]

    print(f"\n🎬 Recommendations for '{movie_name.title()}':")
    for row, _score in movies_list:
        print("→", titles.iloc[row].title())


In [16]:
# def recommend(movie):
#     movie = movie.lower().strip()  # Normalize input
#     if movie not in new_df['original_title'].values:
#         print(f"Movie '{movie}' not found in dataset.")
#         return
    
#     index = new_df[new_df['original_title'] == movie].index[0]
#     distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
#     for i in distances[1:6]:
#         print(new_df.iloc[i[0]].original_title)


In [17]:
recommend('spider-man')


🎬 Recommendations for 'Spider-Man':
→ Spider-Man3
→ Spider-Man2
→ Arachnophobia
→ Theamazingspider-Man2
→ Theamazingspider-Man
