In [1]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [2]:

df = pd.read_csv("dataset1.csv")
print("✅ Dataset Loaded:", df.shape)

# Remove duplicates based on the 'title' column
df = df.drop_duplicates(subset=['title'], keep='first')

# Save cleaned dataset
df.to_csv("cleaned_dataset.csv", index=False)

print("✅ Duplicates removed! New shape:", df.shape)

df.head()


✅ Dataset Loaded: (277, 6)
✅ Duplicates removed! New shape: (207, 6)


Unnamed: 0,title,genres,overview,cast,crew,movie_id
0,Mayabazar,Mythological,A classic fantasy drama.,"NTR, Savitri",K. V. Reddy,1
1,Baahubali: The Beginning,Epic Action,A warrior rises to discover his legacy.,"Prabhas, Rana",Rajamouli,2
2,Baahubali 2: The Conclusion,Epic Action,The continuation of Baahubali saga.,"Prabhas, Anushka",Rajamouli,3
3,RRR,Period Action,Two revolutionaries fight for freedom.,"Ram Charan, NTR",Rajamouli,4
4,Pushpa,Action Drama,A laborer rises in red sandalwood smuggling.,"Allu Arjun, Rashmika",Sukumar,5


In [3]:
# --------------------------------------------------
# ✅ STEP 2 — CLEAN & CONVERT LIST COLUMNS
# --------------------------------------------------

def safe_convert(obj):
    if isinstance(obj, str):
        try:
            return ast.literal_eval(obj)
        except:
            return []
    return obj

df['genres'] = df['genres'].apply(safe_convert)
df['cast'] = df['cast'].apply(safe_convert)
df['crew'] = df['crew'].apply(safe_convert)

df.head()


Unnamed: 0,title,genres,overview,cast,crew,movie_id
0,Mayabazar,[],A classic fantasy drama.,[],[],1
1,Baahubali: The Beginning,[],A warrior rises to discover his legacy.,[],[],2
2,Baahubali 2: The Conclusion,[],The continuation of Baahubali saga.,[],[],3
3,RRR,[],Two revolutionaries fight for freedom.,[],[],4
4,Pushpa,[],A laborer rises in red sandalwood smuggling.,[],[],5


In [4]:
# --------------------------------------------------
# ✅ STEP 3 — EXTRACT USEFUL FEATURES
# --------------------------------------------------

# top 3 cast
df['cast'] = df['cast'].apply(lambda x: x[:3] if x else [])

# extract director
def get_director(crew_list):
    if not crew_list:
        return []
    return [person for person in crew_list if "director" in person.lower()][:1]

df['director'] = df['crew'].apply(get_director)

df.head()


Unnamed: 0,title,genres,overview,cast,crew,movie_id,director
0,Mayabazar,[],A classic fantasy drama.,[],[],1,[]
1,Baahubali: The Beginning,[],A warrior rises to discover his legacy.,[],[],2,[]
2,Baahubali 2: The Conclusion,[],The continuation of Baahubali saga.,[],[],3,[]
3,RRR,[],Two revolutionaries fight for freedom.,[],[],4,[]
4,Pushpa,[],A laborer rises in red sandalwood smuggling.,[],[],5,[]


In [5]:
# --------------------------------------------------
# ✅ STEP 4 — CONVERT LISTS TO STRINGS
# --------------------------------------------------

df['genres'] = df['genres'].apply(lambda x: " ".join(x))
df['cast'] = df['cast'].apply(lambda x: " ".join(x))
df['director'] = df['director'].apply(lambda x: " ".join(x))
df['overview'] = df['overview'].fillna("")


In [6]:
# --------------------------------------------------
# ✅ STEP 5 — CREATE TAGS COLUMN
# --------------------------------------------------

df['tags'] = (
    df['overview'] + " " +
    df['genres'] + " " +
    df['cast'] + " " +
    df['director']
)

print("✅ Tags Column Created")
df[['title', 'tags']].head()


✅ Tags Column Created


Unnamed: 0,title,tags
0,Mayabazar,A classic fantasy drama.
1,Baahubali: The Beginning,A warrior rises to discover his legacy.
2,Baahubali 2: The Conclusion,The continuation of Baahubali saga.
3,RRR,Two revolutionaries fight for freedom.
4,Pushpa,A laborer rises in red sandalwood smuggling.


In [7]:
# --------------------------------------------------
# ✅ STEP 6 — TF-IDF VECTORIZATION
# --------------------------------------------------

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfidf.fit_transform(df['tags']).toarray()

print("✅ Vectorization Complete:", vectors.shape)


✅ Vectorization Complete: (207, 462)


In [8]:
# --------------------------------------------------
# ✅ STEP 7 — COSINE SIMILARITY MATRIX
# --------------------------------------------------

similarity = cosine_similarity(vectors)
print("✅ Similarity Matrix Created:", similarity.shape)


✅ Similarity Matrix Created: (207, 207)


In [9]:
# --------------------------------------------------
# ✅ STEP 8 — SAVE MODEL FILES
# --------------------------------------------------

pickle.dump(df, open("movies.pkl", "wb"))
pickle.dump(similarity, open("similarity.pkl", "wb"))

print("✅ MODEL FILES SAVED: movies.pkl & similarity.pkl")


✅ MODEL FILES SAVED: movies.pkl & similarity.pkl


In [10]:
# --------------------------------------------------
# ✅ STEP 9 — RECOMMENDATION FUNCTION
# --------------------------------------------------

def recommend(movie):
    movie = movie.lower()
    if movie not in df['title'].str.lower().values:
        print("❌ Movie not found in dataset")
        return

    index = df[df['title'].str.lower() == movie].index[0]
    distances = similarity[index]
    movie_list = sorted(
        list(enumerate(distances)),
        reverse=True,
        key=lambda x: x[1]
    )[1:6]

    print(f"\n✅ Top Recommendations for ✅ {df.iloc[index].title}:\n")
    for i in movie_list:
        print(df.iloc[i[0]].title)


In [11]:
recommend("RRR")



✅ Top Recommendations for ✅ RRR:

Mugguru Monagallu
Leader
Gang Leader
Sye Raa Narasimha Reddy
Mayabazar


In [12]:
df

Unnamed: 0,title,genres,overview,cast,crew,movie_id,director,tags
0,Mayabazar,,A classic fantasy drama.,,[],1,,A classic fantasy drama.
1,Baahubali: The Beginning,,A warrior rises to discover his legacy.,,[],2,,A warrior rises to discover his legacy.
2,Baahubali 2: The Conclusion,,The continuation of Baahubali saga.,,[],3,,The continuation of Baahubali saga.
3,RRR,,Two revolutionaries fight for freedom.,,[],4,,Two revolutionaries fight for freedom.
4,Pushpa,,A laborer rises in red sandalwood smuggling.,,[],5,,A laborer rises in red sandalwood smuggling.
...,...,...,...,...,...,...,...,...
264,Game Changer,,A man fights corruption through systemic reforms.,,[],288,,A man fights corruption through systemic refor...
272,Nijam,,A man avenges his father’s death.,,[],296,,A man avenges his father’s death.
273,Takkari Donga,,A cowboy thief fights for justice.,,[],297,,A cowboy thief fights for justice.
275,Maharshi,,A successful businessman reconnects with his r...,,[],299,,A successful businessman reconnects with his r...
