In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import heapq

In [3]:
# Load the pre-processed corpus; later cells read its 'tag' column
df = pd.read_csv('usage.csv')  # must provide the 'tag' text column that is vectorized below

In [7]:
# Peek at one pre-processed tag string (row label 17)
df.loc[17, 'tag']

'the era of the new gener is coming. while the internet, artifici intellig and other high technolog are appli in all aspect of life, crime technolog and crimin investig are also upgrad day by day. documentari tamarasaviano'

In [9]:
# Step 1: Vectorize the 'tag' column using TF-IDF (producing a sparse matrix)
# Each row of the result corresponds to one row of df, i.e. one movie.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['tag'])  # learns the vocabulary and transforms the corpus in one call

In [11]:

# Step 2: Function to find top N similar movies for each movie
def get_top_n_similar(movie_idx, similarity_scores, N=100):
    """Return the indices of the N highest similarity scores, best first.

    Parameters
    ----------
    movie_idx : int
        Index of the query movie. Kept for interface compatibility; it is not
        used in the ranking, so the query movie is *not* filtered out of the
        result (it typically ranks first, being most similar to itself).
    similarity_scores : array-like of float
        Similarity of the query movie to every movie; flattened to 1-D.
    N : int, default 100
        Number of indices to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of int
        Up to N positions into `similarity_scores`, ordered by descending
        score; equal scores keep ascending index order.
    """
    if N <= 0:
        return []
    scores = np.asarray(similarity_scores, dtype=float).ravel()
    # A stable sort on the negated scores ranks ties by ascending index -- the
    # same order heapq.nlargest produces -- while doing the work in vectorized
    # NumPy code instead of one Python-level key call per movie.
    ranked = np.argsort(-scores, kind="stable")
    return ranked[:N].tolist()

In [13]:
# Step 3: Accumulator for the results -- entry i will hold the top-N neighbour indices of movie i
top_n_similarities = list()

In [15]:
# Step 4: Process in chunks to avoid memory overload
num_movies = tfidf_matrix.shape[0]

# Rows scored per cosine_similarity call. Each call materializes a dense
# CHUNK_SIZE x num_movies block, so lower this value if memory is tight.
CHUNK_SIZE = 256
TOP_N = 100  # neighbours kept per movie (the movie itself is included)

# Start from an empty list so that re-running this cell rebuilds the results
# instead of appending a second copy of every row.
top_n_similarities = []

# Progress bar
with tqdm(total=num_movies) as pbar:
    for start in range(0, num_movies, CHUNK_SIZE):
        stop = min(start + CHUNK_SIZE, num_movies)

        # One call scores a whole block of movies against the full corpus;
        # batching amortizes the per-call overhead of cosine_similarity
        # compared with calling it once per movie.
        block = cosine_similarity(tfidf_matrix[start:stop], tfidf_matrix)

        for offset, similarities in enumerate(block):
            # Row `offset` of the block belongs to movie `start + offset`
            top_similar_movies = get_top_n_similar(start + offset, similarities, N=TOP_N)
            top_n_similarities.append(top_similar_movies)

        # Update progress by the number of movies just processed
        pbar.update(stop - start)

100%|███████████████████████████████████████████████████████████████████████████████████████| 56249/56249 [1:50:52<00:00,  8.46it/s]


In [17]:
# Step 5: Tabulate the results -- row i holds the ranked neighbour indices of movie i
similarity_df = pd.DataFrame(data=top_n_similarities)

In [19]:
# Preview the first five rows of the neighbour table
similarity_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,582,7878,15355,6379,50669,6300,55150,46339,13436,...,49706,27289,40912,11059,20006,15502,7711,45143,34132,37546
1,1,53603,388,2944,3146,12188,12229,5833,5257,9822,...,51305,55037,7206,7543,14709,4243,12977,9578,7866,14557
2,2,12926,13517,16436,2497,2860,6282,13953,966,51697,...,898,18332,434,15869,14699,645,2018,16718,37283,2974
3,3,43635,3117,4348,5017,7388,8115,8302,8902,9044,...,16173,13639,10294,554,22736,6689,39539,11784,17520,33939
4,4,6768,1293,43627,305,30814,12235,34714,289,284,...,6997,48665,37814,39765,34592,42254,39386,53678,24808,29590


In [None]:
# Movie metadata; used below to turn row positions into titles
movies = pd.read_csv('final_data.csv')

In [29]:
# Preview the first five rows of the metadata table
movies.head(n=5)

Unnamed: 0,tconst,originalTitle,year,genres,Rating,director,actors,description
0,tt11545370,Mission Ulja Funk,2021.0,"Adventure,Family",6.9,Barbara Kronenberg,"Jonas Oeßel, Luc Feit, Ivan Shvedoff, Peter Tr...",Plot not available
1,tt11547828,Echoes of Violence,2021.0,"Crime,Drama,Thriller",5.2,Nicholas Woods,"Heston Horwin, Chase Cargill, Taylor Flowers, ...",A man with a fragile state of mind will battle...
2,tt11547982,The 2:11 Home,2021.0,Family,5.5,Jim Huggins,"Geno Romo, Edward Stiner, Nick Dent, Burke Sag...","12 year old astronomy geek Uja, armed with a s..."
3,tt11548822,Cow,2021.0,Documentary,7.1,Andrea Arnold,,An honest policeman's quest to nab a dreaded c...
4,tt11552344,Murder at Yellowstone City,2022.0,"Crime,Drama,Mystery",5.3,Richard Gray,"Isaiah Mustafa, Zach McGowan, Thomas Jane, Ron...","The simple question: ""How does a woman get pre..."


In [59]:
 movie_index=movies[movies['originalTitle']=='Khaleja'].index[0]
movie_index

44058

In [45]:
# Neighbour row for the movie looked up as `movie_index` (previously the hard-coded literal 44058)
l = similarity_df.iloc[movie_index]

In [61]:
# Sanity check: a single row selected with .iloc is a pandas Series
type(l)

pandas.core.series.Series

In [47]:
# Materialize the neighbour row as a plain Python list
d = l.tolist()

In [53]:
# Print the titles of the ten highest-ranked entries for the selected movie
for row_pos in d[:10]:
    print(movies.iloc[row_pos]['originalTitle'])

Khaleja
Jalsa
Athadu
Nuvve.. Nuvve...
Ready
Chirutha
Bodyguard
Jai Chiranjeeva
King
Kantri


In [55]:
# Step 6: Save the neighbour table as CSV (the row index is written as the first column)
OUTPUT_PATH = 'top_100_similar_movies.csv'
similarity_df.to_csv(OUTPUT_PATH)

# Report the file that was actually written (the old message named a .parquet file)
print(f"Top 100 similar movies for each movie have been saved to '{OUTPUT_PATH}'.")

Top 100 similar movies for each movie have been saved to 'top_100_similar_movies.parquet'.


In [63]:
def top_movies(movie, n=10):
    """Print the titles of the `n` highest-ranked similar entries for `movie`.

    Reads the notebook-level `movies` (metadata) and `similarity_df`
    (one row of ranked neighbour positions per movie) tables.

    Parameters
    ----------
    movie : str
        Exact value to match against `movies['originalTitle']`. If several
        rows share the title, the first one is used.
    n : int, default 10
        How many ranked entries to print.

    Raises
    ------
    IndexError
        If no row of `movies` has the given title.
    """
    # Work with the *position* of the match, not its index label: both
    # lookups below are positional (.iloc), so a label would only be correct
    # when `movies` happens to carry a default 0..n-1 index.
    matches = np.flatnonzero((movies['originalTitle'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie!r} found in `movies`")
    movie_pos = matches[0]

    for neighbour_pos in similarity_df.iloc[movie_pos, :n]:
        print(movies.iloc[neighbour_pos].originalTitle)

In [99]:
# Example query.
# NOTE(review): the recorded output for this call does not list the queried
# title itself, unlike the 'Khaleja' lookup earlier -- possibly a duplicated
# title or a row mismatch between usage.csv and final_data.csv; verify.
top_movies("Saripodhaa Sanivaaram")

Ala Modalaindi
Anthaka Mundu Aa Tarvatha
Jabardasth
Alludu Diddina Kapuram
Kalyana Vaibhogame
Awe!
Oh Baby...
The land of Skulls
Halfway to Amarillo
Feldpost
