In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Porter stemmer, used to stem the tag text
from nltk.stem import PorterStemmer
import pickle
import os


In [3]:
# Load the prepared movie table. A raw string keeps every backslash of the
# Windows path literal; the previous literal mixed "\P" / "\M" (invalid escape
# sequences, which newer Python versions warn about) with "\\" escapes.
df = pd.read_csv(r'D:\PROJECTS\Movie-recommender-system\finalData\final_df.csv')

In [4]:
# Remove the leftover index column that was written into the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because
# the column is already gone. `columns=` alone selects the axis, so `axis=1`
# is not needed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first rows to check which columns remain after the drop.
df.head()

Unnamed: 0,title,tags
0,New York Doll,A recovering alcoholic and recently converted ...
1,Mickey's Magical Christmas: Snowed in at the H...,After everyone is snowed in at the House of Mo...
2,Mickey's House of Villains,The villains from the popular animated Disney ...
3,And Then I Go,"In the cruel world of junior high, Edwin suffe..."
4,An Extremely Goofy Movie,It's a big time in Max's life. He's college bo...


- Applying the NLP transformations

In [6]:
# Lower-case every tag string so that later token counting is case-insensitive.
# The vectorized .str accessor replaces a per-row Python lambda and passes
# missing values through unchanged instead of raising AttributeError on them.
df['tags'] = df['tags'].str.lower()

- Perform stemming to reduce each word to its root form, which shrinks the vocabulary size.

In [7]:
# One shared PorterStemmer instance; stem() below reads it as a global.
ps = PorterStemmer()

In [8]:
def stem(text):
    """Return ``text`` with each whitespace-separated token stemmed.

    The input is tokenized with ``str.split()`` (any run of whitespace is a
    separator), every token is passed through the module-level stemmer
    ``ps``, and the stemmed tokens are re-joined with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [9]:
# Replace every tags string with its stemmed version.
# Note: this overwrites the column in place, so re-running the cell feeds
# already-stemmed text through stem() again.
df['tags'] = df['tags'].apply(stem)

- Now perform text vectorization

In [10]:
# Bag-of-words vectorizer: ignores scikit-learn's built-in English stop words
# and caps the vocabulary at 13260 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=13260,
)

In [11]:
# Fit the vectorizer on the tags and build the per-movie term-count matrix;
# .toarray() converts the result to a dense NumPy array.
# NOTE(review): cosine_similarity appears to accept sparse input as well, so
# the dense copy may be avoidable — confirm before changing.
movie_vector = cv.fit_transform(df['tags']).toarray()

In [12]:
# Sanity check on the matrix dimensions (rows, columns).
movie_vector.shape

(13260, 13260)

In [13]:
# Cosine similarity between every pair of rows of movie_vector;
# row i holds the scores of movie i against all movies.
similarity = cosine_similarity(movie_vector)

In [14]:
# Display the similarity matrix (the repr abbreviates large arrays).
similarity

array([[1.        , 0.        , 0.02948839, ..., 0.02864166, 0.11566299,
        0.09430419],
       [0.        , 1.        , 0.34641016, ..., 0.01869241, 0.03774257,
        0.        ],
       [0.02948839, 0.34641016, 1.        , ..., 0.01942572, 0.03922323,
        0.02132007],
       ...,
       [0.02864166, 0.01869241, 0.01942572, ..., 1.        , 0.03809697,
        0.04141577],
       [0.11566299, 0.03774257, 0.03922323, ..., 0.03809697, 1.        ,
        0.0836242 ],
       [0.09430419, 0.        , 0.02132007, ..., 0.04141577, 0.0836242 ,
        1.        ]])

In [15]:
# Index label of the first row whose title matches exactly;
# raises IndexError when no row has that title.
df[df['title'] == 'The Lego Movie'].index[0]

1325

In [16]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Reads the notebook-level ``df`` (needs a ``title`` column) and
    ``similarity`` (row ``i`` holds the scores of the movie at row position
    ``i`` of ``df``).

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        Maximum number of titles to print.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    title_matches = (df['title'] == movie).to_numpy()
    if not title_matches.any():
        raise IndexError(f"movie {movie!r} not found in df['title']")
    # Row *position* (not index label) of the first match: `similarity` is
    # addressed positionally, so this stays correct for a non-default index.
    index = int(title_matches.argmax())

    # Leave the query movie out explicitly rather than assuming it sorts
    # first; with ties at the top score it could otherwise be recommended
    # itself. The stable sort keeps tied candidates in row order.
    ranked = sorted(
        (
            (position, score)
            for position, score in enumerate(similarity[index])
            if position != index
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for position, _score in ranked[:max(top_n, 0)]:
        print(df['title'].iloc[position])

In [17]:
# Smoke test: print the recommendations for one known title.
recommend('The Avengers')

Iron Man 3
Captain America: Civil War
Iron Man 2
Being Flynn
The Survivalist


In [18]:

# Split the similarity matrix row-wise into `num_chunks` pieces and pickle
# each piece to its own file.
num_chunks = 15
chunk_size = len(similarity) // num_chunks

# Directory that receives the chunked .pkl files.
output_dir = 'D:\\PROJECTS\\Movie-recommender-system\\streamLitWebApp'
# Create the directory when it is missing; otherwise open() below fails with
# FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for i in range(num_chunks):
    start_idx = i * chunk_size
    # The last chunk runs to the end of the matrix so it also picks up the
    # remainder rows when the row count is not divisible by num_chunks.
    is_last_chunk = i == num_chunks - 1
    end_idx = len(similarity) if is_last_chunk else start_idx + chunk_size
    chunk_data = similarity[start_idx:end_idx]

    # File names are 1-based: similarity_chunk_1.pkl ... similarity_chunk_15.pkl
    filename = os.path.join(output_dir, f'similarity_chunk_{i+1}.pkl')
    with open(filename, 'wb') as f:
        pickle.dump(chunk_data, f)

    print(f'Chunk {i+1} saved as {filename}')


Chunk 1 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_1.pkl
Chunk 2 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_2.pkl
Chunk 3 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_3.pkl
Chunk 4 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_4.pkl
Chunk 5 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_5.pkl
Chunk 6 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_6.pkl
Chunk 7 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_7.pkl
Chunk 8 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_8.pkl
Chunk 9 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_9.pkl
Chunk 10 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebApp\similarity_chunk_10.pkl
Chunk 11 saved as D:\PROJECTS\Movie-recommender-system\streamLitWebA

In [20]:

# Pickle the processed movie DataFrame to movies.pkl in the web app directory.
output_dir = 'D:\\PROJECTS\\Movie-recommender-system\\streamLitWebApp'
# Create the directory when it is missing so that this cell also works when it
# is run on its own; otherwise open() raises FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
filename = os.path.join(output_dir, 'movies.pkl')

# df is the notebook-level DataFrame holding the movie data.
with open(filename, 'wb') as f:
    pickle.dump(df, f)
