In [10]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK stopwords

In [2]:
# Fetch the NLTK 'stopwords' corpus; the recorded output below reports the
# package as already up-to-date on the machine that last ran this cell.
nltk.download('stopwords')
# Imported in this cell rather than with the other imports at the top.
from nltk.corpus import stopwords  # consumed by clean_text() for stop-word filtering

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load dataset

In [3]:
# Read the raw movies table directly from its remote CSV location.
# NOTE(review): the hf:// scheme presumably needs huggingface_hub/fsspec
# installed and network access — confirm for the target environment.
df = pd.read_csv(
    filepath_or_buffer="hf://datasets/Pablinho/movies-dataset/9000plus.csv",
)

# Drop missing overviews

In [4]:
# Keep only the rows whose 'Overview' is present, then renumber them from 0
# so positional row numbers and index labels line up again.
df = (
    df.loc[df['Overview'].notna()]
    .reset_index(drop=True)
)

# Text preprocessing function

In [5]:
def clean_text(text, stop_words=None):
    """Normalise a piece of text for TF-IDF vectorisation.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter, digit or space, split on whitespace, drop stop words,
    and re-join the remaining tokens with single spaces.

    Args:
        text: The string to clean.
        stop_words: Optional iterable of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded
            once on first use and cached on the function, instead of being
            rebuilt for every token.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).

    Notes:
        Characters are deleted, not replaced by a space, so "well-known"
        becomes "wellknown".
        NOTE(review): apostrophes are stripped before filtering, so any
        stop-word entries that contain one (e.g. "don't") presumably never
        match — confirm whether that is intended.
    """
    if stop_words is None:
        # Build the default set lazily and remember it: the corpus list is
        # loop-invariant, and a frozenset gives O(1) membership tests.
        cached = getattr(clean_text, "_default_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            clean_text._default_stop_words = cached
        stop_words = cached
    elif not isinstance(stop_words, (set, frozenset)):
        # Accept any iterable (list, tuple, ...) but look words up in a set.
        stop_words = frozenset(stop_words)

    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)  # Remove special characters
    # Remove stopwords; split()/join also collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply text cleaning

In [6]:
# Run every overview through clean_text and keep the result in a new column,
# leaving the original 'Overview' text untouched.
df['cleaned_overview'] = df['Overview'].map(clean_text)

# TF-IDF Vectorization

In [7]:
# Vectoriser with library-default settings (no custom tokeniser, n-gram range
# or vocabulary limits are configured here).
tfidf = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the cleaned overviews and encode
# them in one pass; the result has one row per row of df.
tfidf_matrix = tfidf.fit_transform(df['cleaned_overview'])

# Compute cosine similarity

In [8]:
# Pairwise cosine similarity of every overview against every other overview.
# With a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# Save processed data

In [12]:
# Persist the table (including the 'cleaned_overview' column) to the current
# working directory; index=False keeps the row index out of the CSV.
df.to_csv("processed_movies.csv", index=False)
# Persist the similarity matrix in NumPy's .npy format next to the CSV.
# NOTE(review): cosine_sim is presumably a dense rows-by-rows array, so this
# file grows quadratically with the dataset — confirm the size is acceptable.
np.save("cosine_similarity.npy", cosine_sim)

# Preprocessing complete. Processed dataset and similarity matrix saved.