## Data Preprocessing

In [2]:
import pandas as pd

# Read the combined dataset from disk and display its (rows, columns) size.
dataset_path = '../Dataset/combined.csv'
df = pd.read_csv(dataset_path)
df.shape

(45483, 5)

In [3]:
# Count the missing entries in every column (isnull is the pandas alias of isna).
df.isnull().sum()

id              0
keywords    14041
title           0
genres       2256
overview        0
dtype: int64

In [4]:
# Replace every missing value with an empty string so the text columns can be
# cleaned and vectorized without special NaN handling. The result is rebound
# to `df` rather than mutating the frame with inplace=True, which keeps the
# data flow explicit and chain-friendly.
df = df.fillna('')

# Confirm that no column has missing values left.
df.isna().sum()

id          0
keywords    0
title       0
genres      0
overview    0
dtype: int64

In [5]:
import pandas as pd

# Inspect the overview column for three common text problems before cleaning:
# HTML-like tags, characters that are neither word characters nor whitespace,
# and null entries.
overview_text = df['overview']
has_html_tags = overview_text.str.contains('<[^>]+>').any()
has_special_chars = overview_text.str.contains(r'[^\w\s]').any()
missing_count = overview_text.isnull().sum()

print("Contains HTML tags:", has_html_tags)
print("Contains special characters:", has_special_chars)
print("Missing values:", missing_count)

Contains HTML tags: False
Contains special characters: True
Missing values: 0


In [6]:
import re

# Delete every character that is neither a word character nor whitespace from
# each overview; non-string entries are replaced by ''.
# NOTE(review): characters are removed, not replaced by a space, so words that
# were separated only by punctuation end up fused — confirm this is intended.
special_chars = re.compile(r'[^\w\s]')
df['overview'] = df['overview'].apply(
    lambda text: special_chars.sub('', text) if isinstance(text, str) else ''
)

# Repeat the three checks on the cleaned column.
overview_clean = df['overview']
print("Contains HTML tags:", overview_clean.str.contains('<[^>]+>').any())
print("Contains special characters:", overview_clean.str.contains(r'[^\w\s]').any())
print("Missing values:", overview_clean.isnull().sum())

Contains HTML tags: False
Contains special characters: False
Missing values: 0


In [8]:
import re

def _strip_genre_specials(value):
    """Return `value` without non-word, non-whitespace characters ('' for non-strings)."""
    if not isinstance(value, str):
        return ''
    return re.sub(r'[^\w\s]', '', value)


# Apply the same character cleanup to the genres column.
df['genres'] = df['genres'].apply(_strip_genre_specials)

# Report the three checks for the cleaned genres, one line per check.
for label, outcome in (
    ("Contains HTML tags:", df['genres'].str.contains('<[^>]+>').any()),
    ("Contains special characters:", df['genres'].str.contains(r'[^\w\s]').any()),
    ("Missing values:", df['genres'].isnull().sum()),
):
    print(label, outcome)

Contains HTML tags: False
Contains special characters: False
Missing values: 0


In [9]:
import re

# Apply the same cleanup to keywords: drop characters that are neither word
# characters nor whitespace, and turn non-string entries into ''.
keyword_specials = re.compile(r'[^\w\s]')
df['keywords'] = df['keywords'].apply(
    lambda kw: keyword_specials.sub('', kw) if isinstance(kw, str) else ''
)

# Verify the cleaned keywords column with the same three checks.
keywords_clean = df['keywords']
html_found = keywords_clean.str.contains('<[^>]+>').any()
specials_found = keywords_clean.str.contains(r'[^\w\s]').any()
nulls_found = keywords_clean.isnull().sum()
print("Contains HTML tags:", html_found)
print("Contains special characters:", specials_found)
print("Missing values:", nulls_found)

Contains HTML tags: False
Contains special characters: False
Missing values: 0


## TF-IDF Vectorization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit one TfidfVectorizer per text column. Re-fitting a single vectorizer three
# times leaves it holding only the vocabulary and IDF weights of the last
# column, so the genres and keywords features could no longer be reproduced
# for new text or mapped back to their terms.
genres_vectorizer = TfidfVectorizer(stop_words='english')
keywords_vectorizer = TfidfVectorizer(stop_words='english')
overview_vectorizer = TfidfVectorizer(stop_words='english')

genres_tfidf_matrix = genres_vectorizer.fit_transform(df['genres'])
keywords_tfidf_matrix = keywords_vectorizer.fit_transform(df['keywords'])
overview_tfidf_matrix = overview_vectorizer.fit_transform(df['overview'])

# Backward-compatible alias: `vectorizer` previously ended up fitted on the
# overview column, so keep that name pointing at the overview vectorizer.
vectorizer = overview_vectorizer

In [11]:
import os
import pickle as pkl

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('../Models/tfidf', exist_ok=True)

# Persist the genres TF-IDF matrix, then display its shape.
with open('../Models/tfidf/genres_tfidf_mtx.pkl', 'wb') as f:
    pkl.dump(genres_tfidf_matrix, f)
genres_tfidf_matrix.shape

(45483, 22)

In [12]:
# Serialize the keywords TF-IDF matrix to disk, then show its dimensions.
with open('../Models/tfidf/keywords_tfidf_mtx.pkl', 'wb') as out_file:
    pkl.dump(keywords_tfidf_matrix, out_file)
keywords_tfidf_matrix.shape

(45483, 12704)

In [13]:
# Serialize the overview TF-IDF matrix to disk, then show its dimensions.
with open('../Models/tfidf/overview_tfidf_mtx.pkl', 'wb') as out_file:
    pkl.dump(overview_tfidf_matrix, out_file)
overview_tfidf_matrix.shape

(45483, 91346)

In [14]:
from scipy.sparse import hstack

# Stack the per-column TF-IDF matrices side by side into one feature matrix,
# with columns ordered genres, keywords, overview.
tfidf_blocks = [genres_tfidf_matrix, keywords_tfidf_matrix, overview_tfidf_matrix]
combined_matrix = hstack(tfidf_blocks)

In [15]:
# Serialize the combined TF-IDF matrix to disk, then show its dimensions.
with open('../Models/tfidf/combined_tfidf_mtx.pkl', 'wb') as out_file:
    pkl.dump(combined_matrix, out_file)
combined_matrix.shape

(45483, 104072)

In [16]:
from sklearn.preprocessing import normalize

# Rescale each row of the combined sparse matrix to unit L2 norm.
# norm='l2' and axis=1 are normalize()'s defaults, spelled out for the reader.
normalized_matrix = normalize(combined_matrix, norm='l2', axis=1)

In [None]:
# Serialize the row-normalized TF-IDF matrix to disk, then show its dimensions.
with open('../Models/tfidf/normalized_tfidf_mtx.pkl', 'wb') as out_file:
    pkl.dump(normalized_matrix, out_file)
normalized_matrix.shape

In [18]:
from sklearn.decomposition import TruncatedSVD

# Compress the normalized matrix to 100 components with Truncated SVD;
# random_state is pinned so the decomposition is repeatable between runs.
svd = TruncatedSVD(random_state=42, n_components=100)
reduced_matrix = svd.fit_transform(normalized_matrix)

In [None]:
# Serialize the SVD-reduced TF-IDF matrix to disk, then show its dimensions.
with open('../Models/tfidf/reduced_tfidf_mtx.pkl', 'wb') as out_file:
    pkl.dump(reduced_matrix, out_file)
reduced_matrix.shape

In [24]:
from sklearn.neighbors import NearestNeighbors

# Fit a cosine-distance NearestNeighbors model on the SVD-reduced matrix.
# The neighbor count includes the query row itself, so the value in use (21)
# corresponds to 1 query + 20 recommendations.
N_RECOMMENDATIONS = 20
nn_model = NearestNeighbors(n_neighbors=N_RECOMMENDATIONS + 1, metric='cosine')
nn_model.fit(reduced_matrix)

# Persist the fitted model.
# NOTE(review): the fitted `svd` object is not pickled in the cells visible
# here; projecting unseen text into this space would need it — confirm whether
# it is saved elsewhere.
with open('../Models/tfidf/tfidf_knn.pkl', 'wb') as f:
    pkl.dump(nn_model, f)

## Hashing Vectorization

In [29]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash each text column into a fixed-width sparse matrix; n_features sets the
# number of columns (1000). HashingVectorizer is stateless, so calling
# transform() directly gives the same matrices as fit_transform().
hash_vec = HashingVectorizer(stop_words='english', n_features=1000)
genres_hv_matrix = hash_vec.transform(df['genres'])
keywords_hv_matrix = hash_vec.transform(df['keywords'])
overview_hv_matrix = hash_vec.transform(df['overview'])

In [30]:
import os

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('../Models/hashing', exist_ok=True)

# Persist the hashed genres matrix, then display its shape.
with open('../Models/hashing/genres_hv_mtx.pkl', 'wb') as f:
    pkl.dump(genres_hv_matrix, f)
genres_hv_matrix.shape

(45483, 1000)

In [31]:
# Serialize the hashed keywords matrix to disk, then show its dimensions.
with open('../Models/hashing/keywords_hv_mtx.pkl', 'wb') as out_file:
    pkl.dump(keywords_hv_matrix, out_file)
keywords_hv_matrix.shape

(45483, 1000)

In [32]:
# Serialize the hashed overview matrix to disk, then show its dimensions.
with open('../Models/hashing/overview_hv_mtx.pkl', 'wb') as out_file:
    pkl.dump(overview_hv_matrix, out_file)
overview_hv_matrix.shape

(45483, 1000)

In [33]:
from scipy.sparse import hstack

# Stack the hashed matrices side by side, with columns ordered genres,
# keywords, overview.
# NOTE(review): this rebinds `combined_matrix`, replacing the TF-IDF matrix of
# the same name built earlier in the notebook.
hv_blocks = [genres_hv_matrix, keywords_hv_matrix, overview_hv_matrix]
combined_matrix = hstack(hv_blocks)

In [34]:
# Serialize the combined hashed matrix to disk, then show its dimensions.
with open('../Models/hashing/combined_hv_mtx.pkl', 'wb') as out_file:
    pkl.dump(combined_matrix, out_file)
combined_matrix.shape

(45483, 3000)

In [35]:
from sklearn.preprocessing import normalize

# Rescale each row of the combined hashed matrix to unit L2 norm.
# norm='l2' and axis=1 are normalize()'s defaults, spelled out for the reader.
# NOTE(review): this rebinds `normalized_matrix`, replacing the TF-IDF version
# created earlier in the notebook.
normalized_matrix = normalize(combined_matrix, norm='l2', axis=1)

In [36]:
# Serialize the row-normalized hashed matrix to disk, then show its dimensions.
with open('../Models/hashing/normalized_hv_mtx.pkl', 'wb') as out_file:
    pkl.dump(normalized_matrix, out_file)
normalized_matrix.shape

(45483, 3000)

In [37]:
from sklearn.decomposition import TruncatedSVD

# Compress the normalized hashed matrix to 100 components with Truncated SVD;
# random_state is pinned so the decomposition is repeatable between runs.
# NOTE(review): `svd` and `reduced_matrix` are rebound here, so they no longer
# refer to the TF-IDF decomposition that `nn_model` was fitted on earlier.
svd = TruncatedSVD(random_state=42, n_components=100)
reduced_matrix = svd.fit_transform(normalized_matrix)

In [38]:
# Serialize the SVD-reduced hashed matrix to disk, then show its dimensions.
with open('../Models/hashing/reduced_hv_mtx.pkl', 'wb') as out_file:
    pkl.dump(reduced_matrix, out_file)
reduced_matrix.shape

(45483, 100)