## Data preprocessing

### Load the movie dataset

In [13]:
import pandas as pd

# NOTE: machine-specific absolute path -- point this at your own copy of
# movies_metadata.csv before running the notebook on another machine.
file_path = "/Users/ccy/Documents/CMU/Side projects/Simple Content-Based Recommendation/data/movies_metadata.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The stray `df = pd.read_csv(file_path)` line in
# this cell's output appears to be the tail of the mixed-type DtypeWarning
# that the chunked default produces; this setting avoids it.
df = pd.read_csv(file_path, low_memory=False)
# Only the title and overview columns are used for the recommender
overview_df = df[['original_title', 'overview']]

  df = pd.read_csv(file_path)


### NaN value removal

In [14]:
# Report the per-column null counts and the row total, discard every row
# whose overview is missing, then report the same two figures again.
print("Before")
null = overview_df.isna().sum()
print(null, len(overview_df), sep="\n")

print("After")
overview_df = overview_df.dropna(subset=['overview'])
null = overview_df.isna().sum()
print(null, len(overview_df), sep="\n")

Before
original_title      0
overview          954
dtype: int64
45466
After
original_title    0
overview          0
dtype: int64
44512


## Text Vectorization Using TF-IDF


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer:
#   ngram_range=(1, 2)    -> use single words and two-word phrases as features
#   stop_words='english'  -> remove frequent stop words like "the", "is", "about"
#   max_features=2000     -> limit the number of features for efficiency
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=2000,
)

# Learn the vocabulary from the movie overviews and encode them in one pass
tfidf_matrix = vectorizer.fit_transform(overview_df['overview'])
feature_names = vectorizer.get_feature_names_out()
print(tfidf_matrix, feature_names, sep="\n")

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 728352 stored elements and shape (44512, 2000)>
  Coords	Values
  (0, 1029)	0.17568592039600542
  (0, 85)	0.6741819087710951
  (0, 1051)	0.14249782524176377
  (0, 802)	0.2179189601093984
  (0, 1509)	0.18201057513680932
  (0, 189)	0.20263106738094483
  (0, 226)	0.1739726534088134
  (0, 1536)	0.18580505108570203
  (0, 1073)	0.19485664671271302
  (0, 1325)	0.1502505518334643
  (0, 813)	0.1650811133423648
  (0, 320)	0.20532213433021398
  (0, 1572)	0.21293493984007275
  (0, 1282)	0.16911161581767634
  (0, 540)	0.21743226165659393
  (0, 602)	0.17074395854343646
  (0, 1025)	0.16588580076047182
  (1, 1509)	0.17012149686340075
  (1, 1316)	0.1766589951259523
  (1, 502)	0.14567159565103233
  (1, 198)	0.19175634218781873
  (1, 737)	0.4667777554055532
  (1, 1267)	0.19432100609799657
  (1, 522)	0.18103690844163978
  (1, 1095)	0.18655519648186308
  :	:
  (44510, 1015)	0.10084010776338252
  (44510, 865)	0.09474524685312195
  (44510, 372)	0.

## Computing similarity

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(user_query, top_n=5):
    """Return the movies whose overviews best match a free-text query.

    The query is transformed with the fitted ``vectorizer`` and compared with
    every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        user_query: Text describing the kind of movie wanted.
        top_n: Maximum number of recommendations to return. ``0`` yields an
            empty result.

    Returns:
        A DataFrame with the columns ``original_title``, ``overview`` and
        ``similarity_score``, ordered from most to least similar. It holds at
        most ``top_n`` rows. Movies whose similarity is 0 share no vocabulary
        with the query and are left out, so the result may be shorter than
        ``top_n`` or empty (for example when the query is an empty string).

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Transform the user query into a TF-IDF vector
    query_vector = vectorizer.transform([user_query])

    # Cosine similarity against every movie, flattened to a 1D array
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Sort descending first, then truncate. Slicing with [-top_n:] would return
    # the whole array when top_n == 0, because -0 == 0.
    top_indices = similarity_scores.argsort()[::-1][:top_n]

    # A score of 0 means no overlap with the query at all; such rows would be
    # arbitrary picks rather than recommendations, so drop them.
    top_indices = top_indices[similarity_scores[top_indices] > 0]

    # tfidf_matrix rows were produced from overview_df['overview'] in order, so
    # the matrix row numbers are positions in overview_df -> use iloc, not loc.
    recommendations = overview_df.iloc[top_indices][['original_title', 'overview']].copy()
    recommendations['similarity_score'] = similarity_scores[top_indices]  # Append similarity score

    return recommendations


### Example

In [17]:
# A sample free-text description of the kind of movie wanted
query = "I love thrilling action movies set in space, with a comedic twist."
# Look up the best-matching movies and show them with their similarity scores
recommended_movies = recommend_movies(user_query=query)
print(recommended_movies)

                  original_title  \
41138  Hollywood Without Make-Up   
14096                     London   
27514    100 Years at the Movies   
44536               The Farthest   
44219           Manhunt in Space   

                                                overview  similarity_score  
41138  Ken Murray - The Man Who Makes Movies of Peopl...          0.458327  
14096  The first film in Patrick Keiller's Robinson t...          0.441086  
27514  Commemorates the centennial of American movies...          0.423447  
44536   Documentary about NASA's Voyager space programme          0.406321  
44219  Rocky Jones, Space Ranger fights space pirates...          0.400480  


### Your turn!

In [18]:
# Ask for a free-text description, then list the titles of the best matches
query = input(
    "Give me sentences to describe the movies you like, and i will help recommend the movies for you!"
)
recommended_movies = recommend_movies(user_query=query)
recommended_movies['original_title']

38682       Teri Meri Kahaani
5323                Zebrahead
32616                  Equals
37345                  Finale
10623    Mozart and the Whale
Name: original_title, dtype: object