# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ---- 1. Sample Movie Dataset ----
# Each entry pairs a movie title with its one-line plot summary; the ids are
# assigned sequentially, starting at 1, in the order the entries are listed.
_catalog = [
    ('The Matrix',
     'A computer hacker learns about the true nature of his reality.'),
    ('The Godfather',
     'An aging crime boss hands over control of his empire to his reluctant son.'),
    ('Pulp Fiction',
     'The lives of two hitmen, a boxer, and others cross paths in tales of violence and redemption.'),
    ('The Dark Knight',
     'Batman escalates his war on crime against a rising threat.'),
    ('Inception',
     'A thief steals secrets through dream-sharing and is given one last job to pull off.'),
    ('Forrest Gump',
     'The life story of a simple man with a big heart, spanning decades.'),
    ('The Shawshank Redemption',
     'Two imprisoned men bond over a number of years, finding solace and eventual redemption.'),
    ('Fight Club',
     'An insomniac office worker and a soap maker form an underground fight club.'),
    ('Interstellar',
     'A team of explorers travel through a wormhole in space to ensure humanity’s survival.'),
    ('Gladiator',
     'A former Roman General sets out to exact vengeance against the corrupt emperor.'),
]

# Column-oriented view of the catalog, in the shape pd.DataFrame expects.
movie_data = {
    'movie_id': list(range(1, len(_catalog) + 1)),
    'title': [name for name, summary in _catalog],
    'description': [summary for name, summary in _catalog],
}
movies = pd.DataFrame(movie_data)

print("Movie Dataset:")
display(movies)

# ---- 2. Data Preprocessing ----
# Report the number of missing values per column
print("\nMissing values in dataset:")
print(movies.isnull().sum())

# No duplicates or missing data here, but let's show the check:
print("\nChecking duplicates:")
print(f"Duplicates before removal: {movies.duplicated().sum()}")
# Reset the index after dropping rows so that index labels stay 0..n-1 and
# therefore coincide with row positions; later steps index the TF-IDF matrix
# (which is positional) using values taken from this DataFrame's index.
movies = movies.drop_duplicates().reset_index(drop=True)
print(f"Duplicates after removal: {movies.duplicated().sum()}")

# ---- 3. EDA ----
# Character count of every description
movies['desc_length'] = movies['description'].map(len)

# Histogram of the description lengths, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(movies['desc_length'], bins=5, ax=ax)
ax.set_title('Distribution of Movie Description Lengths')
ax.set_xlabel('Description Length (characters)')
plt.show()

# ---- 4. Feature Engineering ----
# Vectorize text descriptions with TF-IDF (English stop words removed).
# Row k of tfidf_matrix corresponds to the k-th row (by position) of `movies`.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies['description'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Project the TF-IDF vectors onto two SVD components for the scatter plot below
svd = TruncatedSVD(n_components=2, random_state=42)
reduced_features = svd.fit_transform(tfidf_matrix)

movies['component_1'] = reduced_features[:, 0]
movies['component_2'] = reduced_features[:, 1]

plt.figure(figsize=(8,6))
sns.scatterplot(x='component_1', y='component_2', data=movies, s=100)
# Walk the three columns in lockstep instead of looking rows up with an
# enumerate counter: `movies.component_1[i]` is a *label* lookup, which raises
# KeyError (or annotates the wrong point) whenever the index is not 0..n-1,
# e.g. after duplicate rows have been dropped.
for comp_x, comp_y, title in zip(movies['component_1'], movies['component_2'], movies['title']):
    # Small offset so the label does not sit on top of its marker
    plt.text(comp_x + 0.03, comp_y + 0.02, title, fontsize=9)
plt.title('2D projection of movie descriptions')
plt.show()

# ---- 5. Model Building (Clustering for grouping similar movies) ----
# n_init is set explicitly: the library default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2/1.3 emit a FutureWarning when it is omitted), so
# pinning it keeps the cluster assignments the same across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
movies['cluster'] = kmeans.fit_predict(tfidf_matrix)

print("\nCluster assignments:")
display(movies[['title', 'cluster']])

# Silhouette score for quality of clusters, computed on the same TF-IDF
# vectors the model was fitted on
score = silhouette_score(tfidf_matrix, movies['cluster'])
print(f"Silhouette Score: {score:.3f}")

# ---- 6. Recommendation Function ----
def recommend_movies(title, top_n=3):
    """Return the movies whose descriptions are most similar to *title*'s.

    Similarity is the cosine similarity between rows of the module-level
    ``tfidf_matrix``; row ``k`` of that matrix is matched to the ``k``-th row
    (by position) of the module-level ``movies`` DataFrame.

    Args:
        title: Exact title of the movie to find recommendations for. If the
            title occurs more than once, the first occurrence is used.
        top_n: Maximum number of recommendations to return. Values below
            zero are treated as zero.

    Returns:
        A DataFrame with the ``title`` and ``description`` columns of the
        most similar movies, most similar first, never including the queried
        movie itself. If *title* is not in the dataset, a message string is
        returned instead.
    """
    # Locate the query by row *position*, not index label: tfidf_matrix and
    # .iloc are positional, and labels differ from positions as soon as the
    # DataFrame index is not 0..n-1.
    matches = np.flatnonzero(movies['title'].to_numpy() == title)
    if matches.size == 0:
        return f"'{title}' not found in movie dataset."

    query_pos = int(matches[0])
    cosine_sim = cosine_similarity(tfidf_matrix[query_pos], tfidf_matrix).flatten()

    # Rank all rows from most to least similar, then drop the query itself
    # explicitly. Slicing off the first element is not reliable: another row
    # with an identical description also scores 1.0 and may sort ahead of it.
    ranked = cosine_sim.argsort()[::-1]
    ranked = ranked[ranked != query_pos]
    similar_indices = ranked[:max(top_n, 0)]

    return movies.iloc[similar_indices][['title', 'description']]

# Demonstrate the recommender on a single title from the dataset
query_title = 'The Matrix'
print(f"\nRecommendations for '{query_title}':")
display(recommend_movies(query_title))

# ---- 7. Visualization of Similarity Matrix ----
# Pairwise cosine similarity of every description against every other one
similarity_matrix = cosine_similarity(tfidf_matrix)

# Annotated heatmap with the movie titles labelling both axes
titles = movies['title']
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarity_matrix,
    xticklabels=titles,
    yticklabels=titles,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    ax=ax,
)
ax.set_title('Cosine Similarity Heatmap Between Movie Descriptions')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()
