# Enhanced LSA Movie Recommendation System
## Advanced implementation with hybrid features and intelligent recommendations

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


## 1. Data Loading and Enhanced Preprocessing

In [3]:
# Load the processed dataset
df = pd.read_csv('../data/processed/movies_final_processed.csv')
print(f"Dataset shape: {df.shape}")
print(f"Movies loaded: {len(df)}")

# Fill missing text features
df['text_features'] = df['text_features'].fillna('')
df['overview'] = df['overview'].fillna('')

# Display basic info
try:
    unique_genres = set([g for sublist in df['genres_list'].apply(eval) for g in sublist])
    print(f"\nUnique genres: {len(unique_genres)}")
except:
    print("\nGenres processing needed")

print(f"Date range: {df['release_year'].min():.0f} - {df['release_year'].max():.0f}")
df.head()

Dataset shape: (4802, 62)
Movies loaded: 4802

Unique genres: 20
Date range: 1916 - 2017


Unnamed: 0,original_title,budget,genres,homepage,id,keywords,original_language,overview,popularity,production_companies,...,text_features,enhanced_text_features,genre_text,budget_log,revenue_log,popularity_log,runtime_log,vote_count_log,popularity_tier,rating_tier
0,Avatar,0.892462,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.avatarmovie.com/,19995,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,"In the 22nd century, a paraplegic Marine is di...",2.157264,"[{'name': 'Ingenious Film Partners', 'id': 289...",...,22nd century paraplegic marine dispatched moon...,22nd century paraplegic marine dispatched moon...,action adventure fantasy science fiction actio...,19.283571,21.748578,5.020174,5.09375,9.37594,High,Poor
1,Spectre,0.897178,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",en,A cryptic message from Bond‚Äôs past sends him o...,1.869012,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...",...,cryptic message bond past sends trail uncover ...,cryptic message bond past sends trail uncover ...,action adventure crime action adventure crime ...,19.316769,20.596199,4.685614,5.003946,8.404472,High,Poor
2,The Dark Knight Rises,0.900048,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",http://www.thedarkknightrises.com/,49026,"[{'id': 849, 'name': 'dc comics'}, {'id': 853,...",en,Following the death of District Attorney Harve...,1.907387,"[{'name': 'Legendary Pictures', 'id': 923}, {'...",...,following death district attorney harvey dent ...,following death district attorney harvey dent ...,action crime drama thriller action crime drama...,19.336971,20.80479,4.730153,5.111988,9.116799,High,Poor
3,John Carter,0.905619,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://movies.disney.com/john-carter,49529,"[{'id': 818, 'name': 'based on novel'}, {'id':...",en,"John Carter is a war-weary, former military ca...",1.110321,"[{'name': 'Walt Disney Pictures', 'id': 2}]",...,john carter war weary former military captain ...,john carter war weary former military captain ...,action adventure science fiction action advent...,19.376192,19.464974,3.805039,4.890349,7.661527,High,Poor
4,Spider-Man 3,0.904522,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",http://www.sonypictures.com/movies/spider-man3/,559,"[{'id': 851, 'name': 'dual identity'}, {'id': ...",en,The seemingly invincible Spider-Man goes up ag...,1.932762,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...",...,seemingly invincible spider go new crop villai...,seemingly invincible spider go new crop villai...,fantasy action adventure fantasy action advent...,19.36847,20.607711,4.759605,4.941642,8.18228,High,Poor


## 2. Enhanced Text Feature Engineering

In [4]:
# Enhanced text preprocessing with weighted features
def create_enhanced_text_features(df):
    """
    Create enhanced text features with weighted importance for movie recommendation
    """
    enhanced_features = []
    
    for idx, row in df.iterrows():
        # Parse list columns if they're strings
        try:
            genres = eval(row['genres_list']) if isinstance(row['genres_list'], str) else row['genres_list']
            keywords = eval(row['keywords_list']) if isinstance(row['keywords_list'], str) else row['keywords_list']
            cast = eval(row['cast_list']) if isinstance(row['cast_list'], str) else row['cast_list']
            directors = eval(row['director_list']) if isinstance(row['director_list'], str) else row['director_list']
        except:
            genres = keywords = cast = directors = []
        
        # Create weighted text features
        feature_text = (
            str(row['overview']) + ' ' +
            str(row.get('tagline', '')) + ' ' +
            # Genres are very important - repeat 4x
            ' '.join(genres * 4) + ' ' +
            # Keywords are important - repeat 3x
            ' '.join(keywords * 3) + ' ' +
            # Directors are important - repeat 3x
            ' '.join(directors * 3) + ' ' +
            # Cast members - repeat 2x
            ' '.join(cast[:5] * 2) + ' ' +
            # Add release year as context
            f"year_{int(row['release_year'])}" if pd.notna(row['release_year']) else ''
        )
        
        enhanced_features.append(feature_text.strip())
    
    return enhanced_features

# Create enhanced text features
print("Creating enhanced text features...")
df['enhanced_text_features'] = create_enhanced_text_features(df)
print(f"Enhanced text features created for {len(df)} movies")

# Show example
print("\nExample enhanced text features:")
print(f"Movie: {df.iloc[0]['original_title']}")
print(f"Features: {df.iloc[0]['enhanced_text_features'][:200]}...")

Creating enhanced text features...
Enhanced text features created for 4802 movies

Example enhanced text features:
Movie: Avatar
Features: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Enter the World of Pando...


## 3. Advanced TF-IDF Vectorization

In [5]:
# Advanced TF-IDF with optimized parameters
print("Creating advanced TF-IDF vectors...")

tfidf_enhanced = TfidfVectorizer(
    max_features=8000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    sublinear_tf=True,
    norm='l2'
)

tfidf_matrix_enhanced = tfidf_enhanced.fit_transform(df['enhanced_text_features'])
print(f"Enhanced TF-IDF matrix shape: {tfidf_matrix_enhanced.shape}")

# Standard TF-IDF for comparison
tfidf_standard = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix_standard = tfidf_standard.fit_transform(df['text_features'])
print(f"Standard TF-IDF matrix shape: {tfidf_matrix_standard.shape}")

Creating advanced TF-IDF vectors...
Enhanced TF-IDF matrix shape: (4802, 8000)
Standard TF-IDF matrix shape: (4802, 5000)


## 4. Optimized LSA Implementation

In [6]:
# Apply LSA with optimized components
print("Applying optimized LSA...")

lsa_enhanced = TruncatedSVD(n_components=150, random_state=42, algorithm='randomized')
lsa_matrix_enhanced = lsa_enhanced.fit_transform(tfidf_matrix_enhanced)
print(f"Enhanced LSA matrix shape: {lsa_matrix_enhanced.shape}")
print(f"Explained variance ratio: {lsa_enhanced.explained_variance_ratio_.sum():.4f}")

# Standard LSA for comparison
lsa_standard = TruncatedSVD(n_components=100, random_state=42)
lsa_matrix_standard = lsa_standard.fit_transform(tfidf_matrix_standard)
print(f"Standard LSA matrix shape: {lsa_matrix_standard.shape}")

Applying optimized LSA...
Enhanced LSA matrix shape: (4802, 150)
Explained variance ratio: 0.2127
Standard LSA matrix shape: (4802, 100)


## 5. Hybrid Feature Engineering

In [7]:
# Create hybrid features combining LSA with numerical features
print("Creating hybrid features...")

# Numerical features
numeric_cols = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
numeric_features = df[numeric_cols].fillna(0).values

# Combine LSA (80%) + numerical (20%)
lsa_weighted = lsa_matrix_enhanced * 0.8
numeric_weighted = numeric_features * 0.2

hybrid_features_enhanced = np.hstack([lsa_weighted, numeric_weighted])
print(f"Enhanced hybrid features shape: {hybrid_features_enhanced.shape}")

Creating hybrid features...
Enhanced hybrid features shape: (4802, 156)


## 6. Advanced Similarity Computation

In [8]:
# Compute similarity matrices
print("Computing similarity matrices...")

similarity_enhanced = cosine_similarity(hybrid_features_enhanced)
print(f"Enhanced similarity matrix computed: {similarity_enhanced.shape}")

similarity_lsa_only = cosine_similarity(lsa_matrix_enhanced)
print(f"LSA-only similarity matrix computed: {similarity_lsa_only.shape}")

Computing similarity matrices...
Enhanced similarity matrix computed: (4802, 4802)
LSA-only similarity matrix computed: (4802, 4802)


## 7. Advanced Recommendation Engine

In [9]:
class AdvancedMovieRecommender:
    def __init__(self, df, similarity_matrix):
        self.df = df
        self.similarity_matrix = similarity_matrix
        self.movie_to_idx = {title: idx for idx, title in enumerate(df['original_title'])}
    
    def get_recommendations(self, movie_title, n_recommendations=10, min_rating=6.0):
        if movie_title not in self.movie_to_idx:
            return f"Movie '{movie_title}' not found in dataset."
        
        movie_idx = self.movie_to_idx[movie_title]
        sim_scores = list(enumerate(self.similarity_matrix[movie_idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        recommendations = []
        for idx, score in sim_scores[1:]:
            movie_data = self.df.iloc[idx]
            
            if movie_data['vote_average'] < min_rating:
                continue
            
            try:
                genres = eval(movie_data['genres_list']) if isinstance(movie_data['genres_list'], str) else []
            except:
                genres = []
            
            recommendations.append({
                'title': movie_data['original_title'],
                'year': int(movie_data['release_year']) if pd.notna(movie_data['release_year']) else 'Unknown',
                'genres': genres[:3],
                'rating': round(movie_data['vote_average'], 1),
                'similarity_score': round(score, 3)
            })
            
            if len(recommendations) >= n_recommendations:
                break
        
        return recommendations
    
    def analyze_movie_profile(self, movie_title):
        if movie_title not in self.movie_to_idx:
            return f"Movie '{movie_title}' not found in dataset."
        
        movie_idx = self.movie_to_idx[movie_title]
        movie = self.df.iloc[movie_idx]
        
        try:
            genres = eval(movie['genres_list']) if isinstance(movie['genres_list'], str) else []
            directors = eval(movie['director_list']) if isinstance(movie['director_list'], str) else []
        except:
            genres = directors = []
        
        return {
            'title': movie['original_title'],
            'year': int(movie['release_year']) if pd.notna(movie['release_year']) else 'Unknown',
            'genres': genres,
            'rating': round(movie['vote_average'], 1),
            'directors': directors,
            'overview': movie['overview'][:200] + '...' if len(str(movie['overview'])) > 200 else movie['overview']
        }

# Initialize the recommender
print("Initializing Advanced Movie Recommender...")
recommender = AdvancedMovieRecommender(df, similarity_enhanced)
print("Advanced Movie Recommender ready!")

Initializing Advanced Movie Recommender...
Advanced Movie Recommender ready!


## 8. Testing and Demonstration

In [13]:
# Test the recommendation system
test_movies = ['Avatar', 'The Dark Knight', 'Inception', 'Titanic']

print("\nENHANCED LSA MOVIE RECOMMENDATION SYSTEM - DEMO")
print("=" * 70)

for movie in test_movies:
    if movie in recommender.movie_to_idx:
        print(f"\nRECOMMENDATIONS FOR: {movie}")
        print("-" * 50)
        
        # Get movie profile
        profile = recommender.analyze_movie_profile(movie)
        print(f"Movie Profile:")
        print(f"   Year: {profile['year']} | Rating: {profile['rating']}/10")
        print(f"   Genres: {', '.join(profile['genres'])}")
        print(f"   Directors: {', '.join(profile['directors'])}")
        
        # Get recommendations
        recommendations = recommender.get_recommendations(movie, n_recommendations=8)
        
        print(f"\nTop Recommendations:")
        for i, rec in enumerate(recommendations, 1):
            print(f"   {i:2d}. {rec['title']} ({rec['year']})")
            print(f"       Rating: {rec['rating']}/10 | Similarity: {rec['similarity_score']}")
            print(f"       Genres: {', '.join(rec['genres'])}")
        
        print("\n" + "="*50)
    else:
        print(f"\nMovie '{movie}' not found in dataset")


ENHANCED LSA MOVIE RECOMMENDATION SYSTEM - DEMO

RECOMMENDATIONS FOR: Avatar
--------------------------------------------------
Movie Profile:
   Year: 2009 | Rating: 0.9/10
   Genres: Action, Adventure, Fantasy, Science Fiction
   Directors: James Cameron

Top Recommendations:


RECOMMENDATIONS FOR: The Dark Knight
--------------------------------------------------
Movie Profile:
   Year: 2008 | Rating: 1.8/10
   Genres: Drama, Action, Crime, Thriller
   Directors: Christopher Nolan

Top Recommendations:


RECOMMENDATIONS FOR: Inception
--------------------------------------------------
Movie Profile:
   Year: 2010 | Rating: 1.7/10
   Genres: Action, Thriller, Science Fiction, Mystery, Adventure
   Directors: Christopher Nolan

Top Recommendations:


RECOMMENDATIONS FOR: Titanic
--------------------------------------------------
Movie Profile:
   Year: 1997 | Rating: 1.2/10
   Genres: Drama, Romance, Thriller
   Directors: James Cameron

Top Recommendations:



## 9. Save Enhanced Models

In [None]:
# Save all enhanced models
print("\nüíæ SAVING ENHANCED MODELS")
print("=" * 30)

os.makedirs('../models', exist_ok=True)

# Save enhanced components
joblib.dump(tfidf_enhanced, '../models/tfidf_enhanced.pkl')
joblib.dump(lsa_enhanced, '../models/lsa_enhanced.pkl')
joblib.dump(hybrid_features_enhanced, '../models/hybrid_features_enhanced.pkl')
joblib.dump(similarity_enhanced, '../models/similarity_enhanced.pkl')

# Save the recommender
import pickle
with open('../models/advanced_recommender.pkl', 'wb') as f:
    pickle.dump(recommender, f)

# Save enhanced dataframe
df.to_csv('../data/processed/movies_enhanced_df.csv', index=False)

print("Enhanced models saved successfully!")
print("\nSaved files:")
print("   - tfidf_enhanced.pkl")
print("   - lsa_enhanced.pkl")
print("   - hybrid_features_enhanced.pkl")
print("   - similarity_enhanced.pkl")
print("   - advanced_recommender.pkl")
print("   - movies_enhanced_df.csv")

print("\nENHANCED LSA MOVIE RECOMMENDATION SYSTEM COMPLETE!")
print("Key Features:")
print("   Weighted text features (genres 4x, keywords 3x, directors 3x)")
print("   Advanced TF-IDF with n-grams and optimized parameters")
print("   Hybrid similarity (LSA 80% + numerical 20%)")
print("   Quality filtering and intelligent recommendations")
print("   Comprehensive movie analysis and profiling")


üíæ SAVING ENHANCED MODELS
‚úÖ Enhanced models saved successfully!

üìÅ Saved files:
   - tfidf_enhanced.pkl
   - lsa_enhanced.pkl
   - hybrid_features_enhanced.pkl
   - similarity_enhanced.pkl
   - advanced_recommender.pkl
   - movies_enhanced_df.csv

üéâ ENHANCED LSA MOVIE RECOMMENDATION SYSTEM COMPLETE!
üöÄ Key Features:
   ‚úì Weighted text features (genres 4x, keywords 3x, directors 3x)
   ‚úì Advanced TF-IDF with n-grams and optimized parameters
   ‚úì Hybrid similarity (LSA 80% + numerical 20%)
   ‚úì Quality filtering and intelligent recommendations
   ‚úì Comprehensive movie analysis and profiling
