# Make sure the existance of all poster images

In [None]:
import pandas as pd
import os

# Check how many posters are missing
downloaded = set(int(f.split('.')[0]) for f in os.listdir('posters_test') if f.endswith('.jpg'))
all_movies = set(pd.read_csv('movies_test.csv')['movieId'])
missing_count = len(all_movies - downloaded)

print(f"Missing posters: {missing_count}")

# Feature extraction and Dim reduction

In [1]:
"""
Feature Extraction and Dimensionality Reduction for Test Posters
Extracts HOG features and applies LDA reduction to 18 dimensions
"""

import cv2
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from skimage.feature import hog
from skimage.color import rgb2gray
import warnings
warnings.filterwarnings('ignore')

# ========================================
# FEATURE EXTRACTION - HOG
# ========================================

class HOGFeatureExtractor:
    """Extract HOG features from movie posters"""
    
    def __init__(self, posters_dir='posters_test', features_dir='posters_test'):
        self.posters_dir = Path(posters_dir)
        self.features_dir = Path(features_dir)
        self.features_dir.mkdir(exist_ok=True)
        
    def extract_hog_features(self, image, orientations=9, pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
        """
        Extract HOG (Histogram of Oriented Gradients) features
        Args:
            image: BGR image from cv2
            orientations: number of orientation bins
            pixels_per_cell: size of a cell
            cells_per_block: number of cells in each block
        Returns:
            HOG feature vector (1764 features)
        """
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        
        # Resize to standard size for consistency
        gray = cv2.resize(gray, (256, 256))
        
        # Extract HOG features
        features = hog(gray, 
                      orientations=orientations, 
                      pixels_per_cell=pixels_per_cell,
                      cells_per_block=cells_per_block,
                      visualize=False, 
                      feature_vector=True)
        
        return features
    
    def load_image(self, image_path):
        """Load and validate image"""
        try:
            image = cv2.imread(str(image_path))
            if image is None:
                return None
            return image
        except Exception as e:
            print(f"Error loading {image_path}: {e}")
            return None
    
    def extract_hog_features_all(self):
        """
        Extract HOG features for all test posters
        Returns:
            DataFrame with movieId and HOG features
        """
        # Get all poster files
        poster_files = sorted(list(self.posters_dir.glob('*.jpg')))
        
        if not poster_files:
            print(f"No poster files found in {self.posters_dir}")
            return None
        
        print(f"Found {len(poster_files)} poster files")
        
        hog_features = []
        movieIds = []
        failed_images = []
        
        # Process each poster
        for poster_file in tqdm(poster_files, desc="Extracting HOG features"):
            # Extract movieId from filename (e.g., "123.jpg" -> 123)
            try:
                movie_id = int(poster_file.stem)
            except ValueError:
                print(f"Skipping invalid filename: {poster_file.name}")
                continue
            
            # Load image
            image = self.load_image(poster_file)
            if image is None:
                failed_images.append(movie_id)
                continue
            
            try:
                # Extract HOG features
                hog_feat = self.extract_hog_features(image)
                
                # Store features
                movieIds.append(movie_id)
                hog_features.append(hog_feat)
                
            except Exception as e:
                print(f"Error processing {poster_file.name}: {e}")
                failed_images.append(movie_id)
                continue
        
        # Create DataFrame
        if hog_features:
            features_array = np.array(hog_features)
            feature_cols = [f'HOG_feat_{i}' for i in range(features_array.shape[1])]
            df = pd.DataFrame(features_array, columns=feature_cols)
            df.insert(0, 'movieId', movieIds)
            
            # Save raw HOG features
            output_path = self.features_dir / 'feature_HOG.csv'
            df.to_csv(output_path, index=False)
            print(f"✓ Saved raw HOG features: {features_array.shape} -> {output_path}")
            
            # Report results
            print(f"\n{'='*60}")
            print(f"HOG Feature Extraction Completed!")
            print(f"Total processed: {len(movieIds)}")
            print(f"Failed images: {len(failed_images)}")
            if failed_images:
                print(f"Failed movieIds: {failed_images[:10]}{'...' if len(failed_images) > 10 else ''}")
            print(f"{'='*60}")
            
            return df
        else:
            print("✗ No features extracted!")
            return None

# ========================================
# DIMENSIONALITY REDUCTION - LDA
# ========================================

class LDA_FromScratch:
    """LDA Implementation from Scratch"""
    
    def __init__(self, n_components=None):
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
    
    def fit(self, X, y):
        """Fit LDA on data X with labels y"""
        n_features = X.shape[1]
        class_labels = np.unique(y)
        n_classes = len(class_labels)
        
        if self.n_components is None:
            self.n_components = min(n_features, n_classes - 1)
        else:
            self.n_components = min(self.n_components, n_classes - 1)
        
        self.mean_ = np.mean(X, axis=0)
        S_W = np.zeros((n_features, n_features))
        S_B = np.zeros((n_features, n_features))
        
        for c in class_labels:
            X_c = X[y == c]
            mean_c = np.mean(X_c, axis=0)
            n_c = X_c.shape[0]
            S_W += np.dot((X_c - mean_c).T, (X_c - mean_c))
            mean_diff = (mean_c - self.mean_).reshape(-1, 1)
            S_B += n_c * np.dot(mean_diff, mean_diff.T)
        
        try:
            S_W_reg = S_W + np.eye(n_features) * 1e-6
            eigenvalues, eigenvectors = np.linalg.eig(np.dot(np.linalg.inv(S_W_reg), S_B))
            idx = np.argsort(np.abs(eigenvalues))[::-1]
            eigenvalues = eigenvalues[idx]
            eigenvectors = eigenvectors[:, idx]
            self.components_ = np.real(eigenvectors[:, :self.n_components]).T
        except np.linalg.LinAlgError:
            print("  ⚠ Warning: LDA encountered numerical issues, using fallback")
            # Fallback to PCA-like approach
            cov_matrix = np.dot(X.T, X) / (X.shape[0] - 1)
            eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
            idx = np.argsort(eigenvalues)[::-1]
            eigenvectors = eigenvectors[:, idx]
            self.components_ = eigenvectors[:, :self.n_components].T
        
        return self
    
    def transform(self, X):
        """Transform data using fitted LDA"""
        return np.dot(X, self.components_.T)
    
    def fit_transform(self, X, y):
        """Fit and transform in one step"""
        self.fit(X, y)
        return self.transform(X)

def load_train_genres(train_csv_path='movies_train.csv'):
    """Load genre information from training data for LDA"""
    df = pd.read_csv(train_csv_path)
    genre_dict = {}
    for _, row in df.iterrows():
        movie_id = row['movieId']
        genres = row['genres']
        if pd.isna(genres) or genres == '(no genres listed)':
            primary_genre = 'Unknown'
        else:
            primary_genre = genres.split('|')[0]
        genre_dict[movie_id] = primary_genre
    return genre_dict

def normalize_features(features):
    """Normalize features using StandardScaler"""
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    normalized = scaler.fit_transform(features)
    return normalized, scaler

def apply_lda_reduction(hog_features_df, train_csv='movies_train.csv', n_components=18):
    """
    Apply LDA dimensionality reduction to HOG features
    """
    print(f"\nApplying LDA reduction to {n_components} dimensions...")
    
    # Load training data for LDA fitting
    train_features_path = 'features/feature_HOG.csv'  # Usar datos de entrenamiento para entrenar LDA
    if not Path(train_features_path).exists():
        print(f"✗ Error: Training features not found at {train_features_path}")
        print("  LDA needs training data to learn the transformation")
        return None
    
    # Cargar características de entrenamiento
    train_hog_df = pd.read_csv(train_features_path)
    train_movie_ids = train_hog_df['movieId'].values
    train_features = train_hog_df.drop('movieId', axis=1).values
    
    # Cargar géneros de entrenamiento
    genre_dict = load_train_genres(train_csv)
    train_labels = np.array([genre_dict.get(mid, 'Unknown') for mid in train_movie_ids])
    
    # Normalizar características de entrenamiento
    train_features_norm, scaler = normalize_features(train_features)
    
    # Entrenar LDA con datos de entrenamiento
    lda = LDA_FromScratch(n_components=n_components)
    lda.fit(train_features_norm, train_labels)
    print(f"✓ LDA trained with {len(np.unique(train_labels))} genre classes")
    
    # Preparar características de test
    test_movie_ids = hog_features_df['movieId'].values
    test_features = hog_features_df.drop('movieId', axis=1).values
    
    # Normalizar características de test con el mismo scaler
    test_features_norm = scaler.transform(test_features)
    
    # Aplicar transformación LDA
    reduced_features = lda.transform(test_features_norm)
    
    print(f"✓ LDA reduction applied: {test_features.shape[1]} -> {reduced_features.shape[1]} dimensions")
    
    return test_movie_ids, reduced_features

def save_reduced_features(movie_ids, reduced_features, output_path):
    """Save reduced features to CSV"""
    n_components = reduced_features.shape[1]
    df = pd.DataFrame(reduced_features, columns=[f'comp_{i}' for i in range(n_components)])
    df.insert(0, 'movieId', movie_ids)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"✓ Saved reduced features: {output_path.name} ({reduced_features.shape[1]} components)")
    return df

# ========================================
# MAIN PIPELINE
# ========================================

def main():
    """Main execution function"""
    print("="*70)
    print("HOG Feature Extraction + LDA Reduction for Test Posters")
    print("="*70)
    
    # Initialize extractor
    extractor = HOGFeatureExtractor(posters_dir='posters_test', features_dir='posters_test')
    
    # Step 1: Extract HOG features
    print("\n1. Extracting HOG features from test posters...")
    hog_df = extractor.extract_hog_features_all()
    
    if hog_df is None:
        print("✗ Failed to extract HOG features")
        return
    
    # Step 2: Apply LDA reduction
    print("\n2. Applying LDA dimensionality reduction...")
    result = apply_lda_reduction(hog_df, n_components=18)
    
    if result is None:
        print("✗ Failed to apply LDA reduction")
        return
    
    movie_ids, reduced_features = result
    
    # Step 3: Save final features
    print("\n3. Saving final reduced features...")
    output_path = Path('features_test') / 'HOG_lda_18d.csv'
    final_df = save_reduced_features(movie_ids, reduced_features, output_path)
    
    # Final summary
    print(f"\n{'='*70}")
    print("✅ PROCESSING COMPLETED!")
    print(f"{'='*70}")
    print(f"Input posters: posters_test/")
    print(f"Raw HOG features: posters_test/feature_HOG.csv")
    print(f"Final features: features_test/HOG_lda_18d.csv")
    print(f"HOG feature dimension: 1764")
    print(f"LDA reduced dimension: 18")
    print(f"Total movies processed: {len(movie_ids)}")
    print(f"{'='*70}")

if __name__ == "__main__":
    main()

HOG Feature Extraction + LDA Reduction for Test Posters

1. Extracting HOG features from test posters...
Found 1497 poster files


Extracting HOG features: 100%|██████████| 1497/1497 [01:07<00:00, 22.05it/s]


✓ Saved raw HOG features: (1497, 8100) -> posters_test\feature_HOG.csv

HOG Feature Extraction Completed!
Total processed: 1497
Failed images: 0

2. Applying LDA dimensionality reduction...

Applying LDA reduction to 18 dimensions...
✓ LDA trained with 19 genre classes
✓ LDA reduction applied: 8100 -> 18 dimensions

3. Saving final reduced features...
✓ Saved reduced features: HOG_lda_18d.csv (18 components)

✅ PROCESSING COMPLETED!
Input posters: posters_test/
Raw HOG features: posters_test/feature_HOG.csv
Final features: features_test/HOG_lda_18d.csv
HOG feature dimension: 1764
LDA reduced dimension: 18
Total movies processed: 1497
