# Feature Engineering



## Objectives

*   Engineer features for Regression model


## Inputs

* inputs/datasets/cleaned/test_df_cleaned.pkl
* inputs/datasets/cleaned/train_df_cleaned.pkl

## Outputs

* generate a list with variables to engineer

## Conclusions

* Feature Engineering Transformers
* 
* 

# Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspace/Film_Hit_prediction/jupyter_notebooks'

We want to make the parent of the current directory the new current directory.
* os.path.dirname() gets the parent directory
* os.chir() defines the new current directory

In [2]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
current_dir = os.getcwd()
current_dir


'/workspace/Film_Hit_prediction'

---

# Load Cleaned Data

Train Set

In [4]:
import os
import pandas as pd

# Correct path relative to the current directory
Train_set_path = "/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/train_df_cleaned.pkl"

try:
    TrainSet = pd.read_pickle(Train_set_path)
    print(TrainSet.head(3))
    print("Shape of the dataframe:", TrainSet.shape)
except FileNotFoundError:
    print(f"File not found at path: {Train_set_path}")

File not found at path: /workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/train_df_cleaned.pkl


Test Set

In [5]:
import os
import pandas as pd

# Correct path relative to the current directory
Test_set_path = "/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/test_df_cleaned.pkl"

try:
    TestSet = pd.read_pickle(Test_set_path)
    print(TestSet.head(3))
    print("Shape of the dataframe:", TestSet.shape)
except FileNotFoundError:
    print(f"File not found at path: {Test_set_path}")

File not found at path: /workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/test_df_cleaned.pkl


# Data Exploration

Evaluate potential transformations to be made


In [6]:
import pandas as pd

# Sample a subset of the dataset (e.g., 1000 rows or 10%)
sampled_df = TrainSet.sample(n=min(1000, len(TrainSet)), random_state=42)

# Basic profiling
print("Dataset Overview:")
print(sampled_df.info())

print("\nSummary Statistics:")
print(sampled_df.describe())

print("\nMissing Values:")
print(sampled_df.isnull().sum())

print("\nTop 5 Rows:")
print(sampled_df.head())

# Save the sampled data if needed for further exploration
sampled_df.to_csv('sampled_dataset.csv', index=False)





NameError: name 'TrainSet' is not defined

# Correlation and PPS Analysis

* We don’t expect changes compared to the data cleaning notebook 

# Feature Engineering

## Custom functions

### Function for top actors

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

def top_revenue_actors(TrainSet, TestSet, n_actors=30):

    # Create copies and get cast columns once
    train_copy, test_copy = TrainSet.copy(), TestSet.copy()
    cast_cols = [col for col in TrainSet.columns if col.startswith('cast_')]

    # Calculate multiple metrics for each actor
    actor_metrics = {}
    for col in cast_cols:
        actor_name = col.replace('cast_', '')
        movies_with_actor = TrainSet[TrainSet[col] == 1]
        movies_count = len(movies_with_actor)

        metrics = {
            'movies_count': movies_count,
            'total_revenue': movies_with_actor['revenue'].sum(),
            'avg_revenue': movies_with_actor['revenue'].mean(),
            'revenue_consistency': movies_with_actor['revenue'].std(),
            'hit_rate': (movies_with_actor['revenue'] > movies_with_actor['revenue'].mean()).mean(),
            'avg_popularity': movies_with_actor['popularity'].mean(),
            'popularity_consistency': movies_with_actor['popularity'].std(),
            'revenue_popularity_correlation': movies_with_actor[['revenue', 'popularity']].corr().iloc[0,1]
        }          
        actor_metrics[actor_name] = metrics

         # Calculate composite scores
    for actor, metrics in actor_metrics.items():
        # Calculate normalized metrics
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in actor_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in actor_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in actor_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in actor_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        # Composite score with popularity factors
        metrics['composite_score'] = (
            0.3 * revenue_norm +           # Total revenue importance
            0.2 * avg_norm +               # Average revenue importance
            0.2 * consistency_norm +       # Revenue consistency importance
            0.2 * popularity_norm +        # Popularity importance
            0.1 * correlation_norm         # Revenue-popularity correlation importance
        )

    # Get top actors based on composite score
    top_actors = sorted(actor_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_actors]
    top_actor_cols = [f"cast_{actor}" for actor, _ in top_actors]

    print(f"Number of columns after adding top actor features: {len(train_copy.columns) + 2*n_actors}") 

    # Process both DataFrames
    processed_dfs = []
    for df in [train_copy, test_copy]:
        # Keep original non-cast columns + top actor columns
        keep_cols = [col for col in df.columns if not col.startswith('cast_')] + top_actor_cols
        processed = df[keep_cols].copy()
        
        # Add popularity weighted features for top actors
        for actor_col in top_actor_cols:
            if actor_col in df.columns:
                actor_name = actor_col.replace('cast_', '')
                processed[f"{actor_col}_pop_weight"] = (
                    df[actor_col] * actor_metrics[actor_name]['avg_popularity']
                )
            else:
                processed[actor_col] = 0
                processed[f"{actor_col}_pop_weight"] = 0
        
        # Add other actor count
        other_cast_cols = [col for col in cast_cols if col not in top_actor_cols]
        processed['other_actor_count'] = df[other_cast_cols].sum(axis=1)
        processed_dfs.append(processed)

    # Save top actors data
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_actors.pkl', 'wb') as f:
        pickle.dump({'columns': top_actor_cols, 'metrics': actor_metrics}, f)

    return processed_dfs[0], processed_dfs[1]
    print(f"Final shape - TrainSet_processed: {processed_dfs[0].shape}, TestSet_processed: {processed_dfs[1].shape}")
  
    

In [8]:
TrainSet_processed, TestSet_processed = top_revenue_actors(TrainSet, TestSet, n_actors=30)

Number of columns after adding top actor features: 1191


### Function for top directors

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

def top_revenue_directors(TrainSet, TestSet, n_directors=20):
    # Create copies of input data to avoid modifications
    train_copy = TrainSet.copy()
    test_copy = TestSet.copy()

    # Find top revenue-generating directors
    director_cols = [col for col in TrainSet.columns if col.startswith('crew_Director_')]

    print(f"Number of director columns found: {len(director_cols)}")
    print("First few director columns:", director_cols[:5])
    
    # Calculate multiple metrics for each director
    director_metrics = {}
    for col in director_cols:
        director_name = col.replace('crew_Director_', '')
        movies_with_director = TrainSet[TrainSet[col] == 1]
        movies_count = TrainSet[col].sum()
    
        metrics = {
            'movies_count': movies_count,
            'total_revenue': movies_with_director['revenue'].sum(),
            'avg_revenue': movies_with_director['revenue'].mean(),
            'revenue_consistency': movies_with_director['revenue'].std(),
            'hit_rate': (movies_with_director['revenue'] > movies_with_director['revenue'].mean()).mean(),
            'avg_popularity': movies_with_director['popularity'].mean(),
            'popularity_consistency': movies_with_director['popularity'].std(),
            'revenue_popularity_correlation': movies_with_director[['revenue', 'popularity']].corr().iloc[0,1]
        }
        director_metrics[director_name] = metrics

    # Calculate composite scores
    for director, metrics in director_metrics.items():
        # Calculate normalized metrics
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in director_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in director_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in director_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in director_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        metrics['composite_score'] = (
            0.3 * revenue_norm +           # Total revenue importance
            0.2 * avg_norm +               # Average revenue importance
            0.2 * consistency_norm +       # Revenue consistency importance
            0.2 * popularity_norm +        # Popularity importance
            0.1 * correlation_norm         # Revenue-popularity correlation importance
        )

    # Get top directors based on composite score
    top_directors = sorted(director_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_directors]
    top_director_cols = [f"crew_Director_{director}" for director, _ in top_directors]

    # Process both DataFrames
    processed_dfs = []
    for df in [train_copy, test_copy]:
        # Keep original non-director columns + top director columns
        keep_cols = [col for col in df.columns if not col.startswith('crew_Director_')] + top_director_cols
        processed = df[keep_cols].copy()
        
        # Add popularity weighted features for top directors
        for director_col in top_director_cols:
            if director_col in df.columns:
                director_name = director_col.replace('crew_Director_', '')
                processed[f"{director_col}_pop_weight"] = (
                    df[director_col] * director_metrics[director_name]['avg_popularity']
                )
            else:
                processed[director_col] = 0
                processed[f"{director_col}_pop_weight"] = 0
        
        # Add other director count
        other_director_cols = [col for col in director_cols if col not in top_director_cols]
        processed['other_director_count'] = df[other_director_cols].sum(axis=1)
        processed_dfs.append(processed)

    # Save top directors data
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_directors.pkl', 'wb') as f:
        pickle.dump({'columns': top_director_cols, 'metrics': director_metrics}, f)

    return processed_dfs[0], processed_dfs[1]
    

In [10]:
print("\nStarting director feature engineering...")
TrainSet_processed, TestSet_processed = top_revenue_directors(TrainSet, TestSet, n_directors=20)

# See what the processed data looks like
print("\nFeature Engineering Summary:")
print("-" * 20)
print("Top directors analyzed by:")
print("- Total revenue")
print("- Average popularity")
print("- Revenue-popularity correlation")
print("\nDataset Shapes:")
print(f"Processed train shape: {TrainSet_processed.shape}")
print(f"Processed test shape: {TestSet_processed.shape}")




Starting director feature engineering...
Number of director columns found: 227
First few director columns: ['crew_Director_Aaron Seltzer', 'crew_Director_Adam McKay', 'crew_Director_Adam Shankman', 'crew_Director_Alejandro González Iñárritu', 'crew_Director_Alex Proyas']

Feature Engineering Summary:
--------------------
Top directors analyzed by:
- Total revenue
- Average popularity
- Revenue-popularity correlation

Dataset Shapes:
Processed train shape: (3840, 945)
Processed test shape: (961, 945)


### Function for top writers

In [11]:
def top_revenue_writers(TrainSet, TestSet, n_writers=10):
    # Create copies and get writer columns once
    train_copy, test_copy = TrainSet.copy(), TestSet.copy()
    writer_cols = [col for col in TrainSet.columns if col.startswith('crew_Writer_')]

    print("\nNumber of writer columns found:", len(writer_cols))
    print("First few writer columns:", writer_cols[:5])

    # Calculate writer metrics in one pass
    writer_metrics = {}
    for col in writer_cols:
        writer_name = col.replace('crew_Writer_', '')
        movies_with_writer = TrainSet[TrainSet[col] == 1]
        movies_count = len(movies_with_writer)
        
        metrics = {
            'movies_count': movies_count,
            'total_revenue': movies_with_writer['revenue'].sum(),
            'avg_revenue': movies_with_writer['revenue'].mean(),
            'revenue_consistency': movies_with_writer['revenue'].std(),
            'avg_popularity': movies_with_writer['popularity'].mean(),
            'popularity_consistency': movies_with_writer['popularity'].std(),
            'revenue_popularity_correlation': movies_with_writer[['revenue', 'popularity']].corr().iloc[0,1]
        }
        writer_metrics[writer_name] = metrics

    # Print information about all writers
    print(f"\nFound {len(writer_metrics)} writers")
    print("\nAll writers and their metrics:")
    for writer, metrics in writer_metrics.items():
        print(f"{writer}: {metrics['movies_count']} movies, ${metrics['total_revenue']:,.2f} total revenue")

    if not writer_metrics:
        print("No writers found.")
        return train_copy, test_copy

    # Calculate composite scores
    for writer, metrics in writer_metrics.items():
        max_revenue = max(m['total_revenue'] for m in writer_metrics.values())
        max_avg_revenue = max(m['avg_revenue'] for m in writer_metrics.values())
        max_revenue_consistency = max(m['revenue_consistency'] for m in writer_metrics.values())
        max_popularity = max(m['avg_popularity'] for m in writer_metrics.values())
        
        revenue_norm = metrics['total_revenue'] / max_revenue if max_revenue > 0 else 0
        avg_norm = metrics['avg_revenue'] / max_avg_revenue if max_avg_revenue > 0 else 0
        consistency_norm = 1 - (metrics['revenue_consistency'] / max_revenue_consistency) if max_revenue_consistency > 0 else 0
        popularity_norm = metrics['avg_popularity'] / max_popularity if max_popularity > 0 else 0
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        metrics['composite_score'] = (
            0.3 * revenue_norm +           
            0.2 * avg_norm +               
            0.2 * consistency_norm +       
            0.2 * popularity_norm +        
            0.1 * correlation_norm         
        )

    # Get top writers based on composite score
    top_writers = sorted(writer_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_writers]
    top_writer_cols = [f"crew_Writer_{writer}" for writer, _ in top_writers]

    # Print top writers and their metrics
    print("\nTop writers by composite score:")
    for writer, metrics in top_writers:
        print(f"{writer}:")
        print(f"  Movies: {metrics['movies_count']}")
        print(f"  Total Revenue: ${metrics['total_revenue']:,.2f}")
        print(f"  Avg Revenue: ${metrics['avg_revenue']:,.2f}")
        print(f"  Composite Score: {metrics['composite_score']:.3f}")

    # Process both DataFrames
    processed_dfs = []
    for df in [train_copy, test_copy]:
        # Keep original non-writer columns + top writer columns
        keep_cols = [col for col in df.columns if not col.startswith('crew_Writer_')] + top_writer_cols
        processed = df[keep_cols].copy()
        
        # Add popularity weighted features for top writers
        for writer_col in top_writer_cols:
            if writer_col in df.columns:
                writer_name = writer_col.replace('crew_Writer_', '')
                processed[f"{writer_col}_pop_weight"] = (
                    df[writer_col] * writer_metrics[writer_name]['avg_popularity']
                )
            else:
                processed[writer_col] = 0
                processed[f"{writer_col}_pop_weight"] = 0
        
        # Add other writer count
        other_writer_cols = [col for col in writer_cols if col not in top_writer_cols]
        processed['other_writer_count'] = df[other_writer_cols].sum(axis=1)
        processed_dfs.append(processed)

    # Save top writers data
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_writers.pkl', 'wb') as f:
        pickle.dump({'columns': top_writer_cols, 'metrics': writer_metrics}, f)

    return processed_dfs[0], processed_dfs[1]

In [12]:
# Call the writer function
print("\nStarting writer feature engineering...")
TrainSet_processed, TestSet_processed = top_revenue_writers(TrainSet, TestSet, n_writers=10)

## See what the processed data looks like
print("\nFeature Engineering Summary:")
print("-" * 5)
print("Top writers analyzed by:")
print("- Total revenue")
print("- Average popularity")
print("- Revenue-popularity correlation")
print("\nDataset Shapes:")
print(f"Processed train shape: {TrainSet_processed.shape}")
print(f"Processed test shape: {TestSet_processed.shape}")


Starting writer feature engineering...

Number of writer columns found: 11
First few writer columns: ['crew_Writer_David Zucker', 'crew_Writer_Ethan Coen', 'crew_Writer_Joel Coen', 'crew_Writer_Kevin Smith', 'crew_Writer_Luc Besson']

Found 11 writers

All writers and their metrics:
David Zucker: 4 movies, $216,441,753.00 total revenue
Ethan Coen: 3 movies, $128,013,309.00 total revenue
Joel Coen: 3 movies, $128,013,309.00 total revenue
Kevin Smith: 4 movies, $76,707,267.00 total revenue
Luc Besson: 4 movies, $277,191,408.00 total revenue
M. Night Shyamalan: 5 movies, $1,904,372,773.00 total revenue
Mike Leigh: 4 movies, $23,529,762.00 total revenue
Quentin Tarantino: 4 movies, $504,229,064.00 total revenue
Robert Rodriguez: 5 movies, $275,670,551.00 total revenue
Tyler Perry: 4 movies, $147,739,860.00 total revenue
Woody Allen: 5 movies, $358,668,130.00 total revenue

Top writers by composite score:
M. Night Shyamalan:
  Movies: 5
  Total Revenue: $1,904,372,773.00
  Avg Revenue: $38

### Function for top producers

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
 
def top_revenue_producers(TrainSet, TestSet, n_producers=20, min_movies=10):

    # Create copies and get producer columns once
    train_copy, test_copy = TrainSet.copy(), TestSet.copy()
    producer_cols = [col for col in TrainSet.columns if col.startswith('crew_Producer_')]

    print("\nNumber of producer columns found:", len(producer_cols))
    print("First few producer columns:", producer_cols[:5])

    

    # Calculate producer metrics in one pass
    producer_metrics = {}
    for col in producer_cols:
        producer_name = col.replace('crew_Producer_', '')
        movies_with_producer = TrainSet[TrainSet[col] == 1]
        movies_count = len(movies_with_producer)
        
        metrics = {
            'movies_count': movies_count,
            'total_revenue': movies_with_producer['revenue'].sum(),
            'avg_revenue': movies_with_producer['revenue'].mean(),
            'revenue_consistency': movies_with_producer['revenue'].std(),
            'avg_popularity': movies_with_producer['popularity'].mean(),
            'popularity_consistency': movies_with_producer['popularity'].std(),
            'revenue_popularity_correlation': movies_with_producer[['revenue', 'popularity']].corr().iloc[0,1]
        }
        producer_metrics[producer_name] = metrics

    # Filter producers with minimum movies threshold
    filtered_producer_metrics = {
        producer: metrics 
        for producer, metrics in producer_metrics.items() 
        if metrics['movies_count'] >= min_movies
    }

    print(f"\nProducers before filtering: {len(producer_metrics)}")
    print(f"Producers after filtering (min {min_movies} movies): {len(filtered_producer_metrics)}")


    # Calculate composite scores
    for producer, metrics in producer_metrics.items():
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in producer_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in producer_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in producer_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in producer_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        metrics['composite_score'] = (
            0.3 * revenue_norm +           
            0.2 * avg_norm +               
            0.2 * consistency_norm +       
            0.2 * popularity_norm +        
            0.1 * correlation_norm         
        )

    # Get top producers based on composite score
    top_producers = sorted(producer_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_producers]
    top_producer_cols = [f"crew_Producer_{producer}" for producer, _ in top_producers]

    # Process both DataFrames
    processed_dfs = []
    for df in [train_copy, test_copy]:
        # Keep original non-producer columns + top producer columns
        keep_cols = [col for col in df.columns if not col.startswith('crew_Producer_')] + top_producer_cols
        processed = df[keep_cols].copy()
        
        # Add popularity weighted features for top producers
        for producer_col in top_producer_cols:
            if producer_col in df.columns:
                producer_name = producer_col.replace('crew_Producer_', '')
                processed[f"{producer_col}_pop_weight"] = (
                    df[producer_col] * producer_metrics[producer_name]['avg_popularity']
                )
            else:
                processed[producer_col] = 0
                processed[f"{producer_col}_pop_weight"] = 0
        
        # Add other producer count
        other_producer_cols = [col for col in producer_cols if col not in top_producer_cols]
        processed['other_producer_count'] = df[other_producer_cols].sum(axis=1)
        processed_dfs.append(processed)

    # Save top producers data
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_producers.pkl', 'wb') as f:
        pickle.dump({'columns': top_producer_cols, 'metrics': producer_metrics}, f)

    return processed_dfs[0], processed_dfs[1]

In [14]:

# Call the producer function
print("\nStarting producer feature engineering...")
TrainSet_processed, TestSet_processed = top_revenue_producers(TrainSet, TestSet, n_producers=20)

## See what the processed data looks like
print("\nFeature Engineering Summary:")
print("-" * 20)
print("Top producers analyzed by:")
print("- Total revenue")
print("- Average popularity")
print("- Revenue-popularity correlation")
print("\nDataset Shapes:")
print(f"Processed train shape: {TrainSet_processed.shape}")
print(f"Processed test shape: {TestSet_processed.shape}")



Starting producer feature engineering...

Number of producer columns found: 462
First few producer columns: ['crew_Producer_A. Kitman Ho', 'crew_Producer_Aaron Ryder', 'crew_Producer_Adam McKay', 'crew_Producer_Adam Sandler', 'crew_Producer_Adam Shankman']

Producers before filtering: 462
Producers after filtering (min 10 movies): 77

Feature Engineering Summary:
--------------------
Top producers analyzed by:
- Total revenue
- Average popularity
- Revenue-popularity correlation

Dataset Shapes:
Processed train shape: (3840, 710)
Processed test shape: (961, 710)


MAIN ENGINEERED FUNCTION 
* combining all

In [15]:
def engineer_movie_features(TrainSet, TestSet):
    print("Starting feature engineering...")
    print(f"Initial columns: {TrainSet.shape[1]}")

    # Load encoders
    try:
        with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/encoders_and_filters.pkl', 'rb') as f:
            encoders_and_filters = pickle.load(f)
            print("Successfully loaded encoders and filters from cleaning stage")
    except Exception as e:
        print(f"Error loading encoders: {str(e)}")
        return None

    train_processed = TrainSet.copy()  
    test_processed = TestSet.copy()

    # Get genre columns 
    genre_columns = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 
                    'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 
                    'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 
                    'TV Movie', 'Thriller', 'War', 'Western']
    print(f"Using {len(genre_columns)} genre columns: {genre_columns}")

    # Remove movies with missing revenue
    train_processed = train_processed.dropna(subset=['revenue','runtime','budget'])
    test_processed = test_processed.dropna(subset=['revenue','runtime','budget'])

    # 1. BUDGET FEATURES
    print("\nEngineering budget features...")
    
    # First create budget_per_minute BEFORE trying to scale it
    train_processed['budget_per_minute'] = (train_processed['budget'] / train_processed['runtime'].replace(0, 1)).fillna(0)
    test_processed['budget_per_minute'] = test_processed['budget'] / test_processed['runtime'].replace(0, 1)
    
    # Now identify numerical columns to scale
    numeric_cols = [
        'budget', 
        'runtime', 
        'popularity',
        'budget_per_minute',  # Now this column exists
    ]

    # 2. RUNTIME FEATURES
    print("Engineering runtime features...")
    train_processed = train_processed[train_processed['runtime'] >= 90]
    test_processed = test_processed[test_processed['runtime'] >= 90]

    # 3. CAST/CREW FEATURES
    train_processed, test_processed = top_revenue_actors(train_processed, test_processed, n_actors=30)
    train_processed, test_processed = top_revenue_directors(train_processed, test_processed, n_directors=20)
    train_processed, test_processed = top_revenue_writers(train_processed, test_processed, n_writers=10)
    train_processed, test_processed = top_revenue_producers(train_processed, test_processed, n_producers=20)

    # Add popularity weight columns to numeric_cols AFTER crew features are added
    pop_weight_cols = [col for col in train_processed.columns if col.endswith('_pop_weight')]
    numeric_cols.extend(pop_weight_cols)

    # 4. LANGUAGE FEATURES
    english_code = encoders_and_filters['language_encoder'].transform(['en'])[0]
    train_processed['is_english'] = (train_processed['language_encoded'] == english_code).astype(int)
    test_processed['is_english'] = (test_processed['language_encoded'] == english_code).astype(int)



    # Before scaling, check for problematic columns
    print("\nChecking for problematic columns before scaling:")
    columns_to_exclude = []
    for col in train_processed.columns: 
        unique_vals = train_processed[col].nunique()
        has_nan = train_processed[col].isna().any()
        try:
            variance = train_processed[col].var()
        except:
            variance = 0
        if unique_vals == 1 or has_nan or variance == 0:
            print(f"\nPotentially problematic column: {col}")
            print(f"Unique values: {unique_vals}")
            print(f"Has NaN: {has_nan}")
            print(f"Variance: {variance}")
            if col in numeric_cols:
                columns_to_exclude.append(col)

    for col in numeric_cols:
        if col in train_processed.columns:
            train_processed[col] = train_processed[col].fillna(0)
            test_processed[col] = test_processed[col].fillna(0)

    columns_to_exclude = []
    # ... check for problematic columns ...
    numeric_cols = [col for col in numeric_cols if col not in columns_to_exclude]

    
    # 5. SCALING - now all columns exist before scaling
    print("Scaling numerical features...")
    scaler = StandardScaler()
    train_processed[numeric_cols] = scaler.fit_transform(train_processed[numeric_cols])
    test_processed[numeric_cols] = scaler.transform(test_processed[numeric_cols])

    # Save the scaler
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/feature_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    # Save all transformation data
    transformation_data = {
        'feature_scaler': scaler,
        'encoders_and_filters': encoders_and_filters,
        'numeric_cols': numeric_cols,
        'genre_columns': genre_columns,
        'all_features': list(train_processed.columns),
        'train_stats': {
            'budget_mean': TrainSet['budget'].mean(),
            'budget_std': TrainSet['budget'].std(),
            'revenue_mean': TrainSet['revenue'].mean(),
            'revenue_std': TrainSet['revenue'].std()
        }
    }
    
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/full_transformation_data.pkl', 'wb') as f:
        pickle.dump(transformation_data, f)

    # Rest of your function remains the same...
    
    return train_processed , test_processed

In [16]:
engineer_movie_features(TrainSet,TestSet)

Starting feature engineering...
Initial columns: 1131
Successfully loaded encoders and filters from cleaning stage
Using 19 genre columns: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']

Engineering budget features...
Engineering runtime features...
Number of columns after adding top actor features: 1192
Number of director columns found: 227
First few director columns: ['crew_Director_Aaron Seltzer', 'crew_Director_Adam McKay', 'crew_Director_Adam Shankman', 'crew_Director_Alejandro González Iñárritu', 'crew_Director_Alex Proyas']

Number of writer columns found: 11
First few writer columns: ['crew_Writer_David Zucker', 'crew_Writer_Ethan Coen', 'crew_Writer_Joel Coen', 'crew_Writer_Kevin Smith', 'crew_Writer_Luc Besson']

Found 11 writers

All writers and their metrics:
David Zucker: 0 movies, $0.00 total revenue
Ethan C

(        budget    revenue   runtime  language_encoded  popularity  Action  \
 3262 -0.743069   25288872 -0.933870                 7   -0.240590       0   
 4578 -0.743069          0 -1.033493                 7   -0.598643       0   
 1774 -0.137597    6673422 -0.385946                 7   -0.444570       0   
 1957 -0.404004   34670720  0.361223                 7   -0.255567       0   
 4288 -0.712795    6000000  0.311411                 7   -0.463468       0   
 ...        ...        ...       ...               ...         ...     ...   
 4426 -0.743069          0 -1.083304                 7   -0.679507       0   
 466   1.194442  123729176 -0.784437                 7    0.096981       1   
 3092 -0.500880    7022728 -0.784437                 7   -0.240274       0   
 3772 -0.646193    2426851 -1.083304                 7   -0.512779       0   
 860   0.588970   89519773  0.361223                 7    0.001468       0   
 
       Adventure  Animation  Comedy  Crime  ...  \
 3262      

## Feature Engineering Spreadsheet Summary


- Languages are properly encoded using LabelEncoder
- Genre columnes are already one-hot encoded
- Budget is both log- transformed and scaled
- Saved the encoders and scalers
- Target variable (revenue) is Lon-transformed to handle skewness and scaled using StandardScaler
- Processed datasets are saved.
    



# PUSH TO REPO

Save as one file

In [17]:

# Run feature engineering
print("Starting feature engineering process...")
train_processed, test_processed = engineer_movie_features(TrainSet, TestSet)

if train_processed is not None and test_processed is not None:
    print("\nFeature Engineering Results:")
    print(f"Final TrainSet shape: {train_processed.shape}")
    print(f"Final TestSet shape: {test_processed.shape}")
    
    # Save the processed datasets
    print("\nSaving processed datasets...")
    train_processed.to_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/train_df_engineered.pkl')
    test_processed.to_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/test_df_engineered.pkl')

Starting feature engineering process...
Starting feature engineering...
Initial columns: 1131
Successfully loaded encoders and filters from cleaning stage
Using 19 genre columns: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']

Engineering budget features...
Engineering runtime features...
Number of columns after adding top actor features: 1192
Number of director columns found: 227
First few director columns: ['crew_Director_Aaron Seltzer', 'crew_Director_Adam McKay', 'crew_Director_Adam Shankman', 'crew_Director_Alejandro González Iñárritu', 'crew_Director_Alex Proyas']

Number of writer columns found: 11
First few writer columns: ['crew_Writer_David Zucker', 'crew_Writer_Ethan Coen', 'crew_Writer_Joel Coen', 'crew_Writer_Kevin Smith', 'crew_Writer_Luc Besson']

Found 11 writers

All writers and their metrics:
David Zucke

Split into X and Y 

In [19]:
# Load the saved processed datasets
train_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/train_df_engineered.pkl')
test_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/test_df_engineered.pkl')

print("\nFinal shapes after loading:")
print(f"TrainSet shape: {train_processed.shape}")
print(f"TestSet shape: {test_processed.shape}")

# Separate features (X) and target (y)
feature_columns = list(set([col for col in train_processed.columns if col != 'revenue']))

print(f"\nNumber of total feature columns: {len(feature_columns)}")

X_train = train_processed[feature_columns]
y_train = train_processed['revenue']

X_test = test_processed[feature_columns]
y_test = test_processed['revenue']

print(f"\nShapes after splitting:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Save the splits in the engineered directory
output_dir = '/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered'

print("\nSaving splits...")
X_train.to_pickle(f'{output_dir}/X_train.pkl')
X_test.to_pickle(f'{output_dir}/X_test.pkl')
y_train.to_pickle(f'{output_dir}/y_train.pkl')
y_test.to_pickle(f'{output_dir}/y_test.pkl')

print("Splits saved successfully!")

print(f"Training data shape: {train_processed.shape}")
print(f"Test data shape: {test_processed.shape}")
print("\nFeature engineering completed!")







Final shapes after loading:
TrainSet shape: (3284, 270)
TestSet shape: (813, 270)

Number of total feature columns: 269

Shapes after splitting:
X_train shape: (3284, 269)
y_train shape: (3284,)
X_test shape: (813, 269)
y_test shape: (813,)

Saving splits...
Splits saved successfully!
Training data shape: (3284, 270)
Test data shape: (813, 270)

Feature engineering completed!


Split into X and Y

In [None]:
# Load the saved processed datasets
train_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/test_df_engineered.pkl')
test_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/train_df_engineered.pkl')

print("Column names sample:")
print(list(train_processed.columns)[:10])  # Print first 10 column names


# Separate features (X) and target (y)
feature_columns = [col for col in train_processed.columns if col != 'revenue']

X_train = train_processed[feature_columns]
y_train = train_processed['revenue']

X_test = test_processed[feature_columns]
y_test = test_processed['revenue']


# Save the splits in the engineered directory
output_dir = '/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered'

# Print the shapes of the final datasets
print("\nDataset shapes:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Print a few sample rows from the datasets to inspect
print("\nSample rows from X_train:")
print(X_train.head())

print("\nSample rows from y_train:")
print(y_train.head())

print("\nSample rows from X_test:")
print(X_test.head())

print("\nSample rows from y_test:")
print(y_test.head())

# Print the list of final features
print("\nFeatures included in the final dataset:")
for feature in sorted(feature_columns):
    print(f"- {feature}")

print("\nFeature engineering completed!")
print(f"Training data shape: {train_processed.shape}")
print(f"Test data shape: {test_processed.shape}")

In [None]:
print(f"Number of unique feature columns: {len(set(feature_columns))}")
print(f"Number of feature columns: {len(feature_columns)}")


In [None]:
# Check for duplicate columns
print(f"Number of unique columns in TrainSet: {len(train_processed.columns.unique())}")
print(f"Number of columns in TrainSet: {len(train_processed.columns)}")
