# Feature Engineering



## Objectives

*   Engineer features for Regression model


## Inputs

* inputs/datasets/cleaned/test_df_cleaned.pkl
* inputs/datasets/cleaned/train_df_cleaned.pkl

## Outputs

* generate a list with variables to engineer

## Conclusions

* Feature Engineering Transformers
* 
* 

# Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspace/Film_Hit_prediction/jupyter_notebooks'

We want to make the parent of the current directory the new current directory.
* os.path.dirname() gets the parent directory
* os.chir() defines the new current directory

In [2]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
current_dir = os.getcwd()
current_dir


'/workspace/Film_Hit_prediction'

---

# Load Cleaned Data

Train Set

In [4]:
import os
import pandas as pd

# Correct path relative to the current directory
Train_set_path = "/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/train_df_cleaned.pkl"

try:
    TrainSet = pd.read_pickle(Train_set_path)
    print(TrainSet.head(3))
    print("Shape of the dataframe:", TrainSet.shape)
except FileNotFoundError:
    print(f"File not found at path: {Train_set_path}")

        budget   revenue  runtime  language_encoded  popularity  Action  \
2851  12000000  17292381     86.0                 7    1.081822       0   
3262         0  25288872     93.0                 7   14.969093       0   
4578         0         0     91.0                 7    3.291609       0   

      Adventure  Animation  Comedy  Crime  ...  cast_Whoopi Goldberg  \
2851          0          0       1      0  ...                     0   
3262          0          0       1      0  ...                     0   
4578          0          0       0      0  ...                     0   

      cast_Will Ferrell  cast_Will Smith  cast_Willem Dafoe  \
2851                  0                0                  0   
3262                  0                0                  0   
4578                  0                0                  0   

      cast_William Fichtner  cast_William H. Macy  cast_Winona Ryder  \
2851                      0                     0                  0   
3262         

Test Set

In [5]:
import os
import pandas as pd

# Correct path relative to the current directory
Test_set_path = "/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/test_df_cleaned.pkl"

try:
    TestSet = pd.read_pickle(Test_set_path)
    print(TestSet.head(3))
    print("Shape of the dataframe:", TestSet.shape)
except FileNotFoundError:
    print(f"File not found at path: {Test_set_path}")

        budget   revenue  runtime  language_encoded  popularity  Action  \
596   70000000  33561137     97.0                 7   13.267631       1   
4507    560000  12299668     88.0                 7   10.730056       0   
3049         0         0     89.0                 7    5.842299       0   

      Adventure  Animation  Comedy  Crime  ...  cast_Whoopi Goldberg  \
596           1          0       1      0  ...                     0   
4507          0          0       1      0  ...                     0   
3049          0          0       1      0  ...                     0   

      cast_Will Ferrell  cast_Will Smith  cast_Willem Dafoe  \
596                   0                0                  0   
4507                  0                0                  0   
3049                  0                0                  0   

      cast_William Fichtner  cast_William H. Macy  cast_Winona Ryder  \
596                       0                     0                  0   
4507         

# Data Exploration

Evaluate potential transformations to be made


In [6]:
import pandas as pd

# Load your dataset (replace with your dataset path)
# Example: TrainSet = pd.read_csv('your_dataset.csv')

# Sample a subset of the dataset (e.g., 1000 rows or 10%)
sampled_df = TrainSet.sample(n=min(1000, len(TrainSet)), random_state=42)

# Basic profiling
print("Dataset Overview:")
print(sampled_df.info())

print("\nSummary Statistics:")
print(sampled_df.describe())

print("\nMissing Values:")
print(sampled_df.isnull().sum())

print("\nTop 5 Rows:")
print(sampled_df.head())

# Save the sampled data if needed for further exploration
sampled_df.to_csv('sampled_dataset.csv', index=False)





Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 2638 to 1509
Columns: 1269 entries, budget to cast_Zooey Deschanel
dtypes: float64(2), int64(1248), uint8(19)
memory usage: 9.6 MB
None

Summary Statistics:
             budget       revenue      runtime  language_encoded   popularity  \
count  1.000000e+03  1.000000e+03  1000.000000       1000.000000  1000.000000   
mean   2.866518e+07  7.805203e+07   106.956000          7.453000    21.753048   
std    4.092802e+07  1.593674e+08    23.580121          3.018095    39.219849   
min    0.000000e+00  0.000000e+00     0.000000          1.000000     0.004998   
25%    6.437500e+05  0.000000e+00    94.000000          7.000000     5.010213   
50%    1.400000e+07  1.792979e+07   104.000000          7.000000    12.437092   
75%    3.825000e+07  9.008415e+07   117.000000          7.000000    27.921399   
max    2.800000e+08  1.519558e+09   338.000000         36.000000   875.581305   

            Action   Adventure 

# Correlation and PPS Analysis

* We don’t expect changes compared to the data cleaning notebook 

# Feature Engineering

## Custom function

Function for top actors

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

def top_revenue_actors(TrainSet, TestSet, n_actors=50):

    # Create copies of input data to avoid modifications
    train_copy = TrainSet.copy()
    test_copy = TestSet.copy()
    
    #Find actors most correlated with high revenue
    cast_cols = [col for col in TrainSet.columns if col.startswith('cast_')]

    # Calculate multiple metrics for each actor
    actor_metrics = {}
    for col in cast_cols:
        actor_name = col.replace('cast_', '')
        movies_count = TrainSet[col].sum()
        
        if movies_count >= 10:  
            movies_with_actor = TrainSet[TrainSet[col] == 1]
            
            metrics = {
                'movies_count': movies_count,
                'total_revenue': movies_with_actor['revenue'].sum(),
                'avg_revenue': movies_with_actor['revenue'].mean(),
                'revenue_consistency': movies_with_actor['revenue'].std(),
                'hit_rate': (movies_with_actor['revenue'] > movies_with_actor['revenue'].mean()).mean(),
                'avg_popularity': movies_with_actor['popularity'].mean(),
                'popularity_consistency': movies_with_actor['popularity'].std(),
                'revenue_popularity_correlation': movies_with_actor[['revenue', 'popularity']].corr().iloc[0,1]
            }
            
            actor_metrics[actor_name] = metrics
    
    # Sort actors by different metrics and print insights
    print("\nTop actors by total revenue:")
    top_by_total = sorted(actor_metrics.items(), key=lambda x: x[1]['total_revenue'], reverse=True)[:20]
    for actor, metrics in top_by_total:
        print(f"- {actor}: Total revenue ${metrics['total_revenue']:,.2f} ({metrics['movies_count']} movies)")
        
    print("\nTop actors by average popularity (minimum 10 movies):")
    top_by_popularity = [(actor, metrics) for actor, metrics in actor_metrics.items() 
                        if metrics['movies_count'] >= 10]
    top_by_popularity = sorted(top_by_popularity, key=lambda x: x[1]['avg_popularity'], reverse=True)[:20]
    for actor, metrics in top_by_popularity:
        print(f"- {actor}: Avg popularity {metrics['avg_popularity']:.2f} ({metrics['movies_count']} movies)")
        
    print("\nActors with strongest revenue-popularity correlation:")
    top_by_correlation = [(actor, metrics) for actor, metrics in actor_metrics.items() 
                         if metrics['movies_count'] >= 10]
    top_by_correlation = sorted(top_by_correlation, key=lambda x: abs(x[1]['revenue_popularity_correlation']), reverse=True)[:20]
    for actor, metrics in top_by_correlation:
        print(f"- {actor}: Correlation {metrics['revenue_popularity_correlation']:.3f}")

    # Enhanced composite score including popularity metrics
    for actor in actor_metrics:
        metrics = actor_metrics[actor]
        # Normalize each metric between 0 and 1
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in actor_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in actor_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in actor_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in actor_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        # Composite score with popularity factors
        metrics['composite_score'] = (
            0.3 * revenue_norm +           # Total revenue importance
            0.2 * avg_norm +               # Average revenue importance
            0.2 * consistency_norm +       # Revenue consistency importance
            0.2 * popularity_norm +        # Popularity importance
            0.1 * correlation_norm         # Revenue-popularity correlation importance
        )

    # Get top actors based on composite score
    top_actors = sorted(actor_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_actors]
    
    # Convert to column names for processing
    top_actor_cols = [f"cast_{actor}" for actor, _ in top_actors]

    print(f"Number of columns after adding top actor features: {len(train_copy.columns) + 2*n_actors}") 

    # Create actor_metrics_dict once
    actor_metrics_dict = {name: metrics for name, metrics in actor_metrics.items()}

    # Process train and test data
    processed_dfs = []
    for df in [train_copy, test_copy]:
        new_columns = {}
        
        # Add top actor columns
        for actor_col in top_actor_cols:
            if actor_col in df.columns:
                new_columns[actor_col] = df[actor_col]
                actor_name = actor_col.replace('cast_', '')
                if actor_name in actor_metrics:
                    new_columns[f"{actor_col}_pop_weight"] = (
                        df[actor_col] * actor_metrics[actor_name]['avg_popularity']
                    )
            else:
                new_columns[actor_col] = pd.Series(0, index=df.index)
                new_columns[f"{actor_col}_pop_weight"] = pd.Series(0, index=df.index)
    
        
        # Calculate other_actor_count and their average popularity
        all_cast_cols = [col for col in df.columns if col.startswith('cast_')]
        other_cast_cols = [col for col in all_cast_cols if col not in top_actor_cols]
        new_columns['other_actor_count'] = df[other_cast_cols].sum(axis=1)

        # Keep original columns that we want to keep
        keep_cols = [col for col in df.columns 
                    if not col.startswith('cast_') or col in top_actor_cols]
        
        # Create dataframes for concatenation
        original_df = df[keep_cols].reset_index(drop=True)
        new_df = pd.DataFrame(new_columns, index=df.index).reset_index(drop=True)

        print(f"Unique columns before concat: {len(set(original_df.columns) | set(new_df.columns))}")

       

        overlapping_cols = set(original_df.columns).intersection(new_df.columns)
        print(f"Overlapping columns before concat: {len(overlapping_cols)}")

         # Drop overlapping columns from new_df to avoid duplication
        new_df = new_df.drop(columns=overlapping_cols)
        
        print(f"Shape before concat - original_df: {original_df.shape}, new_df: {new_df.shape}")

        # Combine using concat
        processed = pd.concat([original_df, new_df], axis=1)

        print(f"Shape after concat: {processed.shape}")

        
        processed_dfs.append(processed)
        
    # Save top actors and their metrics for future use
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_actors.pkl', 'wb') as f:
        pickle.dump({'columns': top_actor_cols, 'metrics': actor_metrics}, f)
    
    print(f"Final shape - TrainSet_processed: {processed_dfs[0].shape}, TestSet_processed: {processed_dfs[1].shape}")
    return processed_dfs[0], processed_dfs[1]
    

In [8]:
TrainSet_processed, TestSet_processed = top_revenue_actors(TrainSet, TestSet, n_actors=50)


Top actors by total revenue:
- Stan Lee: Total revenue $13,030,512,458.00 (20 movies)
- John Ratzenberger: Total revenue $10,586,752,823.00 (20 movies)
- Samuel L. Jackson: Total revenue $10,531,032,751.00 (51 movies)
- Frank Welker: Total revenue $9,201,329,450.00 (26 movies)
- Hugo Weaving: Total revenue $9,156,461,213.00 (18 movies)
- Tom Hanks: Total revenue $8,134,372,182.00 (27 movies)
- Cate Blanchett: Total revenue $8,021,268,624.00 (24 movies)
- Morgan Freeman: Total revenue $7,927,404,828.00 (39 movies)
- Andy Serkis: Total revenue $7,810,516,395.00 (19 movies)
- Alan Tudyk: Total revenue $7,321,351,720.00 (22 movies)
- Ian McKellen: Total revenue $7,219,005,112.00 (15 movies)
- Stellan Skarsgård: Total revenue $7,104,154,955.00 (17 movies)
- Will Smith: Total revenue $6,953,647,823.00 (20 movies)
- Tom Cruise: Total revenue $6,875,112,325.00 (26 movies)
- Elizabeth Banks: Total revenue $6,626,195,492.00 (22 movies)
- Stanley Tucci: Total revenue $6,219,048,888.00 (30 movies

Function for top directors

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

def top_revenue_directors(TrainSet, TestSet, n_directors=50):
    # Create copies of input data to avoid modifications
    train_copy = TrainSet.copy()
    test_copy = TestSet.copy()

    # Find top revenue-generating directors
    director_cols = [col for col in TrainSet.columns if col.startswith('crew_Director_')]
    print(f"Number of director columns found: {len(director_cols)}")
    print("First few director columns:", director_cols[:5])
    
    # Calculate multiple metrics for each director
    director_metrics = {}
    for col in director_cols:
        director_name = col.replace('crew_Director_', '')
        movies_count = TrainSet[col].sum()
        
        if movies_count >= 5:
            movies_with_director = TrainSet[TrainSet[col] == 1]
            
            metrics = {
                'movies_count': movies_count,
                'total_revenue': movies_with_director['revenue'].sum(),
                'avg_revenue': movies_with_director['revenue'].mean(),
                'revenue_consistency': movies_with_director['revenue'].std(),
                'hit_rate': (movies_with_director['revenue'] > movies_with_director['revenue'].mean()).mean(),
                'avg_popularity': movies_with_director['popularity'].mean(),
                'popularity_consistency': movies_with_director['popularity'].std(),
                'revenue_popularity_correlation': movies_with_director[['revenue', 'popularity']].corr().iloc[0,1]
            }
            
            director_metrics[director_name] = metrics
    
    # Sort directors by different metrics and print insights
    print("\nTop directors by total revenue:")
    top_by_total = sorted(director_metrics.items(), key=lambda x: x[1]['total_revenue'], reverse=True)[:20]
    for director, metrics in top_by_total:
        print(f"- {director}: Total revenue ${metrics['total_revenue']:,.2f} ({metrics['movies_count']} movies)")
        
    print("\nTop directors by average popularity (minimum 10 movies):")
    top_by_popularity = [(director, metrics) for director, metrics in director_metrics.items() 
                        if metrics['movies_count'] >= 10]
    top_by_popularity = sorted(top_by_popularity, key=lambda x: x[1]['avg_popularity'], reverse=True)[:20]
    for director, metrics in top_by_popularity:
        print(f"- {director}: Avg popularity {metrics['avg_popularity']:.2f} ({metrics['movies_count']} movies)")
        
    print("\nDirectors with strongest revenue-popularity correlation:")
    top_by_correlation = [(director, metrics) for director, metrics in director_metrics.items() 
                         if metrics['movies_count'] >= 10]
    top_by_correlation = sorted(top_by_correlation, key=lambda x: abs(x[1]['revenue_popularity_correlation']), reverse=True)[:20]
    for director, metrics in top_by_correlation:
        print(f"- {director}: Correlation {metrics['revenue_popularity_correlation']:.3f}")

    # Enhanced composite score including popularity metrics
    for director in director_metrics:
        metrics = director_metrics[director]
        # Normalize each metric between 0 and 1
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in director_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in director_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in director_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in director_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        # Composite score with popularity factors
        metrics['composite_score'] = (
            0.3 * revenue_norm +           # Total revenue importance
            0.2 * avg_norm +               # Average revenue importance
            0.2 * consistency_norm +       # Revenue consistency importance
            0.2 * popularity_norm +        # Popularity importance
            0.1 * correlation_norm         # Revenue-popularity correlation importance
        )

    # Get top directors based on composite score
    top_directors = sorted(director_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_directors]
    
    # Convert to column names for processing
    top_director_cols = [f"crew_Director_{director}" for director, _ in top_directors]

    # Create director_metrics_dict once
    director_metrics_dict = {name: metrics for name, metrics in director_metrics.items()}

    # Process train and test data
    processed_dfs = []
    for df in [train_copy, test_copy]:
        new_columns = {}
        
        # Add top director columns
        for director_col in top_director_cols:
            if director_col in df.columns:
                new_columns[director_col] = df[director_col]
                director_name = director_col.replace('crew_Director_', '')
                if director_name in director_metrics:
                    new_columns[f"{director_col}_pop_weight"] = (
                        df[director_col] * director_metrics[director_name]['avg_popularity']
                    )
            else:
                new_columns[director_col] = pd.Series(0, index=df.index)
                new_columns[f"{director_col}_pop_weight"] = pd.Series(0, index=df.index)

        
        # Calculate other_director_count and their average popularity
        all_director_cols = [col for col in df.columns if col.startswith('crew_Director_')]
        other_director_cols = [col for col in all_director_cols if col not in top_director_cols]
        new_columns['other_director_count'] = df[other_director_cols].sum(axis=1)
        
        # Drop original director columns
        keep_cols = [col for col in df.columns 
                    if not col.startswith('crew_Director_') or col in top_director_cols]

        # Create dataframes for concatenation
        original_df = df[keep_cols]
        new_df = pd.DataFrame(new_columns, index=df.index)

        # Combine using concat
        processed = pd.concat([original_df, new_df], axis=1)
        
        processed_dfs.append(processed)
    
    # Save top directors and their metrics for future use
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_directors.pkl', 'wb') as f:
        pickle.dump({'columns': top_director_cols, 'metrics': director_metrics}, f)
    
    return processed_dfs[0], processed_dfs[1]

In [10]:
print("\nStarting director feature engineering...")
TrainSet_processed, TestSet_processed = top_revenue_directors(TrainSet, TestSet, n_directors=50)

# See what the processed data looks like
print("\nFeature Engineering Summary:")
print("-" * 30)
print("Top directors analyzed by:")
print("- Total revenue")
print("- Average popularity")
print("- Revenue-popularity correlation")
print("\nDataset Shapes:")
print(f"Processed train shape: {TrainSet_processed.shape}")
print(f"Processed test shape: {TestSet_processed.shape}")




Starting director feature engineering...
Number of director columns found: 227
First few director columns: ['crew_Director_Aaron Seltzer', 'crew_Director_Adam McKay', 'crew_Director_Adam Shankman', 'crew_Director_Alejandro González Iñárritu', 'crew_Director_Alex Proyas']

Top directors by total revenue:
- Steven Spielberg: Total revenue $7,982,777,107.00 (20 movies)
- Peter Jackson: Total revenue $5,542,623,032.00 (8 movies)
- James Cameron: Total revenue $5,363,569,439.00 (6 movies)
- Michael Bay: Total revenue $4,569,015,292.00 (10 movies)
- Chris Columbus: Total revenue $3,693,960,883.00 (9 movies)
- Robert Zemeckis: Total revenue $3,394,886,126.00 (12 movies)
- Tim Burton: Total revenue $3,199,920,276.00 (11 movies)
- Sam Raimi: Total revenue $3,107,859,462.00 (10 movies)
- Roland Emmerich: Total revenue $2,956,821,040.00 (8 movies)
- Francis Lawrence: Total revenue $2,952,457,182.00 (5 movies)
- Carlos Saldanha: Total revenue $2,793,148,786.00 (5 movies)
- Ron Howard: Total reven

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle


def top_revenue_writers(TrainSet, TestSet, n_writers=50):
    # Create copies of input data to avoid modifications
    train_copy = TrainSet.copy()
    test_copy = TestSet.copy()

    # Find top revenue-generating writers
    writer_cols = [col for col in TrainSet.columns if col.startswith('crew_Writer_')]
    
    # Calculate multiple metrics for each writer
    writer_metrics = {}
    for col in writer_cols:
        writer_name = col.replace('crew_Writer_', '')
        movies_count = TrainSet[col].sum()
        
        if movies_count >= 5:
            movies_with_writer = TrainSet[TrainSet[col] == 1]
            
            metrics = {
                'movies_count': movies_count,
                'total_revenue': movies_with_writer['revenue'].sum(),
                'avg_revenue': movies_with_writer['revenue'].mean(),
                'revenue_consistency': movies_with_writer['revenue'].std(),
                'hit_rate': (movies_with_writer['revenue'] > movies_with_writer['revenue'].mean()).mean(),
                'avg_popularity': movies_with_writer['popularity'].mean(),
                'popularity_consistency': movies_with_writer['popularity'].std(),
                'revenue_popularity_correlation': movies_with_writer[['revenue', 'popularity']].corr().iloc[0,1]
            }
            
            writer_metrics[writer_name] = metrics
    
    # Sort writers by different metrics and print insights
    print("\nTop writers by total revenue:")
    top_by_total = sorted(writer_metrics.items(), key=lambda x: x[1]['total_revenue'], reverse=True)[:20]
    for writer, metrics in top_by_total:
        print(f"- {writer}: Total revenue ${metrics['total_revenue']:,.2f} ({metrics['movies_count']} movies)")
        
    print("\nTop writers by average popularity (minimum 5 movies):")
    top_by_popularity = [(writer, metrics) for writer, metrics in writer_metrics.items() 
                        if metrics['movies_count'] >= 5]
    top_by_popularity = sorted(top_by_popularity, key=lambda x: x[1]['avg_popularity'], reverse=True)[:20]
    for writer, metrics in top_by_popularity:
        print(f"- {writer}: Avg popularity {metrics['avg_popularity']:.2f} ({metrics['movies_count']} movies)")
        
    print("\nWriters with strongest revenue-popularity correlation:")
    top_by_correlation = [(writer, metrics) for writer, metrics in writer_metrics.items() 
                         if metrics['movies_count'] >= 5]
    top_by_correlation = sorted(top_by_correlation, key=lambda x: abs(x[1]['revenue_popularity_correlation']), reverse=True)[:20]
    for writer, metrics in top_by_correlation:
        print(f"- {writer}: Correlation {metrics['revenue_popularity_correlation']:.3f}")

    # Enhanced composite score including popularity metrics
    for writer in writer_metrics:
        metrics = writer_metrics[writer]
        # Normalize each metric between 0 and 1
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in writer_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in writer_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in writer_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in writer_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        # Composite score with popularity factors
        metrics['composite_score'] = (
            0.3 * revenue_norm +           # Total revenue importance
            0.2 * avg_norm +               # Average revenue importance
            0.2 * consistency_norm +       # Revenue consistency importance
            0.2 * popularity_norm +        # Popularity importance
            0.1 * correlation_norm         # Revenue-popularity correlation importance
        )

    # Get top writers based on composite score
    top_writers = sorted(writer_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_writers]
    
    # Convert to column names for processing
    top_writer_cols = [f"crew_Writer_{writer}" for writer, _ in top_writers]

    # Process train and test data
    processed_dfs = []
    for df in [train_copy, test_copy]:
        processed = df.copy()
        
        # Add top writer columns
        for writer_col in top_writer_cols:
            if writer_col in df.columns:
                processed[writer_col] = df[writer_col]
                # Add popularity weighted presence
                writer_metrics_dict = {name: metrics for name, metrics in writer_metrics.items()}
                writer_name = writer_col.replace('crew_Writer_', '')
                if writer_name in writer_metrics_dict:
                    processed[f"{writer_col}_pop_weight"] = (
                        df[writer_col] * writer_metrics_dict[writer_name]['avg_popularity']
                    )
            else:
                processed[writer_col] = 0
                processed[f"{writer_col}_pop_weight"] = 0
        
        # Calculate other_writer_count
        all_writer_cols = [col for col in df.columns if col.startswith('crew_Writer_')]
        other_writer_cols = [col for col in all_writer_cols if col not in top_writer_cols]
        processed['other_writer_count'] = df[other_writer_cols].sum(axis=1)
        
        # Drop original writer columns
        cols_to_keep = [col for col in processed.columns 
                       if not col.startswith('crew_Writer_') or col in top_writer_cols 
                       or col.endswith('_pop_weight')]
        cols_to_keep.append('other_writer_count')
        processed = processed[cols_to_keep]
        
        processed_dfs.append(processed)
    
    # Save top writers and their metrics for future use
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_writers.pkl', 'wb') as f:
        pickle.dump({'columns': top_writer_cols, 'metrics': writer_metrics}, f)
    
    return processed_dfs[0], processed_dfs[1]

In [12]:
# Call the writer function
print("\nStarting writer feature engineering...")
TrainSet_processed, TestSet_processed = top_revenue_writers(TrainSet, TestSet, n_writers=50)

## See what the processed data looks like
print("\nFeature Engineering Summary:")
print("-" * 30)
print("Top directors analyzed by:")
print("- Total revenue")
print("- Average popularity")
print("- Revenue-popularity correlation")
print("\nDataset Shapes:")
print(f"Processed train shape: {TrainSet_processed.shape}")
print(f"Processed test shape: {TestSet_processed.shape}")


Starting writer feature engineering...

Top writers by total revenue:
- M. Night Shyamalan: Total revenue $1,904,372,773.00 (5 movies)
- Woody Allen: Total revenue $358,668,130.00 (5 movies)
- Robert Rodriguez: Total revenue $275,670,551.00 (5 movies)

Top writers by average popularity (minimum 5 movies):
- M. Night Shyamalan: Avg popularity 41.04 (5 movies)
- Woody Allen: Avg popularity 25.56 (5 movies)
- Robert Rodriguez: Avg popularity 15.49 (5 movies)

Writers with strongest revenue-popularity correlation:
- Woody Allen: Correlation 0.888
- M. Night Shyamalan: Correlation 0.837
- Robert Rodriguez: Correlation 0.233

Feature Engineering Summary:
------------------------------
Top directors analyzed by:
- Total revenue
- Average popularity
- Revenue-popularity correlation

Dataset Shapes:
Processed train shape: (3840, 1266)
Processed test shape: (961, 1266)


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
    

def top_producers(TrainSet, TestSet, n_producers=50):

    # Create copies of input data to avoid modifications
    train_copy = TrainSet.copy()
    test_copy = TestSet.copy()

    # Find top revenue-generating producers
    producer_cols = [col for col in TrainSet.columns if col.startswith('crew_Producer_')]

    # Calculate multiple metrics for each producer
    producer_metrics = {}
    for col in producer_cols:
        producer_name = col.replace('crew_Producer_', '')
        movies_count = TrainSet[col].sum()
        
        if movies_count >= 5:
            movies_with_producer = TrainSet[TrainSet[col] == 1]
            
            metrics = {
                'movies_count': movies_count,
                'total_revenue': movies_with_producer['revenue'].sum(),
                'avg_revenue': movies_with_producer['revenue'].mean(),
                'revenue_consistency': movies_with_producer['revenue'].std(),
                'hit_rate': (movies_with_producer['revenue'] > movies_with_producer['revenue'].mean()).mean(),
                'avg_popularity': movies_with_producer['popularity'].mean(),
                'popularity_consistency': movies_with_producer['popularity'].std(),
                'revenue_popularity_correlation': movies_with_producer[['revenue', 'popularity']].corr().iloc[0,1]
            }
            
            producer_metrics[producer_name] = metrics
    
    # Sort producers by different metrics and print insights
    print("\nTop producers by total revenue:")
    top_by_total = sorted(producer_metrics.items(), key=lambda x: x[1]['total_revenue'], reverse=True)[:50]
    for producer, metrics in top_by_total:
        print(f"- {producer}: Total revenue ${metrics['total_revenue']:,.2f} ({metrics['movies_count']} movies)")
        
    print("\nTop producers by average popularity (minimum 5 movies):")
    top_by_popularity = [(producer, metrics) for producer, metrics in producer_metrics.items() 
                        if metrics['movies_count'] >= 5]
    top_by_popularity = sorted(top_by_popularity, key=lambda x: x[1]['avg_popularity'], reverse=True)[:20]
    for producer, metrics in top_by_popularity:
        print(f"- {producer}: Avg popularity {metrics['avg_popularity']:.2f} ({metrics['movies_count']} movies)")
        
    print("\nProducers with strongest revenue-popularity correlation:")
    top_by_correlation = [(producer, metrics) for producer, metrics in producer_metrics.items() 
                         if metrics['movies_count'] >= 5]
    top_by_correlation = sorted(top_by_correlation, key=lambda x: abs(x[1]['revenue_popularity_correlation']), reverse=True)[:20]
    for producer, metrics in top_by_correlation:
        print(f"- {producer}: Correlation {metrics['revenue_popularity_correlation']:.3f}")

    # Enhanced composite score including popularity metrics
    for producer in producer_metrics:
        metrics = producer_metrics[producer]
        # Normalize each metric between 0 and 1
        revenue_norm = metrics['total_revenue'] / max(m['total_revenue'] for m in producer_metrics.values())
        avg_norm = metrics['avg_revenue'] / max(m['avg_revenue'] for m in producer_metrics.values())
        consistency_norm = 1 - (metrics['revenue_consistency'] / max(m['revenue_consistency'] for m in producer_metrics.values()))
        popularity_norm = metrics['avg_popularity'] / max(m['avg_popularity'] for m in producer_metrics.values())
        correlation_norm = abs(metrics['revenue_popularity_correlation'])
        
        # Composite score with popularity factors
        metrics['composite_score'] = (
            0.3 * revenue_norm +           # Total revenue importance
            0.2 * avg_norm +               # Average revenue importance
            0.2 * consistency_norm +       # Revenue consistency importance
            0.2 * popularity_norm +        # Popularity importance
            0.1 * correlation_norm         # Revenue-popularity correlation importance
        )

    # Get top producers based on composite score
    top_producers = sorted(producer_metrics.items(), key=lambda x: x[1]['composite_score'], reverse=True)[:n_producers]
    
    # Convert to column names for processing
    top_producer_cols = [f"crew_Producer_{producer}" for producer, _ in top_producers]

    # Create producer_metrics_dict
    producer_metrics_dict = {name: metrics for name, metrics in producer_metrics.items()}


    # Process train and test data
    processed_dfs = []
    for df in [train_copy, test_copy]:
        processed = df.copy()
        # Create a dictionary to store all new columns
        new_columns = {}
        
        # Add top producer columns
        for producer_col in top_producer_cols:
            if producer_col in df.columns:
                new_columns[producer_col] = df[producer_col]
                # Add popularity weighted presence
                producer_name = producer_col.replace('crew_Producer_', '')
                if producer_name in producer_metrics:
                    new_columns[f"{producer_col}_pop_weight"] = (
                        df[producer_col] * producer_metrics_dict[producer_name]['avg_popularity']
                    )
            else:
                new_columns[producer_col] = 0
                new_columns[f"{producer_col}_pop_weight"] = 0
        
        # Calculate other_producer_count
        all_producer_cols = [col for col in df.columns if col.startswith('crew_Producer_')]
        other_producer_cols = [col for col in all_producer_cols if col not in top_producer_cols]
        new_columns['other_producer_count'] = df[other_producer_cols].sum(axis=1)
        
        # Drop original producer columns
        keep_cols = [col for col in processed.columns 
                       if not col.startswith('crew_Producer_') or col in top_producer_cols ]

        # Combine all columns efficiently
        processed = pd.concat([
            df[keep_cols],
            pd.DataFrame(new_columns, index=df.index)
        ], axis=1)            
        
        processed_dfs.append(processed)
    
    # Save top producers and their metrics for future use
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/top_revenue_producers.pkl', 'wb') as f:
        pickle.dump({'columns': top_producer_cols, 'metrics': producer_metrics}, f)
    
    return processed_dfs[0], processed_dfs[1]

In [14]:
# Call the producer function
print("\nStarting producer feature engineering...")
TrainSet_processed, TestSet_processed = top_producers(TrainSet, TestSet, n_producers=50)

# See what the processed data looks like
print("\nProcessed train shape:", TrainSet_processed.shape)
print("Processed test shape:", TestSet_processed.shape)



Starting producer feature engineering...

Top producers by total revenue:
- Kevin Feige: Total revenue $6,560,548,638.00 (9 movies)
- Peter Jackson: Total revenue $6,122,333,579.00 (9 movies)
- Kathleen Kennedy: Total revenue $5,464,498,339.00 (15 movies)
- Frank Marshall: Total revenue $5,323,094,360.00 (12 movies)
- Neal H. Moritz: Total revenue $5,268,147,935.00 (29 movies)
- Ian Bryce: Total revenue $5,204,097,785.00 (10 movies)
- Brian Grazer: Total revenue $4,987,608,298.00 (34 movies)
- Charles Roven: Total revenue $4,958,483,153.00 (15 movies)
- Joel Silver: Total revenue $4,819,815,902.00 (34 movies)
- Jerry Bruckheimer: Total revenue $4,736,272,808.00 (20 movies)
- David Heyman: Total revenue $4,519,054,357.00 (8 movies)
- Wyck Godfrey: Total revenue $4,418,199,081.00 (12 movies)
- Steven Spielberg: Total revenue $4,144,065,445.00 (17 movies)
- Lorenzo di Bonaventura: Total revenue $4,117,504,044.00 (17 movies)
- Simon Kinberg: Total revenue $4,046,393,876.00 (11 movies)
- A

In [15]:
# Print the shape of the Train and Test sets before feature engineering
print(f"Before feature engineering - Train set shape: {TrainSet.shape}")
print(f"Before feature engineering - Test set shape: {TestSet.shape}")


Before feature engineering - Train set shape: (3840, 1269)
Before feature engineering - Test set shape: (961, 1269)


In [17]:

def engineer_movie_features(TrainSet,TestSet):
    print("Starting feature engineering...")
    print(f"Initial columns: {TrainSet.shape[1]}")

    # Load encoders from cleaning stage
    try:
        with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/encoders_and_filters.pkl', 'rb') as f:
            encoders_and_filters = pickle.load(f)
            print("Successfully loaded encoders and filters from cleaning stage")
    except Exception as e:
        print(f"Error loading encoders: {str(e)}")
        return None

    train_processed = TrainSet.copy()  
    test_processed = TestSet.copy()

    # Remove movies with missing revenue
    train_processed = train_processed.dropna(subset=['revenue','runtime','budget'])
    test_processed = test_processed.dropna(subset=['revenue','runtime','budget'])

    print(f"Removed {len(TrainSet) - len(train_processed)} training movies with missing revenue, runtime, or budget")
    print(f"Removed {len(TestSet) - len(test_processed)} test movies with missing revenue, runtime, or budget")

    # 1. BUDGET FEATURES
    print("\nEngineering budget features...")

    # Remoet per minute
    train_processed['budget_per_minute'] = train_processed['budget'] / train_processed['runtime'].replace(0, np.nan)
    test_processed['budget_per_minute'] = test_processed['budget'] / test_processed['runtime'].replace(0, np.nan)
    
    print(f"Removed {len(TrainSet) - len(train_processed)} training movies without budget")  
    print(f"Removed {len(TestSet) - len(test_processed)} test movies without budget")  
    
    # 2. RUNTIME FEATURES
    print("Engineering runtime features...")

    # Remove movies shorter than 90 minutes
    train_processed = train_processed[train_processed['runtime'] >= 90]
    test_processed = test_processed[test_processed['runtime'] >= 90]
    print(f"Removed {len(TrainSet) - len(train_processed)} training movies shorter than 90 minutes")  # Fixed variable name
    print(f"Removed {len(TestSet) - len(test_processed)} test movies shorter than 90 minutes") 

    
    # 3. CAST/CREW FEATURES
    print("Engineering cast/crew features...")

    # Process cast and crew using our dedicated functions
    train_processed, test_processed = top_revenue_actors(train_processed, test_processed, n_actors=50)
    print(f"Columns after actors: {train_processed.shape[1]}")
    train_processed, test_processed = top_revenue_directors(train_processed, test_processed, n_directors=50 )
    print(f"Columns after directors: {train_processed.shape[1]}")
    train_processed, test_processed = top_revenue_writers(train_processed, test_processed, n_writers=50)
    print(f"Columns after writers: {train_processed.shape[1]}")
    train_processed, test_processed = top_producers(train_processed, test_processed, n_producers=50)
    print(f"Columns after producers: {train_processed.shape[1]}")


    # 5. LANGUAGE FEATURES
    print("Engineering language features...")

    # Binary flag for English language
    english_code = encoders_and_filters['language_encoder'].transform(['en'])[0]
    train_processed['is_english'] = (train_processed['language_encoded'] == english_code).astype(int)
    test_processed['is_english'] = (test_processed['language_encoded'] == english_code).astype(int)
    
    # 6. SCALING NUMERICAL FEATURES
    print("Scaling numerical features...")
    scaler = StandardScaler()

    # Identify numerical columns to scale
    numeric_cols = [
    'budget', 
    'revenue', 
    'popularity',
    'budget_per_minute',
    ]

    # Add popularity weight columns to numeric_cols
    pop_weight_cols = [col for col in train_processed.columns if col.endswith('_pop_weight')]
    numeric_cols.extend(pop_weight_cols)

    # Scale the identified numeric columns
    print(f"Scaling numerical columns: {numeric_cols}")

    # Fit scaler on training data and transform both datasets
    train_processed[numeric_cols] = scaler.fit_transform(train_processed[numeric_cols])
    test_processed[numeric_cols] = scaler.transform(test_processed[numeric_cols])

    # Save the scaler for future use
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/feature_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    print(f"9. After scaling: {train_processed.shape[1]} columns")
    
    try:
     # Save the encoders and scaler together for future use
        with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/feature_engineering_data.pkl', 'wb') as f:
            pickle.dump({
                'scaler': scaler,
                'encoders': encoders_and_filters
            }, f)
            print("Data saved successfully!")
    except Exception as e:
            print(f"Error during saving: {e}")
    
    # Define genre columns and encoded features
    genre_columns = [
        'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
        'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
        'TV Movie', 'Thriller', 'War', 'Western'
    ]
    
    encoded_features = ['language_encoded', 'companies_encoded', 'countries_encoded']

    # List all features being used
    all_features = (
        numeric_cols +
        genre_columns +
        encoded_features +
        [col for col in train_processed.columns if col.startswith('cast_')] +
        [col for col in train_processed.columns if col.startswith('crew_')] +
        ['is_long_movie', 'is_english', 'other_actor_count', 'other_director_count', 
         'other_writer_count', 'other_producer_count']
    )

    print("\nFeatures included in the final dataset:")
    for feature in sorted(all_features):
        print(f"- {feature}")

    print("\nFeature engineering completed!")
    print(f"Training data shape: {train_processed.shape}")
    print(f"Test data shape: {test_processed.shape}")

    # Save the list of all features
    with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/all_features.pkl', 'wb') as f:
        pickle.dump(all_features, f)


    print("All features saved successfully!")

    
    return train_processed, test_processed


In [18]:
engineer_movie_features(TrainSet,TestSet)

Starting feature engineering...
Initial columns: 1269
Successfully loaded encoders and filters from cleaning stage
Removed 0 training movies with missing revenue, runtime, or budget
Removed 0 test movies with missing revenue, runtime, or budget

Engineering budget features...
Removed 0 training movies without budget
Removed 0 test movies without budget
Engineering runtime features...
Removed 556 training movies shorter than 90 minutes
Removed 148 test movies shorter than 90 minutes
Engineering cast/crew features...

Top actors by total revenue:
- Stan Lee: Total revenue $13,030,512,458.00 (20 movies)
- John Ratzenberger: Total revenue $10,586,752,823.00 (20 movies)
- Samuel L. Jackson: Total revenue $10,201,483,439.00 (45 movies)
- Hugo Weaving: Total revenue $8,902,326,303.00 (17 movies)
- Cate Blanchett: Total revenue $8,021,268,624.00 (24 movies)
- Andy Serkis: Total revenue $7,746,057,079.00 (18 movies)
- Morgan Freeman: Total revenue $7,666,657,214.00 (36 movies)
- Tom Hanks: Tota

(        budget   revenue  runtime  language_encoded  popularity  Action  \
 0    -0.743069 -0.367131     93.0                 7   -0.240590       0   
 1    -0.743069 -0.514890     91.0                 7   -0.598643       0   
 2    -0.137597 -0.475899    104.0                 7   -0.444570       0   
 3    -0.404004 -0.312314    119.0                 7   -0.255567       0   
 4    -0.712795 -0.479833    118.0                 7   -0.463468       0   
 ...        ...       ...      ...               ...         ...     ...   
 3279 -0.743069 -0.514890     90.0                 7   -0.679507       0   
 3280  1.194442  0.208042     96.0                 7    0.096981       1   
 3281 -0.500880 -0.473858     96.0                 7   -0.240274       0   
 3282 -0.646193 -0.500711     90.0                 7   -0.512779       0   
 3283  0.588970  0.008161    119.0                 7    0.001468       0   
 
       Adventure  Animation  Comedy  Crime  ...  crew_Producer_Shawn Levy  \
 0       

In [53]:
'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
  

def process_new_movie (actor1, actor2, crew_director, crew_writer, crew_producer):
    """
    Process new movie input with two actors, director, writer, producer
    """
    try:
        # Load all saved data with correct file names
        with open('top_revenue_actors.pkl', 'rb') as f:
            actors_data = pickle.load(f)
            top_actors = actors_data['columns']
            actor_metrics = actors_data['metrics']

        with open('top_revenue_directors.pkl', 'rb') as f:
            directors_data = pickle.load(f)
            top_directors = directors_data['columns']
            director_metrics = directors_data['metrics']
            
        with open('top_revenue_writers.pkl', 'rb') as f:
            writers_data = pickle.load(f)
            top_writers = writers_data['columns']
            writer_metrics = writers_data['metrics']
            
        with open('top_revenue_producers.pkl', 'rb') as f:
            producers_data = pickle.load(f)
            top_producers = producers_data['columns']
            producer_metrics = producers_data['metrics']
        
        features = {}
        
        # Process actors
        actor1_col = f'cast_{actor1}'
        actor2_col = f'cast_{actor2}'
        
        # Initialize other_actors count
        features['other_actors'] = 0
        
        # Process first actor
        if actor1_col in top_actors:
            features[actor1_col] = 1
            if actor1 in actor_metrics:
                features[f"{actor1_col}_pop_weight"] = actor_metrics[actor1]['avg_popularity']
        else:
            features['other_actors'] += 1
            
        # Process second actor
        if actor2_col in top_actors:
            features[actor2_col] = 1
            if actor2 in actor_metrics:
                features[f"{actor2_col}_pop_weight"] = actor_metrics[actor2]['avg_popularity']
        else:
            features['other_actors'] += 1
            
        # Process director
        director_col = f'director_{crew_director}'
        if director_col in top_directors:
            features[director_col] = 1
            if crew_director in director_metrics:
                features[f"{director_col}_pop_weight"] = director_metrics[crew_director]['avg_popularity']
            features['other_directors'] = 0
        else:
            features['other_directors'] = 1
            
        # Process writer
        writer_col = f'writer_{crew_writer}'
        if writer_col in top_writers:
            features[writer_col] = 1
            if writer in writer_metrics:
                features[f"{writer_col}_pop_weight"] = writer_metrics[writer]['avg_popularity']
            features['other_writer_count'] = 0
        else:
            features['other_writers'] = 1
            
        # Process producer
        producer_col = f'producer_{crew_producer}'
        if producer_col in top_producers:
            features[producer_col] = 1
            if crew_director in director_metrics:
                features[f"{producer_col}_pop_weight"] = producer_metrics[producer]['avg_popularity']
            features['other_producers'] = 0
        else:
            features['other_producers'] = 1
        
        # Fill in zeros for all missing top people columns
        for col in top_actors:
            if col not in features:
                features[col] = 0
                features[f"{col}_pop_weight"] = 0
                
        for col in top_directors:
            if col not in features:
                features[col] = 0
                features[f"{col}_pop_weight"] = 0
                
        for col in top_writers:
            if col not in features:
                features[col] = 0
                features[f"{col}_pop_weight"] = 0
                
        for col in top_producers:
            if col not in features:
                features[col] = 0
                features[f"{col}_pop_weight"] = 0
        
        return features
        
    except FileNotFoundError as e:
        print(f"Error: Required data file not found - {str(e)}")
        print("Please run feature engineering first.")
        return None
    except Exception as e:
        print(f"Error processing movie: {str(e)}")
        return None



Test for a new movie

In [None]:

new_movie = process_new_movie(
    actor1="Brad Pitt",
    actor2="Leonardo DiCaprio",
    crew_director="Quentin Tarantino",
    crew_writer="Quentin Tarantino",
    crew_producer="Shannon McIntosh"
)

if new_movie:
    print("\nProcessed Features for the New Movie:")
    for key, value in new_movie.items():
        if value != 0:  # Only print non-zero features
            print(f"{key}: {value}")

## Feature Engineering Spreadsheet Summary


- Languages are properly encoded using LabelEncoder
- Genre columnes are already one-hot encoded
- Budget is both log- transformed and scaled
- Saved the encoders and scalers
- Target variable (revenue) is Lon-transformed to handle skewness and scaled using StandardScaler
- Processed datasets are saved.
    



# PUSH TO REPO

In [19]:

# Run feature engineering
print("Starting feature engineering process...")
train_processed, test_processed = engineer_movie_features(TrainSet, TestSet)

if train_processed is not None and test_processed is not None:
    print("\nFeature Engineering Results:")
    print(f"Final TrainSet shape: {train_processed.shape}")
    print(f"Final TestSet shape: {test_processed.shape}")
    
    # Save the processed datasets
    print("\nSaving processed datasets...")
    train_processed.to_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/train_df_engineered.pkl')
    test_processed.to_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/test_df_engineered.pkl')

Starting feature engineering process...
Starting feature engineering...
Initial columns: 1269
Successfully loaded encoders and filters from cleaning stage
Removed 0 training movies with missing revenue, runtime, or budget
Removed 0 test movies with missing revenue, runtime, or budget

Engineering budget features...
Removed 0 training movies without budget
Removed 0 test movies without budget
Engineering runtime features...
Removed 556 training movies shorter than 90 minutes
Removed 148 test movies shorter than 90 minutes
Engineering cast/crew features...

Top actors by total revenue:
- Stan Lee: Total revenue $13,030,512,458.00 (20 movies)
- John Ratzenberger: Total revenue $10,586,752,823.00 (20 movies)
- Samuel L. Jackson: Total revenue $10,201,483,439.00 (45 movies)
- Hugo Weaving: Total revenue $8,902,326,303.00 (17 movies)
- Cate Blanchett: Total revenue $8,021,268,624.00 (24 movies)
- Andy Serkis: Total revenue $7,746,057,079.00 (18 movies)
- Morgan Freeman: Total revenue $7,666,

In [20]:
# Load the saved processed datasets
train_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/test_df_engineered.pkl')
test_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/train_df_engineered.pkl')

print("\nFinal shapes after loading:")
print(f"TrainSet shape: {train_processed.shape}")
print(f"TestSet shape: {test_processed.shape}")

# Separate features (X) and target (y)
feature_columns = list(set([col for col in train_processed.columns if col != 'revenue']))

print(f"\nNumber of total feature columns: {len(feature_columns)}")

X_train = train_processed[feature_columns]
y_train = train_processed['revenue']

X_test = test_processed[feature_columns]
y_test = test_processed['revenue']

print(f"\nShapes after splitting:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Save the splits in the engineered directory
output_dir = '/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered'

print("\nSaving splits...")
X_train.to_pickle(f'{output_dir}/X_train.pkl')
X_test.to_pickle(f'{output_dir}/X_test.pkl')
y_train.to_pickle(f'{output_dir}/y_train.pkl')
y_test.to_pickle(f'{output_dir}/y_test.pkl')

print("Splits saved successfully!")

print(f"Training data shape: {train_processed.shape}")
print(f"Test data shape: {test_processed.shape}")
print("\nFeature engineering completed!")







Final shapes after loading:
TrainSet shape: (813, 1355)
TestSet shape: (3284, 1355)

Number of total feature columns: 551

Shapes after splitting:
X_train shape: (813, 1354)
y_train shape: (813,)
X_test shape: (3284, 1354)
y_test shape: (3284,)

Saving splits...
Splits saved successfully!
Training data shape: (813, 1355)
Test data shape: (3284, 1355)

Feature engineering completed!


Split into X and Y

In [21]:
# Load the saved processed datasets
train_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/test_df_engineered.pkl')
test_processed = pd.read_pickle('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/train_df_engineered.pkl')

print("Column names sample:")
print(list(train_processed.columns)[:10])  # Print first 10 column names


# Separate features (X) and target (y)
feature_columns = [col for col in train_processed.columns if col != 'revenue']

X_train = train_processed[feature_columns]
y_train = train_processed['revenue']

X_test = test_processed[feature_columns]
y_test = test_processed['revenue']


# Save the splits in the engineered directory
output_dir = '/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered'

# Print the shapes of the final datasets
print("\nDataset shapes:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Print a few sample rows from the datasets to inspect
print("\nSample rows from X_train:")
print(X_train.head())

print("\nSample rows from y_train:")
print(y_train.head())

print("\nSample rows from X_test:")
print(X_test.head())

print("\nSample rows from y_test:")
print(y_test.head())

# Print the list of final features
print("\nFeatures included in the final dataset:")
for feature in sorted(feature_columns):
    print(f"- {feature}")

print("\nFeature engineering completed!")
print(f"Training data shape: {train_processed.shape}")
print(f"Test data shape: {test_processed.shape}")

Column names sample:
['budget', 'revenue', 'runtime', 'language_encoded', 'popularity', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime']

Dataset shapes:
X_train shape: (813, 13466)
y_train shape: (813,)
X_test shape: (3284, 13466)
y_test shape: (3284,)

Sample rows from X_train:
     budget  runtime  language_encoded  popularity  Action  Adventure  \
0  0.952253     97.0                 7   -0.292760       1          1   
1 -0.743069    111.0                 7   -0.603839       0          0   
2  5.311652    153.0                 7    2.332448       0          1   
3  0.952253    101.0                 7    0.617578       1          1   
4 -0.643771    135.0                 7   -0.576037       0          0   

   Animation  Comedy  Crime  Documentary  ...  crew_Producer_Akiva Goldsman  \
0          0       1      0            0  ...                             0   
1          0       0      1            0  ...                             0   
2          0       0      0          

In [None]:
print(f"Number of unique feature columns: {len(set(feature_columns))}")
print(f"Number of feature columns: {len(feature_columns)}")


In [None]:
# Check for duplicate columns
print(f"Number of unique columns in TrainSet: {len(train_processed.columns.unique())}")
print(f"Number of columns in TrainSet: {len(train_processed.columns)}")
