# Regression Modelling: Movie Revenue Prediction

## Objectives

* Fit and evaluate a regression model to predict a movie's revenue.

## Inputs

* Engineered train and test splits (`X_train.pkl`, `X_test.pkl`, `y_train.pkl`, `y_test.pkl`) in `jupyter_notebooks/outputs/engineered/`
* Instructions on which variables to use for data cleaning and feature engineering. They are found in each respective notebook.

## Outputs

* Train set (features and target)
* Test set (features and target)
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* Feature importance plot



---

# Change working directory

Change the working directory from its current folder to its parent folder


In [1]:
import os
# Record the directory the kernel is currently running in; the bare name on the
# last line displays it as the cell output.
current_dir = os.getcwd()
current_dir

'/workspace/Film_Hit_prediction/jupyter_notebooks'

Make the parent of the current directory the new current directory.


In [2]:
# Switch the working directory to the parent of `current_dir`.
# NOTE(review): this reads the `current_dir` variable rather than the live working
# directory, and a later cell reassigns `current_dir`; re-running this cell after
# that one moves up one more level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
# Re-read the working directory after the change and display it as the cell output.
current_dir = os.getcwd()
current_dir

'/workspace/Film_Hit_prediction'

---

# Load Data 

Load the engineered train and test splits (features and target) from their pickle files.

In [4]:
import pandas as pd
import pickle
from pathlib import Path

# Folder holding the engineered train/test splits. It is resolved relative to
# the working directory, which the cells above set to the project root, so the
# notebook does not depend on one machine's absolute path.
ENGINEERED_DIR = Path('jupyter_notebooks') / 'outputs' / 'engineered'

# Load the splits for modeling
# NOTE(review): unpickling can execute arbitrary code; only load files this project produced.
X_train = pd.read_pickle(ENGINEERED_DIR / 'X_train.pkl')
X_test = pd.read_pickle(ENGINEERED_DIR / 'X_test.pkl')
y_train = pd.read_pickle(ENGINEERED_DIR / 'y_train.pkl')
y_test = pd.read_pickle(ENGINEERED_DIR / 'y_test.pkl')

# Fail early, with a readable message, if the splits do not line up instead of
# failing later inside model fitting or scoring.
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train have different numbers of rows"
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test have different numbers of rows"
assert X_train.shape[1] == X_test.shape[1], "X_train and X_test have different numbers of features"

print("Dataset shapes:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


Dataset shapes:
X_train shape: (3284, 66587)
y_train shape: (3284,)
X_test shape: (813, 66587)
y_test shape: (813,)


---

# Step 2: ML Pipeline with all data

## ML Pipeline for Modelling and Hyperparameter Optimisation

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Define models

# Candidate regressors, each paired with the hyperparameter grid that the grid
# search explores ('params' is empty when there is nothing to tune).
models = dict([
    ('Linear Regression', dict(model=LinearRegression(), params={})),
    ('Ridge', dict(model=Ridge(), params=dict(alpha=[0.1, 1.0, 10.0]))),
    ('Lasso', dict(model=Lasso(), params=dict(alpha=[0.1, 1.0, 10.0]))),
    ('Random Forest', dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[10, 20, None],
            min_samples_split=[2, 5],
        ),
    )),
])


# Grid Search

In [6]:


# Empty containers for per-model metrics and fitted estimators.
results, best_models = {}, {}

# NOTE: this loop only constructs a GridSearchCV object for each model; nothing
# is fitted here, and once the loop ends `grid_search` refers to the search built
# for the last entry of `models`.
for name in models:
    model_info = models[name]
    print(f"\nTraining {name}...")

    # GridSearchCV performs the hyperparameter optimization
    grid_search = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=2,
    )

  


Training Linear Regression...

Training Ridge...

Training Lasso...

Training Random Forest...


# Fit the model with different parameter combinations

In [7]:
# Fits whichever search object `grid_search` currently refers to, i.e. the one
# created in the final iteration of the preceding loop.
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

 Save best model and its parameters

In [None]:
# Run the grid search for every candidate model, then record its metrics in
# `results` and its best fitted estimator in `best_models`.
results = {}
best_models = {}

for name, model_info in models.items():
    print(f"\nTraining {name}...")

    # GridSearchCV performs the hyperparameter optimization
    grid_search = GridSearchCV(
        model_info['model'],
        model_info['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )

    # Actually run the search; without this call the object is discarded untrained.
    grid_search.fit(X_train, y_train)

    # With the default refit=True the best parameter combination is refitted on
    # the whole training set and exposed as best_estimator_.
    best_models[name] = grid_search.best_estimator_
    test_pred = best_models[name].predict(X_test)

    # NOTE(review): 'R2 Score' is computed on the test split because the save cell
    # ranks models by that key; consider ranking by the cross-validated score so
    # the test set stays untouched for the final evaluation.
    results[name] = {
        'Best Params': grid_search.best_params_,
        # The scorer is negated MSE, so flip the sign to report a plain MSE.
        'CV MSE': -grid_search.best_score_,
        'R2 Score': r2_score(y_test, test_pred),
        'MAE': mean_absolute_error(y_test, test_pred),
        'RMSE': mean_squared_error(y_test, test_pred) ** 0.5,
    }

    print(f"Best params: {results[name]['Best Params']}")
    print(f"Test R2: {results[name]['R2 Score']:.3f}")


Select the model with the highest R² score and save it to disk.

In [None]:
# joblib is needed here to persist the model, so import it in this cell rather
# than relying on an import that only happens further down the notebook.
import joblib

# Pick the estimator whose recorded 'R2 Score' is highest.
best_model = None
best_name = None
best_score = float('-inf')

for name, metrics in results.items():
    if metrics['R2 Score'] > best_score:
        best_score = metrics['R2 Score']
        best_model = best_models[name]
        best_name = name

# Refuse to write a file containing None when no model has been trained and scored.
if best_model is None:
    raise RuntimeError(
        "No trained model to save: `results` is empty. Run the training cell first."
    )

print(f"Saving best model: {best_name} (R2 Score: {best_score:.3f})")

os.makedirs('/workspace/Film_Hit_prediction/outputs/models', exist_ok=True)
joblib.dump(best_model, '/workspace/Film_Hit_prediction/outputs/models/movie_revenue_predictor.joblib')

## Assess feature importance

In [None]:
import matplotlib.pyplot as plt

# Number of features shown in the table and in the plot.
TOP_N = 10

# Get feature importance from the fitted Random Forest model
rf_model = best_models['Random Forest']

# Use the training columns as feature names; fall back to positional names when
# the training matrix carries no column labels (e.g. an array).
if hasattr(X_train, 'columns'):
    feature_names = list(X_train.columns)
else:
    feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]

# Sort so that head(TOP_N) really is the most important features.
feature_importance = (
    pd.DataFrame({
        'Feature': feature_names,
        'Importance': rf_model.feature_importances_
    })
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
top_features = feature_importance.head(TOP_N)

# Print feature importance
print(f"\nTop {TOP_N} Most Important Features:")
print(top_features)

# bar plot of feature importance (top features only: one bar per training
# column would be unreadable and very slow to draw)
plt.figure(figsize=(12, 6))
plt.bar(top_features['Feature'].astype(str), top_features['Importance'])
plt.xticks(rotation=45, ha='right')
plt.title('Feature Importance for Revenue Prediction')
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.tight_layout()
plt.show()

# Prediction Model


Create a function to make predictions for new movies based on the following parameters:
- budget (float): Movie budget in dollars
- language (str): Original language (e.g., 'en' for English)
- genres (list): List of genres (e.g., ['Action', 'Adventure'])

Returns:
- float: Predicted revenue and profit

New model

In [None]:
def prepare_prediction_matrix(
    budget,
    runtime,
    genres,  # list of genres
    language,
    production_company,
    production_country,
    actor1,
    actor2,
    crew_director,
    crew_writer,
    crew_producer
):
    """
    Prepare a single-row prediction matrix for a new movie.

    The row is built from the raw inputs, encoded with the saved encoders and
    then aligned to the saved list of training feature columns.

    Parameters:
    -----------
    budget : float
        Movie budget in dollars
    runtime : int
        Movie runtime in minutes
    genres : list
        List of genres (e.g., ['Action', 'Adventure'])
    language : str
        Movie language
    production_company : str
        Production company name
    production_country : str
        Production country
    actor1, actor2 : str
        Names of two main actors
    crew_director : str
        Name of director
    crew_writer : str
        Name of writer
    crew_producer : str
        Name of producer

    Returns:
    --------
    pd.DataFrame or None
        One-row matrix whose columns match the saved training column list, or
        None if anything fails (the error message is printed).
    """
    try:
        # Load necessary encoders and scalers
        # NOTE(review): pickle.load can execute arbitrary code; only load files
        # this project produced.
        with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/cleaned/encoders_and_filters.pkl', 'rb') as f:
            encoders = pickle.load(f)

        # Create initial feature dictionary
        features = {}

        # 1. Process numeric features
        features['budget'] = budget
        features['runtime'] = runtime
        # Guard against division by zero for a missing/zero runtime
        features['budget_per_minute'] = budget / runtime if runtime > 0 else 0
        features['is_long_movie'] = 1 if runtime > 120 else 0

        # 2. Process language
        language_encoded = encoders['language_encoder'].transform([language])[0]
        features['language_encoded'] = language_encoded
        features['is_english'] = 1 if language == 'en' else 0

        # 3. Process genres (all genre columns should be 0 by default)
        genre_columns = [
            'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
            'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
            'TV Movie', 'Thriller', 'War', 'Western'
        ]
        for genre in genre_columns:
            features[genre] = 1 if genre in genres else 0

        # 4. Process production company and country
        features['companies_encoded'] = encoders['company_encoder'].transform([production_company])[0]
        features['countries_encoded'] = encoders['country_encoder'].transform([production_country])[0]

        # 5. Process cast and crew
        # NOTE(review): process_new_movie is not defined in this notebook; it
        # must already exist in the kernel, otherwise the NameError is caught
        # below and None is returned.
        crew_features = process_new_movie(
            actor1=actor1,
            actor2=actor2,
            crew_director=crew_director,
            crew_writer=crew_writer,
            crew_producer=crew_producer
        )

        # Combine all features
        features.update(crew_features)

        # Convert to DataFrame
        pred_matrix = pd.DataFrame([features])

        # Load the column list the training matrix was built with
        with open('/workspace/Film_Hit_prediction/jupyter_notebooks/outputs/engineered/feature_columns.pkl', 'rb') as f:
            required_columns = pickle.load(f)

        # One vectorised step: add every missing training column filled with 0,
        # drop columns the model does not know and put the rest in training
        # order. Inserting the missing columns one at a time is very slow for a
        # wide feature set and fragments the frame.
        pred_matrix = pred_matrix.reindex(columns=required_columns, fill_value=0)

        return pred_matrix

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error preparing prediction matrix: {str(e)}")
        return None

# Example usage:
if __name__ == "__main__":
    # Example inputs, passed to the function as keyword arguments
    test_input = dict(
        budget=150000000,
        runtime=120,
        genres=['Action', 'Adventure', 'Science Fiction'],
        language='en',
        production_company='Marvel Studios',
        production_country='United States of America',
        actor1='Chris Hemsworth',
        actor2='Robert Downey Jr.',
        crew_director='Joss Whedon',
        crew_writer='Joss Whedon',
        crew_producer='Kevin Feige',
    )

    # Create prediction matrix
    pred_matrix = prepare_prediction_matrix(**test_input)

    # The function returns None on failure, so only report when a matrix came back.
    if pred_matrix is not None:
        print("\nPrediction matrix shape:", pred_matrix.shape)
        print("\nNon-zero features:")
        first_row = pred_matrix.iloc[0]
        non_zero = first_row[first_row != 0]
        for col, val in non_zero.items():
            print(f"{col}: {val}")

In [None]:
# joblib is needed to load the saved model; import it here so this cell does not
# depend on an import made further down the notebook.
import joblib

# Create prediction matrix for new movie
pred_matrix = prepare_prediction_matrix(
    budget=150000000,
    runtime=120,
    genres=['Action', 'Adventure'],
    language='en',
    production_company='Marvel Studios',
    production_country='United States of America',
    actor1='Chris Hemsworth',
    actor2='Robert Downey Jr.',
    crew_director='Joss Whedon',
    crew_writer='Joss Whedon',
    crew_producer='Kevin Feige'
)

# prepare_prediction_matrix returns None when it could not build the features,
# so do not hand that to the model.
if pred_matrix is None:
    print("Prediction skipped: the prediction matrix could not be prepared.")
else:
    # Load your trained model
    model = joblib.load('/workspace/Film_Hit_prediction/outputs/models/movie_revenue_predictor.joblib')

    # Make prediction
    predicted_revenue = model.predict(pred_matrix)[0]
    print(f"Predicted Revenue: ${predicted_revenue:,.2f}")

old modeling

In [None]:


# Function to make predictions
def predict_movie_metrix(budget, language, genres):
    """
    Predict revenue for a movie and derive profit/loss figures from it.

    Parameters
    ----------
    budget : float
        Movie budget in dollars.
    language : str
        Original language code (e.g. 'en'); must be known to the saved
        language encoder.
    genres : list of str
        Genre names. Matching ignores case and surrounding whitespace; names
        not in the known genre list are reported and skipped.

    Returns
    -------
    dict
        Keys: 'predicted_revenue', 'budget', 'profit_loss', 'is_profitable',
        'profit_amount', 'loss_amount', 'roi'.
    """
    # Imported locally so the function works even when it is called before the
    # notebook cells that import these modules have been run.
    import joblib
    import numpy as np

    engineered_path = "/workspace/Film_Hit_prediction/outputs/datasets/engineered/"
    model_path = "/workspace/Film_Hit_prediction/outputs/models/"

    # Load necessary encoders and model
    le_language = joblib.load(engineered_path + 'language_encoder.joblib')
    scaler = joblib.load(engineered_path + 'budget_scaler.joblib')
    scaler_y = joblib.load(engineered_path + 'revenue_scaler.joblib')
    model = joblib.load(model_path + 'movie_revenue_predictor.joblib')

    all_genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
                  'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
                  'TV Movie', 'Thriller', 'War', 'Western', 'Foreign']

    # All genres as 0 initially
    genre_dict = {genre: 0 for genre in all_genres}

    # Set selected genres to 1. Lookup is case-insensitive so that inputs such
    # as 'horror' or 'fantasy' are not silently dropped.
    canonical_genres = {genre.lower(): genre for genre in all_genres}
    for genre in genres:
        matched = canonical_genres.get(str(genre).strip().lower())
        if matched is None:
            print(f"Warning: unknown genre ignored: {genre!r}")
        else:
            genre_dict[matched] = 1

    # Process budget
    budget_logged = np.log1p(budget)
    budget_scaled = scaler.transform([[budget_logged]])[0][0]

    # Process language
    language_encoded = le_language.transform([language])[0]

    # Create feature array
    features = {
        'language_encoded': language_encoded,
        'budget_scaled': budget_scaled,
        **genre_dict
    }
    input_df = pd.DataFrame([features])

    # Order the columns the way the model saw them during fitting. Estimators
    # fitted on a DataFrame record this in `feature_names_in_`; otherwise fall
    # back to the training frame's columns.
    # NOTE(review): X_train_final is not defined in this notebook, so the
    # fallback only works if it already exists in the kernel.
    feature_order = getattr(model, 'feature_names_in_', None)
    if feature_order is None:
        feature_order = X_train_final.columns
    input_df = input_df[list(feature_order)]

    # Get revenue prediction directly from model
    predicted_revenue = model.predict(input_df)[0]
    print(f"Raw prediction: {predicted_revenue}")

    # NOTE(review): the `* budget / 10` rescaling after the inverse transform is
    # kept as written; its rationale is not visible here -- confirm.
    predicted_revenue = scaler_y.inverse_transform([[predicted_revenue]])[0][0] * budget / 10
    print(f"Final prediction: {predicted_revenue}")

    # Calculate metrics
    profit_loss = predicted_revenue - budget

    result = {
        'predicted_revenue': predicted_revenue,
        'budget': budget,
        'profit_loss': profit_loss,
        'is_profitable': profit_loss > 0,
        'profit_amount': max(0, profit_loss),
        'loss_amount': abs(min(0, profit_loss)),
        # ROI as a percentage of budget; 0 when the budget is not positive
        'roi': (profit_loss / budget) * 100 if budget > 0 else 0
    }

    # Print results
    print(f"Budget: ${result['budget']:,.2f}")
    print(f"Predicted Revenue: ${result['predicted_revenue']:,.2f}")
    if result['is_profitable']:
        print(f"PROFIT: ${result['profit_amount']:,.2f}")
    else:
        print(f"LOSS: ${result['loss_amount']:,.2f}")
    print(f"ROI: {result['roi']:.2f}%")

    return result



# Example usage:
# Genre names use the title-cased labels listed in the function's `all_genres`.
test_movie = {
    'budget': 100000000,
    'language': 'en',
    'genres': ['Fantasy']
}

result = predict_movie_metrix(
    test_movie['budget'],
    test_movie['language'],
    test_movie['genres']
)







Testing

In [None]:
# Genre names use the title-cased labels listed in the function's `all_genres`.
test_cases = [
    {'budget': 2000000, 'language': 'en', 'genres': ['Horror']},  # Low budget horror
    {'budget': 300000000, 'language': 'en', 'genres': ['Action', 'Adventure']},  # Blockbuster
    {'budget': 90000000, 'language': 'fr', 'genres': ['Comedy']},  # Mid-budget foreign
]

for movie in test_cases:
    print("\nTesting:", movie)
    result = predict_movie_metrix(movie['budget'], movie['language'], movie['genres'])

# Evaluation


Load model

In [None]:
import joblib

# Folder containing the persisted model, and the full path of the model file
model_path = "/workspace/Film_Hit_prediction/outputs/models/"
model_file = model_path + 'movie_revenue_predictor.joblib'

# Restore the estimator and report its type as a quick sanity check
model = joblib.load(model_file)
print("Model loaded:", type(model))

Make predictions on test data 

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Define Genres
all_genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
              'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
              'TV Movie', 'Thriller', 'War', 'Western', 'Foreign']

# Evaluation
test_profit = []

# Get predictions for the test split loaded at the top of the notebook
# (X_test / y_test); no `*_final` variables are defined in this notebook.
y_pred = model.predict(X_test)
test_predictions = y_pred.tolist()

# Calculate metrics
# RMSE is taken as the square root of the MSE because the `squared` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

print("Revenue Prediction Metrics:")
print(f"R2 Score: {r2_score(y_test, test_predictions):.3f}")
print(f"MAE: ${mean_absolute_error(y_test, test_predictions):,.2f}")
print(f"RMSE: ${rmse:,.2f}")




Visualization of predictions vs actual values

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Actual test targets as a flat array (y_test is loaded at the top of the
# notebook; no `y_test_final` variable is defined here).
y_true = np.asarray(y_test).ravel()

# scatter plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_true, y_pred, alpha=0.5)
# Dashed diagonal marks a perfect prediction (predicted == actual)
ax.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)

ax.set_xlabel('Actual Revenue')
ax.set_ylabel('Predicted Revenue')
ax.set_title('Predicted vs Actual Movie Revenue')

# correlation coefficient
correlation = np.corrcoef(y_true, y_pred)[0, 1]
ax.text(0.05, 0.95, f'Correlation: {correlation:.2f}', transform=ax.transAxes)

fig.tight_layout()
plt.show()

Residual Analysis

verify size 

Match sizes

In [None]:
# Report the shapes of the splits loaded at the top of the notebook and of the
# test predictions (no `*_final` variables are defined in this notebook).
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)

In [None]:
# Debugging aid: list every global name containing 'y_' to see which target and
# prediction variables currently exist in the kernel.
print([var for var in globals() if 'y_' in var])

plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Residual = actual - predicted, using the test targets loaded at the top of the
# notebook (no `y_test_final` variable is defined here).
residuals = np.asarray(y_test).ravel() - y_pred

fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(15, 5))

# Left: residuals against predictions; the dashed line marks zero error
ax_scatter.scatter(y_pred, residuals)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_xlabel('Predicted Values')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residual Plot')

# Right: distribution of the residuals
sns.histplot(residuals, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Residuals')
plt.show()

Learning Curves

In [None]:
from sklearn.model_selection import learning_curve 

# Refit the model on growing portions of the training split loaded at the top of
# the notebook (no `*_final` variables are defined here) and score each fit with
# 5-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    model, 
    X_train, 
    np.asarray(y_train).ravel(),
    cv=5, 
    n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10)
)

# Average the per-fold scores for each training size before plotting
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training score')
plt.plot(train_sizes, test_scores.mean(axis=1), label='Cross-validation score')
plt.xlabel('Training Size')
plt.ylabel('Score')
plt.title('Learning Curves')
plt.legend()
plt.show()

In [None]:
# Summary metrics on the test split loaded at the top of the notebook
# (no `y_test_final` variable is defined here).
print("\nModel Evaluation Metrics:")
print(f"R² Score: {r2_score(y_test, y_pred):.3f}")
print(f"MAE: ${mean_absolute_error(y_test, y_pred):.2f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")