In [None]:
# Example pipeline: a genre-processing step followed by a language-encoding step.
# NOTE(review): Pipeline, GenreTransformer and LanguageEncoder are not defined
# above this point in the visible code -- presumably they come from an earlier
# notebook cell; confirm before running this cell on its own.
pipeline = Pipeline(steps=[
    ('genre_processing', GenreTransformer(drop_foreign=True)),
    ('language_encoding', LanguageEncoder()),
])

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

# Custom column selector to filter out unwanted columns
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a configured subset of columns.

    Parameters
    ----------
    columns
        Key used to index the input in :meth:`transform` (``X[columns]``).
        The pipeline in this file passes a list of column names.
    """

    def __init__(self, columns):
        # Stored as given: no copy and no validation happens here.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y``; return this instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``columns``."""
        selected = X[self.columns]
        return selected

# Step 1: Define the preprocessing + model pipeline.
#
# Categorical (text) and numerical columns need different treatment, so each
# group goes through its own branch of a ColumnTransformer:
#   * categorical: fill gaps with the most frequent value, then ordinal-encode.
#     Values never seen during fit are encoded as -1 instead of raising, so
#     predicting on new movies (new companies, casts, ...) does not crash.
#   * numerical: fill gaps with the median, then standardize.
# Previously every column (numbers included) went through OrdinalEncoder with
# its default handle_unknown='error', which made predict() fail on any unseen
# value and treated numeric features as categories.
from sklearn.compose import ColumnTransformer

categorical_columns = ['genres', 'original_language', 'production_companies',
                       'production_countries', 'cast', 'crew']
numerical_columns = ['budget', 'revenue', 'runtime', 'popularity']

pipeline = Pipeline(steps=[
    # Drop every column the model does not use (same selection as before).
    ('column_selector', ColumnSelector(columns=['genres', 'original_language', 'budget', 'revenue', 'runtime',
                                                'production_companies', 'production_countries', 'cast', 'crew', 'popularity'])),
    ('preprocessor', ColumnTransformer(transformers=[
        ('categorical', Pipeline(steps=[
            ('missing_value_filler', SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ]), categorical_columns),
        ('numerical', Pipeline(steps=[
            ('missing_value_filler', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]), numerical_columns),
    ])),
    ('model', XGBClassifier()),  # Model step
])

# Example training frame: two movies, one list entry per row.
df_movies = pd.DataFrame.from_dict(
    {
        # Text-valued features
        'genres': ['Action', 'Drama'],
        'original_language': ['English', 'French'],
        # Numeric features
        'budget': [100000000, 200000000],
        'revenue': [150000000, 300000000],
        'runtime': [120, 150],
        # More text-valued features
        'production_companies': ['Company A', 'Company B'],
        'production_countries': ['USA', 'France'],
        'cast': ['Actor 1, Actor 2', 'Actor 3, Actor 4'],
        'crew': ['Director 1, Writer 1', 'Director 2, Writer 2'],
        'popularity': [10, 20],
        # Columns below are not in the pipeline's column_selector list
        'release_date': [None, '2020-01-01'],
        'homepage': [None, None],
        'overview': [None, None],
        'tagline': [None, None],
    },
    orient='columns',
)

# Train every pipeline step on the example frame.
# The labels are placeholders (one per row of df_movies); replace [1, 0] with your actual labels.
pipeline.fit(X=df_movies, y=[1, 0])

# Single unseen movie to predict on; same column layout as df_movies.
new_data = pd.DataFrame.from_dict(
    {
        # Text-valued features
        'genres': ['Action'],
        'original_language': ['English'],
        # Numeric features
        'budget': [250000000],
        'revenue': [300000000],
        'runtime': [130],
        # More text-valued features
        'production_companies': ['Company C'],
        'production_countries': ['USA'],
        'cast': ['Actor 5, Actor 6'],
        'crew': ['Director 3, Writer 3'],
        'popularity': [15],
        # Columns below are not in the pipeline's column_selector list
        'release_date': ['2021-01-01'],
        'homepage': [None],
        'overview': [None],
        'tagline': [None],
    },
    orient='columns',
)

# Run the fitted pipeline on the new sample and display the predicted label(s).
predictions = pipeline.predict(X=new_data)
print(predictions)
