In [1]:
import pandas as pd
import utils.TmdbHelper as Th

In [32]:
movie_df = pd.read_csv('../data/movies.csv', parse_dates=['release_date'])

In [33]:
movie_df.head()

Unnamed: 0,adult,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,production_companies,production_countries,genre,revenue(mil),budget(mil),month,runtime(min),year
0,False,823464,English,Godzilla x Kong: The New Empire,3404.57,2024-03-27,Godzilla x Kong: The New Empire,6.8,422,"['Legendary Pictures', 'Warner Bros. Pictures']",['United States of America'],"['Action', 'Adventure', 'Science Fiction']",214.0,135.0,3,115.0,2024
1,False,1011985,English,Kung Fu Panda 4,2101.694,2024-03-02,Kung Fu Panda 4,6.773,598,['DreamWorks Animation'],['United States of America'],"['Action', 'Adventure', 'Animation', 'Comedy',...",347.0,85.0,3,94.0,2024
2,False,634492,English,Madame Web,1146.356,2024-02-14,Madame Web,5.659,971,"['Columbia Pictures', 'di Bonaventura Pictures']",['United States of America'],"['Action', 'Fantasy']",99.0,80.0,2,116.0,2024
3,False,856289,Chinese,封神第一部：朝歌风云,887.604,2023-07-20,Creation of the Gods I: Kingdom of Storms,6.845,177,"['Beijing Culture', 'Century Changshengtian Fi...",['China'],"['Action', 'Fantasy', 'War']",372.0,85.0,7,148.0,2023
4,False,940551,English,Migration,875.175,2023-12-06,Migration,7.5,1077,['Illumination'],['United States of America'],"['Action', 'Adventure', 'Animation', 'Comedy',...",292.0,72.0,12,83.0,2023


In [34]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1353 entries, 0 to 1352
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   adult                 1353 non-null   bool          
 1   id                    1353 non-null   int64         
 2   original_language     1353 non-null   object        
 3   original_title        1353 non-null   object        
 4   popularity            1353 non-null   float64       
 5   release_date          1353 non-null   datetime64[ns]
 6   title                 1353 non-null   object        
 7   vote_average          1353 non-null   float64       
 8   vote_count            1353 non-null   int64         
 9   production_companies  1353 non-null   object        
 10  production_countries  1353 non-null   object        
 11  genre                 1353 non-null   object        
 12  revenue(mil)          1353 non-null   float64       
 13  budget(mil)       

In [35]:
movie_df = movie_df.dropna()

In [36]:
from sklearn.model_selection import train_test_split

X_cols = movie_df[['budget(mil)','vote_count','vote_average','release_date','adult', 'genre', 'production_companies', 'popularity', 'production_countries','original_language']]
y_cols = movie_df['revenue(mil)']
X_train, X_test, y_train, y_test = train_test_split(X_cols, y_cols, shuffle=True, test_size=0.2, random_state=42)

In [37]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1082, 10) (271, 10) (1082,) (271,)


In [38]:
genres = []
for genre in Th.get_genres():
    genres.append(genre['name'])
genres

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [39]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class DateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        encoder = OneHotEncoder()
        X['year'] = X['release_date'].dt.year
        
        X['month'] = X['release_date'].dt.strftime('%b')
        matrix = encoder.fit_transform(X[['month']]).toarray()
        columns_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
        for i in range(len(matrix.T)):
            X[columns_names[i]] = matrix.T[i]
        
        X['day'] = X['release_date'].dt.strftime('%a')
        weekend = ['Sat', 'Sun']
        X['day'] = X['day'].apply(lambda x: 'Weekend' if x in weekend else 'Weekday')
        matrix = encoder.fit_transform(X[['day']]).toarray()
        columns_names = ['Weekend', 'Weekday']
        for i in range(len(matrix.T)):
            X[columns_names[i]] = matrix.T[i]
        
       
        return X

In [40]:
class FeatureEncoder(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    @staticmethod
    def encoder(X, column_name):
        X[column_name] =X[column_name].apply(eval)
        # unique_values_set = set(np.concatenate(X[column_name].tolist()).flat)
        unique_values_list = genres
        item_lists = X[column_name]
    
        bool_dict = {}

        for i, item in enumerate(unique_values_list):
            bool_dict[item] = item_lists.apply(lambda x: 1 if item in x else 0)
        bool_dict = pd.DataFrame(bool_dict)
        X = pd.concat([X, pd.DataFrame(bool_dict)], axis=1) 
        X = X.map(str)
    
        return X
    
    def transform(self, X):
        X['adult'] = X['adult'].map({True:1,False:0})
        encoder = OneHotEncoder()
        columns = ['genre']
        for column in columns:
            X = self.encoder(X, column)
        
        # matrix = encoder.fit_transform(X[['original_language']]).toarray()
        # columns_names = df['original_language'].unique().tolist()
        # for i in range(len(matrix.T)):
        #     X[columns_names[i]] = matrix.T[i]
        
        X.columns = X.columns.astype('str')
        X= X.rename(str,axis="columns")
        return X

In [41]:
class FeatureDropper(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=['production_companies','production_countries','genre', 'original_language','release_date', 'year', 'month', 'day'])

In [42]:
from sklearn.pipeline import Pipeline

ML_pipeline = Pipeline([
    ('date_Transformer', DateTransformer()),
    ('encoder', FeatureEncoder()),
    ('dropper', FeatureDropper())
])

In [43]:
X_train = ML_pipeline.fit_transform(X_train)

In [44]:
X_train.head()

Unnamed: 0,budget(mil),vote_count,vote_average,adult,popularity,Jan,Feb,Mar,Apr,May,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
966,110.0,9952,6.682,0,50.412,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
932,7.0,3337,7.197,0,38.307,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
109,37.0,13038,6.29,0,135.867,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
714,8.0,950,6.926,0,45.035,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
1200,90.0,3429,6.435,0,39.288,0.0,0.0,0.0,1.0,0.0,...,0,0,0,1,0,0,0,1,0,0


In [47]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=120, max_depth=None, min_samples_leaf=5)
clf.fit(X_train, y_train)

In [46]:
X_test = ML_pipeline.fit_transform(X_test)

In [64]:
clf.score(X_test, y_test)

0.6871320282235902

## Conclusion:

The regression model achieved an R-squared value of 0.68, indicating that approximately 68% of the variance in the dependent variable (the outcome) is explained by the independent variables (the predictors) included in the model. 

### Interpretation:

- An R-squared value of 0.68 suggests that the model captures a substantial portion of the variability in the data and demonstrates a moderate level of predictive performance.

- The model's ability to explain 68% of the variance in the response variable indicates a reasonably good fit to the observed data.

### Future Direction:

While achieving an R-squared value of 0.68 is a promising start, it's essential to recognize that this is just the beginning of my journey. Further evaluation and refinement of the model will be necessary to improve its predictive accuracy and robustness.

### Next Steps:

- **Model Refinement:** Explore additional predictors, feature engineering techniques, or alternative model specifications to enhance the model's performance further.

- **Cross-Validation:** Validate the model's performance on unseen data using techniques such as cross-validation to assess its generalizability and stability.

- **Diagnostic Checks:** Conduct diagnostic checks, such as residual analysis, to identify areas for improvement and ensure that model assumptions are met.

### summary:

In summary, while the model has shown promise with an R-squared value of 0.68, there is still much to be done to develop a reliable and accurate predictive model. By continuing to iterate, evaluate, and refine the model, I would strive to enhance its performance and unlock deeper insights into the underlying relationships within the data.

This journey represents an ongoing pursuit of excellence in data-driven decision-making and underscores the importance of continuous learning and improvement in predictive modeling endeavors.
Arigato!!

In [66]:
import pickle

# Save the model to a file
with open('../models/RFR_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [67]:
with open('../models/RFR_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)