In [23]:
import pandas as pd
import lib.helper as Tmdb

In [69]:
movie_df = pd.read_csv('../data/movies.csv', parse_dates=['release_date'])

In [70]:
movie_df.head()

Unnamed: 0,adult,original_language,popularity,release_date,title,production_companies,production_countries,genre,revenue(mil),budget(mil),month,runtime(min)
0,False,English,6334.004,2024-07-24,Deadpool & Wolverine,"['Marvel Studios', 'Maximum Effort', '21 Laps ...",['United States of America'],"['Action', 'Comedy', 'Science Fiction']",1147.848023,200.0,Jul,128.0
1,False,English,3413.374,2024-06-11,Inside Out 2,"['Walt Disney Pictures', 'Pixar']",['United States of America'],"['Adventure', 'Animation', 'Comedy', 'Family']",1625.983694,200.0,Jun,97.0
2,False,English,2373.617,2024-07-10,Twisters,"['Universal Pictures', 'Warner Bros. Pictures'...",['United States of America'],"['Action', 'Adventure', 'Drama', 'Thriller']",333.439315,155.0,Jul,123.0
3,False,English,2163.327,2024-06-20,Despicable Me 4,"['Universal Pictures', 'Illumination']",['United States of America'],"['Action', 'Animation', 'Comedy', 'Family']",847.329715,100.0,Jun,94.0
4,False,English,1840.966,2024-06-05,Bad Boys: Ride or Die,"['Westbrook', 'Columbia Pictures', 'Don Simpso...",['United States of America'],"['Action', 'Comedy', 'Crime', 'Thriller']",400.124003,100.0,Jun,115.0


In [71]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   adult                 1980 non-null   bool          
 1   original_language     1980 non-null   object        
 2   popularity            1980 non-null   float64       
 3   release_date          1978 non-null   datetime64[ns]
 4   title                 1980 non-null   object        
 5   production_companies  1980 non-null   object        
 6   production_countries  1980 non-null   object        
 7   genre                 1980 non-null   object        
 8   revenue(mil)          1980 non-null   float64       
 9   budget(mil)           1980 non-null   float64       
 10  month                 1978 non-null   object        
 11  runtime(min)          1980 non-null   float64       
dtypes: bool(1), datetime64[ns](1), float64(4), object(6)
memory usage: 172.2+ KB

In [45]:
from sklearn.model_selection import train_test_split

X_cols = movie_df[['budget(mil)','release_date','adult', 'genre', 'production_companies', 'popularity', 'production_countries','original_language']]
y_cols = movie_df['revenue(mil)']
X_train, X_test, y_train, y_test = train_test_split(X_cols, y_cols, shuffle=True, test_size=0.2, random_state=42)

In [46]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(927, 8) (232, 8) (927,) (232,)


In [47]:
genres = []
for genre in Tmdb.get_genres():
    genres.append(genre['name'])
genres

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [75]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class DateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        encoder = OneHotEncoder()
        X['year'] = X['release_date'].dt.year
        
        X['month'] = X['release_date'].dt.strftime('%b')
        matrix = encoder.fit_transform(X[['month']]).toarray()
        columns_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
        for i in range(len(matrix.T)):
            X[columns_names[i]] = matrix.T[i]
        
        X['day'] = X['release_date'].dt.strftime('%a')
        weekend = ['Sat', 'Sun']
        X['day'] = X['day'].apply(lambda x: 'Weekend' if x in weekend else 'Weekday')
        matrix = encoder.fit_transform(X[['day']]).toarray()
        columns_names = ['Weekend', 'Weekday']
        for i in range(len(matrix.T)):
            X[columns_names[i]] = matrix.T[i]
        
       
        return X

In [49]:
class FeatureEncoder(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    @staticmethod
    def encoder(X, column_name):
        X[column_name] =X[column_name].apply(eval)
        # unique_values_set = set(np.concatenate(X[column_name].tolist()).flat)
        unique_values_list = [x for sublist in X[column_name] for x in sublist]
        item_lists = X[column_name]
    
        bool_dict = {}

        for i, item in enumerate(unique_values_list):
            bool_dict[item] = item_lists.apply(lambda x: 1 if item in x else 0)
        bool_dict = pd.DataFrame(bool_dict)
        X = pd.concat([X, pd.DataFrame(bool_dict)], axis=1) 
        X = X.map(str)
    
        return X
    
    def transform(self, X):
        X['adult'] = X['adult'].map({True:1,False:0})
        
        columns = ['genre']
        for column in columns:
            X = self.encoder(X, column)
        
        # matrix = encoder.fit_transform(X[['original_language']]).toarray()
        # columns_names = df['original_language'].unique().tolist()
        # for i in range(len(matrix.T)):
        #     X[columns_names[i]] = matrix.T[i]
        
        X.columns = X.columns.astype('str')
        X= X.rename(str,axis="columns")
        return X

In [50]:
class FeatureDropper(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=['production_companies','production_countries','genre', 'original_language','release_date', 'year', 'month', 'day'])

In [51]:
from sklearn.pipeline import Pipeline

ML_pipeline = Pipeline([
    ('date_Transformer', DateTransformer()),
    ('encoder', FeatureEncoder()),
    ('dropper', FeatureDropper())
])

In [52]:
X_train = ML_pipeline.fit_transform(X_train)

In [55]:
X_train.head()

Unnamed: 0,budget(mil),adult,popularity,Jan,Feb,Mar,Apr,May,Jun,Jul,...,JA Entertainment,Pen Studios,Ajay Kapoor Productions,Shaw Brothers,Groucho II Film Partnership,Spelling Films,Joint Effort,SNF Productions,Universal 1440 Entertainment,Section Eight
58,90.0,0,185.317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
333,150.0,0,80.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
332,45.0,0,80.156,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1065,25.0,0,43.187,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
63,460.0,0,183.616,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=120, max_depth=None, min_samples_leaf=5)
clf.fit(X_train, y_train)

In [57]:
X_test = ML_pipeline.fit_transform(X_test)

In [58]:
clf.score(X_test, y_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 100 Zeros
- 2AM
- 2K
- 30WEST
- 3dot Productions
- ...
Feature names seen at fit time, yet now missing:
- 16:14 Entertainment
- 21 Laps Entertainment
- 22nd & Indiana Pictures
- 3Beep
- 3Pas Studios
- ...


## Conclusion:

The regression model achieved an R-squared value of 0.68, indicating that approximately 68% of the variance in the dependent variable (the outcome) is explained by the independent variables (the predictors) included in the model. 

### Interpretation:

- An R-squared value of 0.68 suggests that the model captures a substantial portion of the variability in the data and demonstrates a moderate level of predictive performance.

- The model's ability to explain 68% of the variance in the response variable indicates a reasonably good fit to the observed data.

### Future Direction:

While achieving an R-squared value of 0.68 is a promising start, it's essential to recognize that this is just the beginning of my journey. Further evaluation and refinement of the model will be necessary to improve its predictive accuracy and robustness.

### Next Steps:

- **Model Refinement:** Explore additional predictors, feature engineering techniques, or alternative model specifications to enhance the model's performance further.

- **Cross-Validation:** Validate the model's performance on unseen data using techniques such as cross-validation to assess its generalizability and stability.

- **Diagnostic Checks:** Conduct diagnostic checks, such as residual analysis, to identify areas for improvement and ensure that model assumptions are met.

### summary:

In summary, while the model has shown promise with an R-squared value of 0.68, there is still much to be done to develop a reliable and accurate predictive model. By continuing to iterate, evaluate, and refine the model, I would strive to enhance its performance and unlock deeper insights into the underlying relationships within the data.

This journey represents an ongoing pursuit of excellence in data-driven decision-making and underscores the importance of continuous learning and improvement in predictive modeling endeavors.
Arigato!!

In [39]:
import pickle

# Save the model to a file
with open('../models/RFR_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [40]:
with open('../models/RFR_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)