# Combined Datasets

In [18]:
# Importing dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression

# Data

Data explanation placeholder

(Talk about original two notebooks?)

In [19]:
# Reading in data
df_movies = pd.read_csv("./Resources/movies_data.csv")
df_economics = pd.read_csv("./Resources/economics_data.csv")

### Movie Data

### Economics Data

# Combining Data

In [20]:
# Creating a 'Date' for a datetime index
df_movies['Date'] = pd.to_datetime({
    'year': df_movies['released_year'],
    'month': df_movies['released_month'],
    'day': df_movies['released_day']
})

# Setting `Date` as index
df_movies.set_index('Date', inplace=True)

# Ensuring index is sorted with ascending dates
df_movies.sort_index(inplace=True)

In [21]:
# Creating a `Year` and `Month` for concatenation
df_economics['Year'] = df_economics['Date'].str.slice(0,4).astype(int)
df_economics['Month'] = df_economics['Date'].str.slice(5,7).astype(int)

# Renaming to `Year` and `Month` for concatenation
df_movies.rename(columns={
'released_year': 'Year',
'released_month': 'Month'
}, inplace=True)

In [22]:
# Confirming total records before concatenation
print(f'Total ecomonic records: {df_economics.shape[0]}')
print(f'Total movie records: {df_movies.shape[0]}')

Total ecomonic records: 507
Total movie records: 15363


In [23]:
# Combining datasets through concatenation
df_combined = pd.merge(df_economics, df_movies, how='left', on=['Year', 'Month'])

# Confirming total records after concatenation
print(f'Total records: {df_combined.shape[0]}')

Total records: 12188


# EDA

In [24]:
# Creating the eventual `Target` for modelling
df_combined['Target'] = df_combined['critical_success'] + ' ' +\
                        df_combined['financial_success'] + ' ' +\
                        df_combined['Economic Climate']

In [25]:
df_combined['Target'].value_counts()

Target
panned failure Lean to Bad                                    2200
well liked failure Lean to Bad                                1897
well liked failure Comfortable to Good                        1316
panned failure Comfortable to Good                            1195
alright failure Lean to Bad                                   1050
alright failure Comfortable to Good                            753
well liked excellent returns Lean to Bad                       709
well liked excellent returns Comfortable to Good               527
critical success failure Lean to Bad                           477
well liked extraordinary returns Lean to Bad                   292
critical success failure Comfortable to Good                   251
well liked modest returns Lean to Bad                          221
well liked extraordinary returns Comfortable to Good           201
well liked moderate returns Lean to Bad                        193
well liked modest returns Comfortable to Good          

In [26]:
# Creating a list of features to drop
cols_to_drop = [
    'Date',
    'CCI Rolling Mean',
    'CPI Rolling Mean',
    'Unemployment Rate (%) Rolling Mean',
    'Economic Climate',
    'Year',
    'Month',
    'id',
    'cast',
    'original_language',
    'director',
    'writers',
    'producers',
    'popularity', 
    'critical_success',
    'financial_success',
    'release_date',
    'released_day',
    'production_countries',
    'status',
    'spoken_languages'
]

# Dropping unneeded features
df_combined.drop(columns=cols_to_drop, inplace=True)

In [27]:
# Dropping `NaN` records
df_combined.dropna(inplace=True)

In [28]:
df_combined.shape

(10025, 20)

# Train Test Splitting

In [29]:
# Defining columns to scale and encode
col_to_scale = [
    'CCI Value', 'CCI Rolling Percent Change', 'CPI Value',
    'CPI Rolling Percent Change', 'Unemployment Rate (%)', 
    'Unemployment Rate Rolling Percent Change','vote_average', 'vote_count',
    'revenue','runtime','budget', 'roi'
]

col_to_encode = [
    'CCI Rolling Percent Change Flag', 'CPI Rolling Percent Change Flag',
    'Unemployment Rate Rolling Percent Change Flag', 'title', 'original_title',
    'genres', 'production_companies'
]

# Setup X and y variables
X = df_combined.drop(columns='Target')
y = df_combined['Target']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

# Scaling and Econding

In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train[col_to_scale])

In [31]:
X_train_scaled = scaler.transform(X_train[col_to_scale])
X_test_scaled = scaler.transform(X_test[col_to_scale])

In [32]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=scaler.get_feature_names_out())
X_test_scaled = pd.DataFrame(X_train_scaled, columns=scaler.get_feature_names_out())

In [33]:
encoder_x = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder_x.fit(X_train[col_to_encode].values.reshape(-1,1))

encoder_y = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder_y.fit(y_train.values.reshape(-1,1))

# Transform each column into numpy arrays
X_train_encoded = encoder_x.transform(X_train[col_to_encode].values.reshape(-1,1))
X_test_encoded = encoder_x.transform(X_test[col_to_encode].values.reshape(-1,1))

y_train_encoded = encoder_y.transform(y_train.values.reshape(-1,1))
y_test_encoded = encoder_y.transform(y_test.values.reshape(-1,1))

# Reorganize the numpy arrays into a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoder_x.get_feature_names_out())
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoder_x.get_feature_names_out())

# Concatenate the encoded columns with the scaled columns
X_train = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_test = pd.concat([X_test_scaled, X_test_encoded], axis=1)

MemoryError: Unable to allocate 5.40 GiB for an array with shape (52626, 13781) and data type float64

# Modeling

Playtime!!

# Eric's Space

# Funda's Space

# Kalvin's Space

# Odele's Space

# Peta's Space

In [None]:
# Declare a logistic regression model.
logistic_regression_model = LogisticRegression()

In [None]:
# Fit and save the logistic regression model using the training data
df_combined_lr_model = logistic_regression_model.fit(X_train, y_train)

In [None]:
# Validate the model
print(f"Training Data Score: {df_combined_lr_model.score(X_train, y_train)}")
print(f"Testing Data Score: {df_combined_lr_model.score(X_test, y_test)}")

In [None]:
# Generate predictions from the logistic regression model using the test data
lr_predictions = logistic_regression_model.predict(X_test)

# Review the predictions
testing_predections

In [None]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, testing_predections)

In [None]:
# Display the precision score for the test dataset.
precision_score(y_test, testing_predections)

# Vadim's Space

# Findings

# Additional