## 1. Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

## 2. Load and Clean Data

In [None]:
# Read the 'Model Data' worksheet of the workbook into a DataFrame
df = pd.read_excel('Model Titles.xlsx', sheet_name='Model Data')

# Treat +/-infinity as missing, then discard every row with a missing value
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

## 3. Log-transform Relevant Columns

In [None]:
# Columns that are modelled on the natural-log scale
log_columns = ['New Release Revenue', 'UK Box Office', 'Days to Release']

# np.log returns -inf for 0 and NaN for negative inputs. The inf/NaN clean-up
# runs before this step, so such values would otherwise flow into the model.
# Keep only rows where every log-transformed column is strictly positive.
df = df[(df[log_columns] > 0).all(axis=1)].copy()

# Replace the three columns with their natural logarithms
df[log_columns] = np.log(df[log_columns])

## 4. Quick Data Inspection

In [None]:
# Print, in order: first rows, (rows, columns) shape, summary statistics, dtypes
for summary in (df.head(), df.shape, df.describe(), df.dtypes):
    print(summary)

## 5. Visualise Relationships

In [None]:
# Pairwise plots of the DataFrame's variables, coloured by the 'Genre' column
sns.pairplot(data=df, hue='Genre')
plt.show()

## 6. Prepare Data for Modelling

In [None]:
# Keep only the columns used for modelling
model_columns = ['Genre', 'Days to Release', 'New Release Revenue', 'UK Box Office']
df2 = df.loc[:, model_columns]

# Correlation matrix between revenue and box office
revenue_vs_box_office = df2.loc[:, ['New Release Revenue', 'UK Box Office']]
print(revenue_vs_box_office.corr())

## 7. Feature Engineering

In [None]:
# Define features and target
X = df2[['Genre', 'Days to Release', 'UK Box Office']]

# Encode 'Genre' explicitly. Without `columns=`, get_dummies only converts
# object/string/category columns, so a numerically coded Genre would be left
# as one numeric feature instead of the Genre_* indicator columns.
# drop_first=True drops the first genre level, which becomes the baseline.
X = pd.get_dummies(X, columns=['Genre'], drop_first=True, dtype=int)

# Target: the 'New Release Revenue' column
y = df2['New Release Revenue']

## 8. Train-Test Split

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=10,
)

## 9. Fit Linear Regression Model

In [None]:
# Fit a linear regression on the training split (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

## 10. Model Summary

In [None]:
# Report the fitted intercept
print("Intercept:", model.intercept_)

# One row per feature column, holding its fitted coefficient
coeff_parameter = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
print(coeff_parameter)

## 11. Predictions and Visualisation

In [None]:
# Predict the target for the held-out test features
predictions = model.predict(X_test)

# Scatter of actual vs predicted values with a fitted regression line
ax = sns.regplot(x=y_test, y=predictions)
ax.set_xlabel("Actual New Release Revenue")
ax.set_ylabel("Predicted New Release Revenue")
ax.set_title("Actual vs Predicted Revenue")
plt.show()

## 12. OLS Summary using Statsmodels

In [None]:
# Prepend a constant column to the training features
X_train_sm = sm.add_constant(X_train)

# Fit ordinary least squares on the training split and print the full report
ols_model = sm.OLS(endog=y_train, exog=X_train_sm).fit()
print(ols_model.summary())

## 13. Example Prediction

In [None]:
# Create a test example for prediction. The numeric inputs are log-transformed
# here because the training columns were log-transformed as well.
test_example = {
    'Days to Release': [np.log(45)],
    'UK Box Office': [np.log(5500000)],
    'Genre_1': [0],
    'Genre_2': [0],
    'Genre_3': [0],
    'Genre_4': [0],
    'Genre_5': [1]
}
dftest = pd.DataFrame(test_example)

# Reject feature names that are not in the training design matrix, rather
# than letting reindex silently discard them below.
unknown_columns = dftest.columns.difference(X.columns)
if not unknown_columns.empty:
    raise ValueError(
        f"Test example has columns not used in training: {list(unknown_columns)}"
    )

# Match the training column set and order exactly. Training columns absent
# from the example are filled with 0, which is only meaningful for the
# Genre_* indicator columns.
dftest = dftest.reindex(columns=X.columns, fill_value=0)

# Predict and convert back from log scale
predicted_log = model.predict(dftest)
predicted_value = np.exp(predicted_log)
print("Predicted New Release Revenue:", predicted_value)