In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Show every column when a DataFrame is displayed (no truncation with "...")
pd.options.display.max_columns = None

In [None]:
# Read the raw movie metadata CSV into a DataFrame
imdb_df = pd.read_csv('../../resources/movie_metadata.csv')

# List every column name, one per line
print(*imdb_df.columns, sep='\n')


In [None]:
# Columns excluded from the analysis
columns_to_remove = ['aspect_ratio', 'movie_facebook_likes', 'movie_imdb_link', 'facenumber_in_poster']

# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to imdb_df, a second run would otherwise raise KeyError for columns
# that the first run already removed.
imdb_df = imdb_df.drop(columns=columns_to_remove, errors='ignore')
imdb_df.head()

In [None]:
# Print the frame's structural summary, then show summary statistics as the
# cell's displayed output.
imdb_df.info()
imdb_df.describe()

In [None]:
# Restrict the frame to its numeric columns
imdb_df_numeric = imdb_df.select_dtypes(include='number')

# Pairwise correlations between the numeric columns; keep the imdb_score
# column and rank it from the largest value down
corr_matrix = imdb_df_numeric.corr()
correlation = corr_matrix['imdb_score'].sort_values(ascending=False)
correlation

In [None]:
# Count nulls in each numeric column, largest count first
null_counts = imdb_df_numeric.isna().sum()
missing_values = null_counts.sort_values(ascending=False)
missing_values

In [None]:
# Columns in which a row must have a value to be kept
columns_to_clean = [
    'color', 'num_critic_for_reviews', 'duration', 'director_facebook_likes',
    'actor_3_facebook_likes', 'actor_1_facebook_likes', 'plot_keywords',
    'language', 'content_rating', 'actor_2_facebook_likes',
]

# Discard every row that is null in at least one of those columns
imdb_df = imdb_df.dropna(subset=columns_to_clean, how='any')

# Re-indexed copy of the surviving rows (imdb_df itself keeps its old index)
imdb_df_clean = imdb_df.reset_index(drop=True)

# Report the resulting shape
print(f"DataFrame shape after removing additional nulls: {imdb_df.shape}")

# Report the nulls that remain, per column, largest count first
remaining_nulls = imdb_df.isnull().sum()
print(remaining_nulls.sort_values(ascending=False))

In [None]:
# Print the frame's structural summary again, then show summary statistics as
# the cell's displayed output, to see the effect of the row filtering so far.
imdb_df.info()
imdb_df.describe()

In [None]:
# Keep only rows that have both 'gross' and 'budget', then renumber the index
# so it is contiguous again.
imdb_df_clean = (
    imdb_df
    .dropna(subset=['gross', 'budget'])
    .reset_index(drop=True)
)

# Compare the shapes before and after the filter
print(f"Original DataFrame shape: {imdb_df.shape}")
print(f"Cleaned DataFrame shape: {imdb_df_clean.shape}")

# Remaining nulls per column, largest count first. Left as the cell's last
# expression so the summary is actually displayed instead of being computed
# and silently discarded.
missing_values_clean = imdb_df_clean.isnull().sum().sort_values(ascending=False)
missing_values_clean

Replace null actor names with a placeholder.

In [None]:
# Marker written wherever an actor name is missing
PLACEHOLDER = 'Unknown'

# Fill the gaps in each of the three actor name columns
actor_name_columns = ['actor_1_name', 'actor_2_name', 'actor_3_name']
for name_column in actor_name_columns:
    imdb_df_clean[name_column] = imdb_df_clean[name_column].fillna(PLACEHOLDER)

# Show the first ten rows to confirm the replacement
print(imdb_df_clean[actor_name_columns].head(10))


Preserve the original names for identification and further analysis.

In [None]:
# Keep a copy of each name column under an 'original_' prefix
name_columns = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
for name_column in name_columns:
    imdb_df_clean[f'original_{name_column}'] = imdb_df_clean[name_column]

# Preview the copied columns
original_columns = [f'original_{name_column}' for name_column in name_columns]
print(imdb_df_clean[original_columns].head())

Target encoding: compute each person's mean IMDb score and map it back onto the actor and director name columns of the DataFrame.
The encoded columns can then serve as numeric features, with each name replaced by that person's mean IMDb score.

In [None]:
# Name columns that get target-encoded
columns_to_preprocess = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']

# Normalise the names (string type, trimmed, lower-cased). The source is the
# preserved 'original_*' copy of each column rather than the column itself:
# the columns are overwritten with floats further down, so reading from the
# originals keeps this cell re-runnable with the same result every time.
for col in columns_to_preprocess:
    imdb_df_clean[col] = imdb_df_clean[f'original_{col}'].astype(str).str.strip().str.lower()

# Pool all four name columns into one long table so that a person's mean
# covers every row they appear in, whichever column they appear under.
melted_df = imdb_df_clean.melt(
    id_vars=['imdb_score'],
    value_vars=columns_to_preprocess,
    value_name='act_dir'
)
person_mean_mapping = melted_df.groupby('act_dir')['imdb_score'].mean()

# Apply target encoding; names missing from the mapping fall back to the
# overall mean score.
# NOTE(review): the mapping is built from every row, including each row's own
# imdb_score, so these encoded columns leak the target. They are fine for
# exploration, but a held-out evaluation should fit the mapping on training
# rows only.
overall_mean_score = imdb_df_clean['imdb_score'].mean()
for col in columns_to_preprocess:
    imdb_df_clean[col] = imdb_df_clean[col].map(person_mean_mapping).fillna(overall_mean_score)

# Example: Get mean IMDb score for a specific actor
tom_cruise_score = person_mean_mapping.get('tom cruise', None)
print(f"Mean IMDb Score for Tom Cruise: {tom_cruise_score}")

# Verify the DataFrame: original names next to their encoded values
print(imdb_df_clean[['original_actor_1_name', 'original_actor_2_name', 'original_actor_3_name', 'original_director_name',
               'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']].head())

In [None]:
# Treat every column of imdb_df_clean as a candidate feature
features = imdb_df_clean.columns

# Null count for each of those columns
null_counts_by_feature = imdb_df_clean[features].isna().sum()
print(null_counts_by_feature)

In [None]:
# Prepare the data
NUMERIC_FEATURES = ['num_voted_users', 'num_critic_for_reviews', 'num_user_for_reviews',
                    'duration', 'gross', 'director_facebook_likes', 'cast_total_facebook_likes',
                    'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes',
                    'budget', 'title_year']
NAME_FEATURES = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
N_ENCODING_FOLDS = 5  # folds used for the out-of-fold encoding of training rows

# Numeric columns first, then the name columns
features = NUMERIC_FEATURES + NAME_FEATURES


def _person_means(names, scores):
    """Return the mean score per person, pooled across every column of ``names``.

    names: DataFrame of normalised person names, one column per role.
    scores: Series of target values sharing ``names``' index.
    """
    long_form = names.assign(imdb_score=scores).melt(id_vars=['imdb_score'], value_name='act_dir')
    return long_form.groupby('act_dir')['imdb_score'].mean()


def _encode_names(names, person_means, default):
    """Replace each name by its entry in ``person_means``; unseen names get ``default``."""
    return names.apply(lambda column: column.map(person_means)).fillna(default)


# Build the feature frame from the raw names kept in the 'original_*' columns,
# normalised the same way as for the exploratory encoding. The name columns
# are deliberately NOT taken pre-encoded from imdb_df_clean: that encoding is
# computed from every row's target, test rows included, which would leak the
# target into the evaluation below.
X = imdb_df_clean[NUMERIC_FEATURES].copy()
for col in NAME_FEATURES:
    X[col] = imdb_df_clean[f'original_{col}'].astype(str).str.strip().str.lower()
y = imdb_df_clean['imdb_score']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()

# Target-encode the names AFTER splitting, using training targets only.
train_names = X_train[NAME_FEATURES].copy()
test_names = X_test[NAME_FEATURES].copy()

# Training rows are encoded out-of-fold: each row's value comes from the other
# folds, so a row never sees its own score. train_test_split has already
# shuffled the rows, so consecutive positions form effectively random folds.
fold_ids = pd.Series(range(len(train_names)), index=train_names.index) % N_ENCODING_FOLDS
encoded_folds = []
for fold in range(N_ENCODING_FOLDS):
    held_out = fold_ids == fold
    if not held_out.any():
        continue
    fit_names, fit_scores = train_names[~held_out], y_train[~held_out]
    encoded_folds.append(
        _encode_names(train_names[held_out], _person_means(fit_names, fit_scores), fit_scores.mean())
    )
encoded_train = pd.concat(encoded_folds).loc[train_names.index]

# Test rows are encoded with statistics from the full training split
encoded_test = _encode_names(test_names, _person_means(train_names, y_train), y_train.mean())

# Assign column by column so each name column is replaced by a float column
for col in NAME_FEATURES:
    X_train[col] = encoded_train[col]
    X_test[col] = encoded_test[col]

# Scale the features (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR()
}

# Fit-and-score helper applied to each model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score its test-set predictions.

    The model is fitted in place. Returns a ``(mse, r2)`` tuple computed from
    ``y_test`` and the model's predictions for ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Train and evaluate models
results = {}
for name, model in models.items():
    mse, r2 = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    results[name] = {"MSE": mse, "R2": r2}

# Print results
print("Model Performance:")
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  MSE: {metrics['MSE']:.4f}")
    print(f"  R2 Score: {metrics['R2']:.4f}")
    print()

# Feature importance for Random Forest if used
if "Random Forest" in models:
    # The evaluation loop above already fitted this estimator on
    # X_train_scaled / y_train, so its importances are read directly instead
    # of training the forest a second time.
    rf_model = models["Random Forest"]
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("Feature Importance (Random Forest):")
    print(feature_importance)


# PCA for feature reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Number of components to explain 95% variance: {pca.n_components_}")

In [None]:
# Read the metrics from `results` so the charts always reflect the models as
# actually evaluated, instead of numbers copied by hand from an earlier run.
# A separate name list is used so the `models` dict of estimators is not
# overwritten.
model_names = list(results)
mse_scores = [results[name]["MSE"] for name in model_names]
r2_scores = [results[name]["R2"] for name in model_names]

# Set up the figure and axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# One bar chart per metric: (axes, values, title, y-axis label)
panels = [
    (ax1, mse_scores, 'Mean Squared Error (MSE) by Model', 'MSE'),
    (ax2, r2_scores, 'R² Score by Model', 'R² Score'),
]
for ax, scores, title, ylabel in panels:
    ax.bar(model_names, scores)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Model')
    # Fix the tick positions before setting angled labels on them
    ax.set_xticks(range(len(model_names)))
    ax.set_xticklabels(model_names, rotation=45, ha='right')

# Adjust y-axis limits for better visibility
ax1.set_ylim(0, max(mse_scores) * 1.1)
ax2.set_ylim(min(r2_scores) - 0.05, 1)

# Keep the angled labels inside the figure and render it
fig.tight_layout()
plt.show()
             

In [None]:
# Keep rows with a positive gross and add its log1p transform. .assign()
# builds a new frame, so no column is written onto a filtered slice of
# imdb_df_clean (which triggers pandas' SettingWithCopyWarning).
filtered_df = (
    imdb_df_clean[imdb_df_clean['gross'] > 0]
    .assign(gross_rev=lambda df: np.log1p(df['gross']))
)
log_gross = filtered_df['gross_rev']
scores = filtered_df['imdb_score']

# Create the scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(log_gross, scores, alpha=0.5)

# Add labels and title; the x-axis shows the log-transformed gross, so the
# label says so
plt.xlabel('Log(Gross Revenue)')
plt.ylabel('IMDB Score')
plt.title('Relationship between Gross Revenue and IMDB Score')

# Add a trend line (degree-1 least-squares fit)
z = np.polyfit(log_gross, scores, 1)
p = np.poly1d(z)
plt.plot(log_gross, p(log_gross), "r--")

# Add grid for better readability
plt.grid(True, linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:

# Number of rows (movies) per director in the cleaned data, looked up for
# every row. The raw names in 'original_director_name' are used because
# 'director_name' has been replaced by a target-encoded mean score and so
# does not measure how many movies a director made.
director_counts = imdb_df_clean['original_director_name'].value_counts()
movies_directed = imdb_df_clean['original_director_name'].map(director_counts)

# Rows without a director name have no count; leave them out of the plot
has_director = movies_directed.notna()

plt.figure(figsize=(10, 6))

# Create the hexbin plot
hb = plt.hexbin(movies_directed[has_director], imdb_df_clean.loc[has_director, 'imdb_score'],
                gridsize=20, cmap='viridis', bins='log')

# Add labels and title
plt.xlabel('Number of movies directed (By Director)')
plt.ylabel('IMDB Score')
plt.title('Density of Movie Scores by Director Experience')

# Add a color bar
plt.colorbar(hb, label='Count of Movies')

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Keep rows with a positive gross and add its log1p transform. .assign()
# builds a new frame, so no column is written onto a filtered slice of
# imdb_df_clean (which triggers pandas' SettingWithCopyWarning).
# seaborn (sns) and numpy (np) come from the notebook's import cell.
filtered_df = (
    imdb_df_clean[imdb_df_clean['gross'] > 0]
    .assign(gross_rev=lambda df: np.log1p(df['gross']))
)

fig, ax = plt.subplots(figsize=(12, 8))

# Create the scatter plot using seaborn for enhanced aesthetics
sns.scatterplot(x='gross_rev', y='imdb_score', data=filtered_df, alpha=0.6, ax=ax)

# Add labels and title
ax.set_xlabel('Log(Gross Revenue)')
ax.set_ylabel('IMDB Score')
ax.set_title('IMDB Score vs Log(Gross Revenue)')

# Add a trend line (degree-1 least-squares fit)
z = np.polyfit(filtered_df['gross_rev'], filtered_df['imdb_score'], 1)
p = np.poly1d(z)
ax.plot(filtered_df['gross_rev'], p(filtered_df['gross_rev']), "r--", label='Trend Line')

# Add grid for better readability
ax.grid(True, linestyle='--', alpha=0.7)

# Add legend
ax.legend()

# Show the plot
fig.tight_layout()
plt.show()