In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import locale

In [None]:
# Display configuration: a column limit of None removes the cap, so wide
# DataFrames are shown with every column instead of being elided.
pd.options.display.max_columns = None

In [None]:
# Load the movie metadata CSV (the path is relative to the notebook's working
# directory) into `df`, then preview the first rows as the cell output.
movie_csv_path = "../../resources/movie_metadata.csv"
df = pd.read_csv(movie_csv_path)
df.head()

In [None]:
# Count the rows whose budget is missing (isna is the same check as isnull).
df["budget"].isna().sum()

In [None]:
# Summary statistics for the budget column. The float format is overridden
# only while printing, so the values appear with two fixed decimals and the
# global pandas display settings are left as they were.
budget_summary = df['budget'].describe()
with pd.option_context('display.float_format', '{:.2f}'.format):
    print(budget_summary)

In [None]:
# Print the largest and smallest budget, formatted with en_US currency
# conventions (currency symbol, thousands grouping, two decimals).
# NOTE: setlocale() changes the locale for the whole process and raises
# locale.Error when 'en_US.UTF-8' is not installed on the host.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# The values come from the 'budget' column, so the labels say "Budget"
# (they previously read "Gross", which did not match the data printed).
print(f"Maximum Budget: {locale.currency(df['budget'].max(), grouping=True)}")
print(f"Minimum Budget: {locale.currency(df['budget'].min(), grouping=True)}")

In [None]:
# Distribution of the raw budget values, side by side:
# a horizontal boxplot on the left and a 10-bin histogram on the right.
# Missing budgets are dropped before plotting.
budget_values = df['budget'].dropna()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
box_ax, hist_ax = axes

# Left panel: boxplot
box_ax.boxplot(budget_values, vert=False, patch_artist=True)
box_ax.set(title="Boxplot of Budget Values", xlabel="Budget")

# Right panel: histogram
hist_ax.hist(budget_values, bins=10, edgecolor='k', alpha=0.7)
hist_ax.set(title="Histogram of Budget Values", xlabel="Budget", ylabel="Frequency")

fig.tight_layout()
plt.show()

In [None]:
# IQR-based outlier detection on the budget column.
# NOTE(review): the series is named `gross_cleaned` although it holds budget
# values; the name is kept because other cells may reference it.
gross_cleaned = df['budget'].dropna()

# Quartiles (25th and 75th percentiles) and the range between them
Q1 = gross_cleaned.quantile(0.25)
Q3 = gross_cleaned.quantile(0.75)
IQR = Q3 - Q1

# The outlier boundaries sit 1.5 * IQR beyond each quartile
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

# A value is an outlier when it falls outside either boundary
too_low = gross_cleaned < lower_bound
too_high = gross_cleaned > upper_bound
outliers = gross_cleaned[too_low | too_high]

print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
print("Outliers:")
print(outliers)

In [None]:
# Cap the budgets at the IQR boundaries computed in the outlier cell: values
# below `lower_bound` / above `upper_bound` are replaced by the boundary.
# The result is stored in a new column, so 'budget' itself is not modified.
df['budget_capped'] = df['budget'].clip(lower_bound, upper_bound)

# Number of non-null entries in the capped column
df['budget_capped'].count()

In [None]:
# Distribution of the capped budget values, side by side:
# a horizontal boxplot on the left and a 10-bin histogram on the right.
# Missing values are dropped before plotting.
capped_values = df['budget_capped'].dropna()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
capped_box_ax, capped_hist_ax = axes

# Left panel: boxplot
capped_box_ax.boxplot(capped_values, vert=False, patch_artist=True)
capped_box_ax.set(title="Boxplot of Budget Capped Values", xlabel="Budget")

# Right panel: histogram
capped_hist_ax.hist(capped_values, bins=10, edgecolor='k', alpha=0.7)
capped_hist_ax.set(title="Histogram of Budget Capped Values", xlabel="Budget", ylabel="Frequency")

fig.tight_layout()
plt.show()