# Detailed Hypothesis Testing on Movie Dataset
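This notebook performs a series of hypothesis tests on a movie dataset to determine whether various features are associated with movie success. Each test reports its test statistic, p-value, and an effect size (Cohen's d for the t-tests, Cramér's V for the Chi-squared tests).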

**Tests Included:**
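1. Budget vs. Success (Welch's t-test, one-sided)
2. Genre vs. Success (Chi-squared test of independence, 15 most frequent genres)
3. Vote Average vs. Success (Welch's t-test, one-sided)
4. Runtime vs. Success (Welch's t-test, two-sided)
5. Vote Count vs. Success (Welch's t-test, one-sided)
6. Certification vs. Success (Chi-squared test of independence)
7. Country vs. Success (Chi-squared test of independence, 10 most frequent countries)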

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Significance level for the analysis. In the cells visible here it is only
# echoed by the summary cell; the test cells print p-values without comparing
# them against this threshold, so that comparison is left to the reader.
alpha = 0.05

In [None]:
# Load the movie dataset. When the CSV is missing, `df` is set to None so
# that the later cells (which all check `df is not None`) skip their work.
try:
    df = pd.read_csv('dataset/moviesDb.csv')
except FileNotFoundError:
    df = None
    print("Error: moviesDb.csv not found. Cannot perform detailed hypothesis testing.")
else:
    print("Dataset loaded successfully for detailed hypothesis testing!")

# Last expression of the cell: preview the first rows when data is available.
None if df is None else df.head()

## Hypothesis Test: Budget vs. Success

In [None]:
# Budget vs. success: one-sided Welch t-test plus Cohen's d.
if df is not None:
    # Split the budget column by outcome, discarding missing budgets.
    budgets = df['budget']
    group1 = budgets[df['success'] == True].dropna()   # success == True
    group2 = budgets[df['success'] == False].dropna()  # success == False
    for sample in (group1, group2):
        display(sample.describe())

    if len(group1) <= 1 or len(group2) <= 1:
        # Each group needs at least two observations.
        print("Insufficient data for t-test on budget.")
    else:
        # equal_var=False selects Welch's t-test; alternative='greater' makes
        # the alternative hypothesis "mean of group1 > mean of group2".
        ttest = stats.ttest_ind(
            group1, group2, equal_var=False, alternative='greater'
        )
        print(f"t-statistic = {ttest.statistic:.4f}, p-value = {ttest.pvalue:.4f}")

        # Effect size: Cohen's d using the pooled standard deviation.
        n1 = len(group1)
        n2 = len(group2)
        mean1 = group1.mean()
        mean2 = group2.mean()
        std1 = group1.std()
        std2 = group2.std()
        pooled_variance = ((n1 - 1)*std1**2 + (n2 - 1)*std2**2)/(n1 + n2 - 2)
        pooled_std = np.sqrt(pooled_variance)
        cohen_d = (mean1 - mean2) / pooled_std
        print(f"Cohen's d = {cohen_d:.4f}")

## Hypothesis Test: Vote Average vs. Success

In [None]:
# Vote average vs. success: one-sided Welch t-test plus Cohen's d.
if df is not None:
    # Split the vote_average column by outcome, discarding missing values.
    ratings = df['vote_average']
    group1 = ratings[df['success'] == True].dropna()   # success == True
    group2 = ratings[df['success'] == False].dropna()  # success == False
    for sample in (group1, group2):
        display(sample.describe())

    if len(group1) <= 1 or len(group2) <= 1:
        # Each group needs at least two observations.
        print("Insufficient data for t-test on vote_average.")
    else:
        # equal_var=False selects Welch's t-test; alternative='greater' makes
        # the alternative hypothesis "mean of group1 > mean of group2".
        ttest = stats.ttest_ind(
            group1, group2, equal_var=False, alternative='greater'
        )
        print(f"t-statistic = {ttest.statistic:.4f}, p-value = {ttest.pvalue:.4f}")

        # Effect size: Cohen's d using the pooled standard deviation.
        n1 = len(group1)
        n2 = len(group2)
        mean1 = group1.mean()
        mean2 = group2.mean()
        std1 = group1.std()
        std2 = group2.std()
        pooled_variance = ((n1 - 1)*std1**2 + (n2 - 1)*std2**2)/(n1 + n2 - 2)
        pooled_std = np.sqrt(pooled_variance)
        cohen_d = (mean1 - mean2) / pooled_std
        print(f"Cohen's d = {cohen_d:.4f}")

## Hypothesis Test: Runtime vs. Success

In [None]:
# Runtime vs. success: two-sided Welch t-test plus Cohen's d.
if df is not None:
    # Split the runtime column by outcome, discarding missing values.
    runtimes = df['runtime']
    group1 = runtimes[df['success'] == True].dropna()   # success == True
    group2 = runtimes[df['success'] == False].dropna()  # success == False
    for sample in (group1, group2):
        display(sample.describe())

    if len(group1) <= 1 or len(group2) <= 1:
        # Each group needs at least two observations.
        print("Insufficient data for t-test on runtime.")
    else:
        # equal_var=False selects Welch's t-test. Unlike the other t-test
        # cells, this one is two-sided: no direction is assumed for runtime.
        ttest = stats.ttest_ind(
            group1, group2, equal_var=False, alternative='two-sided'
        )
        print(f"t-statistic = {ttest.statistic:.4f}, p-value = {ttest.pvalue:.4f}")

        # Effect size: Cohen's d using the pooled standard deviation.
        n1 = len(group1)
        n2 = len(group2)
        mean1 = group1.mean()
        mean2 = group2.mean()
        std1 = group1.std()
        std2 = group2.std()
        pooled_variance = ((n1 - 1)*std1**2 + (n2 - 1)*std2**2)/(n1 + n2 - 2)
        pooled_std = np.sqrt(pooled_variance)
        cohen_d = (mean1 - mean2) / pooled_std
        print(f"Cohen's d = {cohen_d:.4f}")

## Hypothesis Test: Vote Count vs. Success

In [None]:
# Vote count vs. success: one-sided Welch t-test plus Cohen's d.
if df is not None:
    # Split the vote_count column by outcome, discarding missing values.
    vote_counts = df['vote_count']
    group1 = vote_counts[df['success'] == True].dropna()   # success == True
    group2 = vote_counts[df['success'] == False].dropna()  # success == False
    for sample in (group1, group2):
        display(sample.describe())

    if len(group1) <= 1 or len(group2) <= 1:
        # Each group needs at least two observations.
        print("Insufficient data for t-test on vote_count.")
    else:
        # equal_var=False selects Welch's t-test; alternative='greater' makes
        # the alternative hypothesis "mean of group1 > mean of group2".
        ttest = stats.ttest_ind(
            group1, group2, equal_var=False, alternative='greater'
        )
        print(f"t-statistic = {ttest.statistic:.4f}, p-value = {ttest.pvalue:.4f}")

        # Effect size: Cohen's d using the pooled standard deviation.
        n1 = len(group1)
        n2 = len(group2)
        mean1 = group1.mean()
        mean2 = group2.mean()
        std1 = group1.std()
        std2 = group2.std()
        pooled_variance = ((n1 - 1)*std1**2 + (n2 - 1)*std2**2)/(n1 + n2 - 2)
        pooled_std = np.sqrt(pooled_variance)
        cohen_d = (mean1 - mean2) / pooled_std
        print(f"Cohen's d = {cohen_d:.4f}")

## Hypothesis Test: Genre vs. Success

In [None]:
# Genre vs. success: Chi-squared test on the 15 most frequent genres,
# with Cramer's V as the effect size.
if df is not None:
    # Restrict the test to the 15 most frequent genre values.
    genre_counts = df['genre'].value_counts()
    top_values = genre_counts.nlargest(15).index.tolist()
    in_top_genres = df['genre'].isin(top_values)
    filtered_df = df[in_top_genres].copy()

    if filtered_df.empty:
        print("Filtered data is empty.")
    else:
        crosstab = pd.crosstab(filtered_df['genre'], filtered_df['success'])
        n_rows, n_cols = crosstab.shape
        if n_rows <= 1 or n_cols <= 1:
            # The test needs at least a 2x2 contingency table.
            print("Insufficient dimensions for Chi-squared test.")
        else:
            chi2, p, dof, expected = stats.chi2_contingency(crosstab)
            # Show observed counts, then expected counts with matching labels.
            display(crosstab)
            expected_table = pd.DataFrame(
                expected, index=crosstab.index, columns=crosstab.columns
            )
            display(expected_table)
            print(f"Chi-squared: {chi2:.4f}, p-value: {p:.4f}, dof: {dof}")

            # Cramer's V = sqrt(chi2 / (n * (min(rows, cols) - 1))).
            n = crosstab.sum().sum()
            min_dim = min(crosstab.shape) - 1
            if min_dim > 0:
                cramers_v = np.sqrt(chi2 / (n * min_dim))
            else:
                cramers_v = float('nan')
            print(f"Cramer's V: {cramers_v:.4f}")

            # Warn when any expected count falls below 5.
            total_cells = expected.size
            low_expected = (expected < 5).sum()
            if low_expected > 0:
                print(f"Warning: {low_expected} out of {total_cells} cells have expected count < 5.")

## Hypothesis Test: Certification vs. Success

In [None]:
# Certification vs. success: Chi-squared test of independence over all
# certification values, with Cramer's V as the effect size.
if df is not None:
    # Keep only rows that have both a certification and an outcome. With its
    # default dropna=True, pd.crosstab is expected to leave such rows out of
    # the table anyway, so the table should be unchanged; filtering up front
    # makes the emptiness check below meaningful and avoids copying the whole
    # frame just to read it.
    filtered_df = df.dropna(subset=['certification_US', 'success'])

    if not filtered_df.empty:
        crosstab = pd.crosstab(filtered_df['certification_US'], filtered_df['success'])
        # The test needs at least a 2x2 contingency table.
        if crosstab.shape[0] > 1 and crosstab.shape[1] > 1:
            chi2, p, dof, expected = stats.chi2_contingency(crosstab)
            # Show observed counts, then expected counts with matching labels.
            display(crosstab)
            display(pd.DataFrame(expected, index=crosstab.index, columns=crosstab.columns))
            print(f"Chi-squared: {chi2:.4f}, p-value: {p:.4f}, dof: {dof}")

            # Cramer's V = sqrt(chi2 / (n * (min(rows, cols) - 1))).
            n = crosstab.sum().sum()
            min_dim = min(crosstab.shape) - 1
            cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else float('nan')
            print(f"Cramer's V: {cramers_v:.4f}")

            # Warn when any expected count falls below 5.
            low_expected = (expected < 5).sum()
            total_cells = expected.size
            if low_expected > 0:
                print(f"Warning: {low_expected} out of {total_cells} cells have expected count < 5.")
        else:
            print("Insufficient dimensions for Chi-squared test.")
    else:
        print("Filtered data is empty.")

## Hypothesis Test: Country vs. Success

In [None]:
# Country vs. success: Chi-squared test on the 10 most frequent countries,
# with Cramer's V as the effect size.
if df is not None:
    # Restrict the test to the 10 most frequent country values.
    country_counts = df['country'].value_counts()
    top_values = country_counts.nlargest(10).index.tolist()
    in_top_countries = df['country'].isin(top_values)
    filtered_df = df[in_top_countries].copy()

    if filtered_df.empty:
        print("Filtered data is empty.")
    else:
        crosstab = pd.crosstab(filtered_df['country'], filtered_df['success'])
        n_rows, n_cols = crosstab.shape
        if n_rows <= 1 or n_cols <= 1:
            # The test needs at least a 2x2 contingency table.
            print("Insufficient dimensions for Chi-squared test.")
        else:
            chi2, p, dof, expected = stats.chi2_contingency(crosstab)
            # Show observed counts, then expected counts with matching labels.
            display(crosstab)
            expected_table = pd.DataFrame(
                expected, index=crosstab.index, columns=crosstab.columns
            )
            display(expected_table)
            print(f"Chi-squared: {chi2:.4f}, p-value: {p:.4f}, dof: {dof}")

            # Cramer's V = sqrt(chi2 / (n * (min(rows, cols) - 1))).
            n = crosstab.sum().sum()
            min_dim = min(crosstab.shape) - 1
            if min_dim > 0:
                cramers_v = np.sqrt(chi2 / (n * min_dim))
            else:
                cramers_v = float('nan')
            print(f"Cramer's V: {cramers_v:.4f}")

            # Warn when any expected count falls below 5.
            total_cells = expected.size
            low_expected = (expected < 5).sum()
            if low_expected > 0:
                print(f"Warning: {low_expected} out of {total_cells} cells have expected count < 5.")

## Summary and Interpretation Guidance

In [None]:
# Closing note: echo the significance level and point back to the test cells.
summary_lines = (
    f"Alpha level used: {alpha}",
    "Refer to earlier cells for detailed test outputs, p-values, and effect sizes.",
)
for summary_line in summary_lines:
    print(summary_line)