In [None]:
def eda_summary(df, file_name):
    """Print a plain-text exploratory summary of a DataFrame to stdout.

    Sections, in order: a banner containing ``file_name``, the shape, the
    column dtypes, ``df.head()``, ``df.describe()``, the missing-value count
    of every column that has at least one missing value (or a
    "No missing values" line), the number of duplicated rows and the deep
    memory usage in MB.

    Args:
        df: pandas DataFrame to summarise.
        file_name: Label shown in the banner.

    Returns:
        None.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"ANALYSIS: {file_name}")
    print(rule)
    # The multiplication sign is written as an escape so the source stays ASCII;
    # the literal character had been corrupted into two mojibake characters.
    print(f"\nShape: {df.shape[0]} rows \u00d7 {df.shape[1]} columns")
    print("\nColumn Names and Types:")
    print(df.dtypes)
    print("\nFirst few rows:")
    print(df.head())
    print("\nBasic Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    missing = df.isnull().sum()
    if missing.sum() > 0:
        # Only list the columns that actually contain missing values.
        print(missing[missing > 0])
    else:
        print("No missing values")
    print(f"\nDuplicate Rows: {df.duplicated().sum()}")
    # deep=True also counts the payload of object (string) columns.
    print(f"\nMemory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Every CSV name found in `files`, in alphabetical order.
csv_files = sorted(name for name in files if name.endswith('.csv'))
# Loaded frames, keyed by file name; the per-file cells look their data up here.
data_dict = {}

# Quick-run switch: when on, only the first 200 rows of each file are read.
QUICK_RUN = True
DEFAULT_NROWS = None if not QUICK_RUN else 200

for csv_file in csv_files:
    file_path = os.path.join(data_folder, csv_file)
    try:
        nrows = DEFAULT_NROWS
        df = pd.read_csv(file_path, nrows=nrows)
        data_dict[csv_file] = df
        eda_summary(df, csv_file)
    except Exception as exc:
        # Best effort: report the failure and carry on with the next file.
        print(f"Error loading {csv_file}: {exc}")

# Detailed File-by-File Data Analysis

In [None]:
# Stacked histograms: one panel per numeric column of videoInfo.csv.
if 'videoInfo.csv' in data_dict:
    df_video = data_dict['videoInfo.csv']
    numeric_cols = df_video.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_video[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# categoryVotes.csv at a glance: histogram of the first numeric column (left)
# and a bar chart of the most frequent values of the first text column (right).
if 'categoryVotes.csv' in data_dict:
    df_catvotes = data_dict['categoryVotes.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    numeric_cols = df_catvotes.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        axes[0].hist(df_catvotes[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='gold')
        axes[0].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        axes[0].grid(axis='y', alpha=0.3)

    categorical_cols = df_catvotes.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        # Cap the bars: the first object column may hold one distinct value per row,
        # which would otherwise draw one bar and one tick label per value.
        max_bars = 10
        all_counts = df_catvotes[col].value_counts()
        # value_counts() sorts by descending frequency, so head() keeps the most common values.
        value_counts = all_counts.head(max_bars)
        bar_title = f'Top {max_bars} {col}' if len(all_counts) > max_bars else f'Distribution of {col}'
        axes[1].bar(range(len(value_counts)), value_counts.values, edgecolor='black', alpha=0.7, color='lightskyblue')
        axes[1].set_xticks(range(len(value_counts)))
        # ha='right' anchors each rotated label under its own bar.
        axes[1].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[1].set_title(bar_title, fontsize=12, fontweight='bold')
        axes[1].set_ylabel('Count')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# casualVoteTitles.csv at a glance: histogram of the first numeric column (left)
# and a bar chart of the most frequent values of the first text column (right).
if 'casualVoteTitles.csv' in data_dict:
    df_casualvt = data_dict['casualVoteTitles.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    numeric_cols = df_casualvt.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        axes[0].hist(df_casualvt[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='papayawhip')
        axes[0].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        axes[0].grid(axis='y', alpha=0.3)

    categorical_cols = df_casualvt.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        # Cap the bars: the first object column may hold one distinct value per row,
        # which would otherwise draw one bar and one tick label per value.
        max_bars = 10
        all_counts = df_casualvt[col].value_counts()
        # value_counts() sorts by descending frequency, so head() keeps the most common values.
        value_counts = all_counts.head(max_bars)
        bar_title = f'Top {max_bars} {col}' if len(all_counts) > max_bars else f'Distribution of {col}'
        axes[1].bar(range(len(value_counts)), value_counts.values, edgecolor='black', alpha=0.7, color='lightseagreen')
        axes[1].set_xticks(range(len(value_counts)))
        # ha='right' anchors each rotated label under its own bar.
        axes[1].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[1].set_title(bar_title, fontsize=12, fontweight='bold')
        axes[1].set_ylabel('Count')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of casualVoteTitles.csv: shape, columns, dtypes, head, describe() and null counts.
if 'casualVoteTitles.csv' in data_dict:
    df_casualvt = data_dict['casualVoteTitles.csv']
    rule = "=" * 80
    print(rule, "casualVoteTitles.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_casualvt.shape}",
        f"\nColumns: {list(df_casualvt.columns)}",
        f"\nData Types:\n{df_casualvt.dtypes}",
        f"\nFirst rows:\n{df_casualvt.head()}",
        f"\nBasic Statistics:\n{df_casualvt.describe()}",
        f"\nMissing Values:\n{df_casualvt.isnull().sum()}",
    ):
        print(section)

## casualVoteTitles.csv

In [None]:
# Stacked histograms: one panel per numeric column of unlistedVideos.csv.
if 'unlistedVideos.csv' in data_dict:
    df_unlisted = data_dict['unlistedVideos.csv']
    numeric_cols = df_unlisted.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_unlisted[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='navajowhite')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of unlistedVideos.csv: shape, columns, dtypes, head, describe() and null counts.
if 'unlistedVideos.csv' in data_dict:
    df_unlisted = data_dict['unlistedVideos.csv']
    rule = "=" * 80
    print(rule, "unlistedVideos.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_unlisted.shape}",
        f"\nColumns: {list(df_unlisted.columns)}",
        f"\nData Types:\n{df_unlisted.dtypes}",
        f"\nFirst rows:\n{df_unlisted.head()}",
        f"\nBasic Statistics:\n{df_unlisted.describe()}",
        f"\nMissing Values:\n{df_unlisted.isnull().sum()}",
    ):
        print(section)

## unlistedVideos.csv

In [None]:
# Stacked histograms: one panel per numeric column of warnings.csv.
if 'warnings.csv' in data_dict:
    df_warnings = data_dict['warnings.csv']
    numeric_cols = df_warnings.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_warnings[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='moccasin')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of warnings.csv: shape, columns, dtypes, head, describe() and null counts.
if 'warnings.csv' in data_dict:
    df_warnings = data_dict['warnings.csv']
    rule = "=" * 80
    print(rule, "warnings.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_warnings.shape}",
        f"\nColumns: {list(df_warnings.columns)}",
        f"\nData Types:\n{df_warnings.dtypes}",
        f"\nFirst rows:\n{df_warnings.head()}",
        f"\nBasic Statistics:\n{df_warnings.describe()}",
        f"\nMissing Values:\n{df_warnings.isnull().sum()}",
    ):
        print(section)

## warnings.csv

In [None]:
# lockCategories.csv at a glance: histogram of the first numeric column (left)
# and a bar chart of the most frequent values of the first text column (right).
if 'lockCategories.csv' in data_dict:
    df_lock = data_dict['lockCategories.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    numeric_cols = df_lock.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        axes[0].hist(df_lock[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='bisque')
        axes[0].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        axes[0].grid(axis='y', alpha=0.3)

    categorical_cols = df_lock.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        # Cap the bars: the first object column may hold one distinct value per row,
        # which would otherwise draw one bar and one tick label per value.
        max_bars = 10
        all_counts = df_lock[col].value_counts()
        # value_counts() sorts by descending frequency, so head() keeps the most common values.
        value_counts = all_counts.head(max_bars)
        bar_title = f'Top {max_bars} {col}' if len(all_counts) > max_bars else f'Distribution of {col}'
        axes[1].bar(range(len(value_counts)), value_counts.values, edgecolor='black', alpha=0.7, color='peachpuff')
        axes[1].set_xticks(range(len(value_counts)))
        # ha='right' anchors each rotated label under its own bar.
        axes[1].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[1].set_title(bar_title, fontsize=12, fontweight='bold')
        axes[1].set_ylabel('Count')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of lockCategories.csv: shape, columns, dtypes, head, describe() and null counts.
if 'lockCategories.csv' in data_dict:
    df_lock = data_dict['lockCategories.csv']
    rule = "=" * 80
    print(rule, "lockCategories.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_lock.shape}",
        f"\nColumns: {list(df_lock.columns)}",
        f"\nData Types:\n{df_lock.dtypes}",
        f"\nFirst rows:\n{df_lock.head()}",
        f"\nBasic Statistics:\n{df_lock.describe()}",
        f"\nMissing Values:\n{df_lock.isnull().sum()}",
    ):
        print(section)

## lockCategories.csv

In [None]:
# Stacked histograms: one panel per numeric column of thumbnailTimestamps.csv.
if 'thumbnailTimestamps.csv' in data_dict:
    df_thumbts = data_dict['thumbnailTimestamps.csv']
    numeric_cols = df_thumbts.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_thumbts[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='lavender')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of thumbnailTimestamps.csv: shape, columns, dtypes, head, describe() and null counts.
if 'thumbnailTimestamps.csv' in data_dict:
    df_thumbts = data_dict['thumbnailTimestamps.csv']
    rule = "=" * 80
    print(rule, "thumbnailTimestamps.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_thumbts.shape}",
        f"\nColumns: {list(df_thumbts.columns)}",
        f"\nData Types:\n{df_thumbts.dtypes}",
        f"\nFirst rows:\n{df_thumbts.head()}",
        f"\nBasic Statistics:\n{df_thumbts.describe()}",
        f"\nMissing Values:\n{df_thumbts.isnull().sum()}",
    ):
        print(section)

## thumbnailTimestamps.csv

In [None]:
# thumbnailVotes.csv at a glance: histogram of the first numeric column (left)
# and a bar chart of the most frequent values of the first text column (right).
if 'thumbnailVotes.csv' in data_dict:
    df_thumbvotes = data_dict['thumbnailVotes.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    numeric_cols = df_thumbvotes.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        axes[0].hist(df_thumbvotes[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='plum')
        axes[0].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        axes[0].grid(axis='y', alpha=0.3)

    categorical_cols = df_thumbvotes.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        # Cap the bars: the first object column may hold one distinct value per row,
        # which would otherwise draw one bar and one tick label per value.
        max_bars = 10
        all_counts = df_thumbvotes[col].value_counts()
        # value_counts() sorts by descending frequency, so head() keeps the most common values.
        value_counts = all_counts.head(max_bars)
        bar_title = f'Top {max_bars} {col}' if len(all_counts) > max_bars else f'Distribution of {col}'
        axes[1].bar(range(len(value_counts)), value_counts.values, edgecolor='black', alpha=0.7, color='khaki')
        axes[1].set_xticks(range(len(value_counts)))
        # ha='right' anchors each rotated label under its own bar.
        axes[1].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[1].set_title(bar_title, fontsize=12, fontweight='bold')
        axes[1].set_ylabel('Count')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of thumbnailVotes.csv: shape, columns, dtypes, head, describe() and null counts.
if 'thumbnailVotes.csv' in data_dict:
    df_thumbvotes = data_dict['thumbnailVotes.csv']
    rule = "=" * 80
    print(rule, "thumbnailVotes.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_thumbvotes.shape}",
        f"\nColumns: {list(df_thumbvotes.columns)}",
        f"\nData Types:\n{df_thumbvotes.dtypes}",
        f"\nFirst rows:\n{df_thumbvotes.head()}",
        f"\nBasic Statistics:\n{df_thumbvotes.describe()}",
        f"\nMissing Values:\n{df_thumbvotes.isnull().sum()}",
    ):
        print(section)

## thumbnailVotes.csv

In [None]:
# thumbnails.csv at a glance: histogram of the first numeric column (left)
# and the ten most frequent values of the first text column (right).
if 'thumbnails.csv' in data_dict:
    df_thumb = data_dict['thumbnails.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, bar_ax = axes

    numeric_cols = df_thumb.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        hist_ax.hist(df_thumb[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='pink')
        hist_ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        hist_ax.set_xlabel(col)
        hist_ax.set_ylabel('Frequency')
        hist_ax.grid(axis='y', alpha=0.3)

    categorical_cols = df_thumb.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        value_counts = df_thumb[col].value_counts().head(10)
        positions = range(len(value_counts))
        bar_ax.bar(positions, value_counts.values, edgecolor='black', alpha=0.7, color='lightsteelblue')
        bar_ax.set_xticks(positions)
        bar_ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
        bar_ax.set_title(f'Top 10 {col}', fontsize=12, fontweight='bold')
        bar_ax.set_ylabel('Count')
        bar_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of thumbnails.csv: shape, columns, dtypes, head, describe() and null counts.
if 'thumbnails.csv' in data_dict:
    df_thumb = data_dict['thumbnails.csv']
    rule = "=" * 80
    print(rule, "thumbnails.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_thumb.shape}",
        f"\nColumns: {list(df_thumb.columns)}",
        f"\nData Types:\n{df_thumb.dtypes}",
        f"\nFirst rows:\n{df_thumb.head()}",
        f"\nBasic Statistics:\n{df_thumb.describe()}",
        f"\nMissing Values:\n{df_thumb.isnull().sum()}",
    ):
        print(section)

## thumbnails.csv

In [None]:
# Stacked histograms: one panel per numeric column of vipUsers.csv.
if 'vipUsers.csv' in data_dict:
    df_vip = data_dict['vipUsers.csv']
    numeric_cols = df_vip.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_vip[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='lightcyan')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of vipUsers.csv: shape, columns, dtypes, head, describe() and null counts.
if 'vipUsers.csv' in data_dict:
    df_vip = data_dict['vipUsers.csv']
    rule = "=" * 80
    print(rule, "vipUsers.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_vip.shape}",
        f"\nColumns: {list(df_vip.columns)}",
        f"\nData Types:\n{df_vip.dtypes}",
        f"\nFirst rows:\n{df_vip.head()}",
        f"\nBasic Statistics:\n{df_vip.describe()}",
        f"\nMissing Values:\n{df_vip.isnull().sum()}",
    ):
        print(section)

## vipUsers.csv

In [None]:
# Stacked histograms: one panel per numeric column of userNames.csv.
if 'userNames.csv' in data_dict:
    df_users = data_dict['userNames.csv']
    numeric_cols = df_users.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_users[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='skyblue')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of userNames.csv: shape, columns, dtypes, head, describe() and null counts.
if 'userNames.csv' in data_dict:
    df_users = data_dict['userNames.csv']
    rule = "=" * 80
    print(rule, "userNames.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_users.shape}",
        f"\nColumns: {list(df_users.columns)}",
        f"\nData Types:\n{df_users.dtypes}",
        f"\nFirst rows:\n{df_users.head()}",
        f"\nBasic Statistics:\n{df_users.describe()}",
        f"\nMissing Values:\n{df_users.isnull().sum()}",
    ):
        print(section)

## userNames.csv

In [None]:
# Stacked histograms: one panel per numeric column of sponsorTimes.csv.
if 'sponsorTimes.csv' in data_dict:
    df_sponsor = data_dict['sponsorTimes.csv']
    numeric_cols = df_sponsor.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_sponsor[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='orange')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of sponsorTimes.csv: shape, columns, dtypes, head, describe() and null counts.
if 'sponsorTimes.csv' in data_dict:
    df_sponsor = data_dict['sponsorTimes.csv']
    rule = "=" * 80
    print(rule, "sponsorTimes.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_sponsor.shape}",
        f"\nColumns: {list(df_sponsor.columns)}",
        f"\nData Types:\n{df_sponsor.dtypes}",
        f"\nFirst rows:\n{df_sponsor.head()}",
        f"\nBasic Statistics:\n{df_sponsor.describe()}",
        f"\nMissing Values:\n{df_sponsor.isnull().sum()}",
    ):
        print(section)

## sponsorTimes.csv

In [None]:
# ratings.csv at a glance: histogram of the first numeric column (left) and a
# bar chart of the most frequent values of the first text column, ordered by
# label (right).
if 'ratings.csv' in data_dict:
    df_ratings = data_dict['ratings.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    numeric_cols = df_ratings.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        axes[0].hist(df_ratings[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='salmon')
        axes[0].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        axes[0].grid(axis='y', alpha=0.3)

    categorical_cols = df_ratings.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        # Cap the bars: the first object column may hold one distinct value per row,
        # which would otherwise draw one bar and one tick label per value.
        max_bars = 10
        all_counts = df_ratings[col].value_counts()
        # Keep the most frequent values first, then restore the label ordering this
        # cell has always used for its x axis.
        value_counts = all_counts.head(max_bars).sort_index()
        bar_title = f'Top {max_bars} {col}' if len(all_counts) > max_bars else f'Distribution of {col}'
        axes[1].bar(range(len(value_counts)), value_counts.values, edgecolor='black', alpha=0.7, color='lightgreen')
        axes[1].set_xticks(range(len(value_counts)))
        # ha='right' anchors each rotated label under its own bar.
        axes[1].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[1].set_title(bar_title, fontsize=12, fontweight='bold')
        axes[1].set_ylabel('Count')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of ratings.csv: shape, columns, dtypes, head, describe() and null counts.
if 'ratings.csv' in data_dict:
    df_ratings = data_dict['ratings.csv']
    rule = "=" * 80
    print(rule, "ratings.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_ratings.shape}",
        f"\nColumns: {list(df_ratings.columns)}",
        f"\nData Types:\n{df_ratings.dtypes}",
        f"\nFirst rows:\n{df_ratings.head()}",
        f"\nBasic Statistics:\n{df_ratings.describe()}",
        f"\nMissing Values:\n{df_ratings.isnull().sum()}",
    ):
        print(section)

## ratings.csv

In [None]:
# Text summary of categoryVotes.csv: shape, columns, dtypes, head, describe() and null counts.
if 'categoryVotes.csv' in data_dict:
    df_catvotes = data_dict['categoryVotes.csv']
    rule = "=" * 80
    print(rule, "categoryVotes.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_catvotes.shape}",
        f"\nColumns: {list(df_catvotes.columns)}",
        f"\nData Types:\n{df_catvotes.dtypes}",
        f"\nFirst rows:\n{df_catvotes.head()}",
        f"\nBasic Statistics:\n{df_catvotes.describe()}",
        f"\nMissing Values:\n{df_catvotes.isnull().sum()}",
    ):
        print(section)

## categoryVotes.csv

In [None]:
# Stacked histograms: one panel per numeric column of casualVotes.csv.
if 'casualVotes.csv' in data_dict:
    df_casual = data_dict['casualVotes.csv']
    numeric_cols = df_casual.select_dtypes(include=[np.number]).columns
    n_panels = len(numeric_cols)

    if n_panels > 0:
        fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels))
        if n_panels == 1:
            # A single panel comes back as a bare Axes; wrap it so it can be iterated.
            axes = [axes]

        for ax, col in zip(axes, numeric_cols):
            ax.hist(df_casual[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='mediumpurple')
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Text summary of casualVotes.csv: shape, columns, dtypes, head, describe() and null counts.
if 'casualVotes.csv' in data_dict:
    df_casual = data_dict['casualVotes.csv']
    rule = "=" * 80
    print(rule, "casualVotes.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_casual.shape}",
        f"\nColumns: {list(df_casual.columns)}",
        f"\nData Types:\n{df_casual.dtypes}",
        f"\nFirst rows:\n{df_casual.head()}",
        f"\nBasic Statistics:\n{df_casual.describe()}",
        f"\nMissing Values:\n{df_casual.isnull().sum()}",
    ):
        print(section)

## casualVotes.csv

In [None]:
# titleVotes.csv at a glance: histogram of the first numeric column (left)
# and a bar chart of the most frequent values of the first text column (right).
if 'titleVotes.csv' in data_dict:
    df_tvotes = data_dict['titleVotes.csv']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    numeric_cols = df_tvotes.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        col = numeric_cols[0]
        axes[0].hist(df_tvotes[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='seagreen')
        axes[0].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        axes[0].grid(axis='y', alpha=0.3)

    categorical_cols = df_tvotes.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        # Cap the bars: the first object column may hold one distinct value per row,
        # which would otherwise draw one bar and one tick label per value.
        max_bars = 10
        all_counts = df_tvotes[col].value_counts()
        # value_counts() sorts by descending frequency, so head() keeps the most common values.
        value_counts = all_counts.head(max_bars)
        bar_title = f'Top {max_bars} {col}' if len(all_counts) > max_bars else f'Distribution of {col}'
        axes[1].bar(range(len(value_counts)), value_counts.values, edgecolor='black', alpha=0.7, color='lightcoral')
        axes[1].set_xticks(range(len(value_counts)))
        # ha='right' anchors each rotated label under its own bar.
        axes[1].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[1].set_title(bar_title, fontsize=12, fontweight='bold')
        axes[1].set_ylabel('Count')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of titleVotes.csv: shape, columns, dtypes, head, describe() and null counts.
if 'titleVotes.csv' in data_dict:
    df_tvotes = data_dict['titleVotes.csv']
    rule = "=" * 80
    print(rule, "titleVotes.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_tvotes.shape}",
        f"\nColumns: {list(df_tvotes.columns)}",
        f"\nData Types:\n{df_tvotes.dtypes}",
        f"\nFirst rows:\n{df_tvotes.head()}",
        f"\nBasic Statistics:\n{df_tvotes.describe()}",
        f"\nMissing Values:\n{df_tvotes.isnull().sum()}",
    ):
        print(section)

## titleVotes.csv

In [None]:
# titles.csv overview on a 2x2 grid: title-length histogram in the top-left
# panel, then histograms of up to three numeric columns in the other panels.
if 'titles.csv' in data_dict:
    df_titles = data_dict['titles.csv']

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    if 'title' in df_titles.columns:
        title_lengths = df_titles['title'].str.len()
        axes[0, 0].hist(title_lengths.dropna(), bins=50, edgecolor='black', alpha=0.7, color='steelblue')
        axes[0, 0].set_title('Distribution of Title Lengths', fontsize=12, fontweight='bold')
        axes[0, 0].set_xlabel('Title Length (characters)')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].grid(axis='y', alpha=0.3)

    numeric_cols = df_titles.select_dtypes(include=[np.number]).columns
    # The top-left panel is reserved for title lengths, so numeric columns
    # start at flat position 1 (row-major order).
    for idx, col in enumerate(numeric_cols[:3]):
        row, col_idx = divmod(idx + 1, 2)
        axes[row, col_idx].hist(df_titles[col].dropna(), bins=50, edgecolor='black', alpha=0.7, color='coral')
        axes[row, col_idx].set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        axes[row, col_idx].set_xlabel(col)
        axes[row, col_idx].set_ylabel('Frequency')
        axes[row, col_idx].grid(axis='y', alpha=0.3)

    # Hide panels nothing was drawn on (no 'title' column, or fewer than three
    # numeric columns) instead of rendering empty, unlabeled frames.
    for panel in axes.flat:
        if not panel.has_data():
            panel.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Text summary of titles.csv: shape, columns, dtypes, head, describe() and null counts.
if 'titles.csv' in data_dict:
    df_titles = data_dict['titles.csv']
    rule = "=" * 80
    print(rule, "titles.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_titles.shape}",
        f"\nColumns: {list(df_titles.columns)}",
        f"\nData Types:\n{df_titles.dtypes}",
        f"\nFirst rows:\n{df_titles.head()}",
        f"\nBasic Statistics:\n{df_titles.describe()}",
        f"\nMissing Values:\n{df_titles.isnull().sum()}",
    ):
        print(section)

## titles.csv

In [None]:
# Text summary of videoInfo.csv: shape, columns, dtypes, head, describe() and null counts.
if 'videoInfo.csv' in data_dict:
    df_video = data_dict['videoInfo.csv']
    rule = "=" * 80
    print(rule, "videoInfo.csv - Analysis", rule, sep="\n")
    for section in (
        f"\nShape: {df_video.shape}",
        f"\nColumns: {list(df_video.columns)}",
        f"\nData Types:\n{df_video.dtypes}",
        f"\nFirst rows:\n{df_video.head()}",
        f"\nBasic Statistics:\n{df_video.describe()}",
        f"\nMissing Values:\n{df_video.isnull().sum()}",
    ):
        print(section)