# Task
Analyze the provided IMDb datasets ("https://developer.imdb.com/non-commercial-datasets/") to uncover trends and insights related to how factors like genres, directors, actors, and runtime influence movie ratings and popularity. Address the exploratory questions provided, build an interactive visualization showing a director's top 3 movies by rating and runtime, and perform predictive and descriptive analysis, documenting the methodology and findings.

## Load and inspect data

### Subtask:
Load the relevant TSV files into pandas DataFrames and inspect their structure, including columns, data types, and missing values.

**Reasoning**:
Load the specified TSV files into pandas DataFrames and inspect their structure as instructed.

## Analyze runtime correlation
### Subtask:
Examine the correlation between movie length (runtime) and rating or popularity using 'title.basics.tsv.gz' and 'title.ratings.tsv.gz'.

In [None]:
# IMDb Data Mining Project — Cleaned & Focused Visualization (Top 30K Rows)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: Load Datasets Safely with Row Limit
def load_tsv(path, nrows=30000):
    """Read the leading rows of a tab-separated file into a DataFrame.

    Args:
        path: Location of the TSV file, passed straight to ``pandas.read_csv``.
        nrows: Maximum number of data rows to read. ``None`` reads the whole
            file.

    Returns:
        A ``pandas.DataFrame`` in which every ``\\N`` field is NaN.
    """
    # Function-scope import keeps this helper usable when the cell is run alone.
    import csv

    # Quote handling is switched off: a title holding a lone '"' would
    # otherwise be read as the start of a quoted field and swallow the
    # following tabs and newlines, fusing several rows into one value.
    return pd.read_csv(
        path,
        sep='\t',
        na_values='\\N',
        nrows=nrows,
        low_memory=False,
        quoting=csv.QUOTE_NONE,
    )

# Step 1 (continued): read the row-capped sample of both IMDb tables.
title_basics_df = load_tsv('/content/title.basics.tsv.gz')
title_ratings_df = load_tsv('/content/title.ratings.tsv.gz')

print("Data Loaded (Top 30K rows from each dataset)")
print(f"Title Basics shape: {title_basics_df.shape}")
print(f"Title Ratings shape: {title_ratings_df.shape}")

# Step 2: Clean and Convert Datatypes
# Anything that cannot be parsed as a number becomes NaN instead of raising.
title_basics_df = title_basics_df.assign(
    startYear=pd.to_numeric(title_basics_df['startYear'], errors='coerce'),
)
title_ratings_df = title_ratings_df.assign(
    averageRating=pd.to_numeric(title_ratings_df['averageRating'], errors='coerce'),
    numVotes=pd.to_numeric(title_ratings_df['numVotes'], errors='coerce'),
)

# Step 3: Merge title_basics and title_ratings
# Inner join on the title id, drop rows lacking a rating, vote count or year,
# then keep only rows whose titleType is 'movie'.
merged_df = (
    title_basics_df
    .merge(title_ratings_df, on='tconst', how='inner')
    .dropna(subset=['averageRating', 'numVotes', 'startYear'])
    .loc[lambda frame: frame['titleType'] == 'movie']
)

print(f"Merged DataFrame Shape (after filtering): {merged_df.shape}")
display(merged_df.head(3))

# Step 4: Sampling for Visualization
# At most 5000 rows, drawn with a fixed seed so reruns pick the same rows.
eda_sample = merged_df.sample(n=min(len(merged_df), 5000), random_state=42)

# 1. Average IMDb Rating Trend Over the Years
# Mean rating per release year, computed on the full merged table rather
# than on the plotting sample.
yearly_ratings = merged_df.groupby('startYear')['averageRating'].mean().dropna()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    x=yearly_ratings.index,
    y=yearly_ratings.values,
    color='blue',
    marker='o',
    ax=ax,
)
ax.set_title('Average IMDb Movie Rating Trend Over Years', fontsize=14, fontweight='bold')
ax.set_xlabel('Release Year')
ax.set_ylabel('Average Rating')
ax.grid(True)
plt.show()

# 2. Genre Popularity by Count
if 'genres' in merged_df.columns:
    # Build one row per (title, genre): discard titles without genres, split
    # the comma-separated string into a list, then explode the list.
    genre_df = merged_df.loc[merged_df['genres'].notna()]
    genre_df = genre_df.assign(genres=genre_df['genres'].str.split(',')).explode('genres')
    # value_counts() sorts by descending frequency, so the first ten entries
    # are the ten most common genres.
    top_genres = genre_df['genres'].value_counts().iloc[:10]

    fig, ax = plt.subplots(figsize=(10, 6))
    # NOTE(review): newer seaborn releases may warn when `palette` is passed
    # without `hue` — confirm against the installed version.
    sns.barplot(x=top_genres.values, y=top_genres.index, palette='mako', ax=ax)
    ax.set_title('Top 10 Most Common Movie Genres', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Movies')
    ax.set_ylabel('Genre')
    plt.show()

# 3. Ratings vs Votes (Popularity vs Quality)
# Drawn from the sample (eda_sample) instead of the full merged table.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=eda_sample,
    x='numVotes',
    y='averageRating',
    alpha=0.5,
    color='green',
    ax=ax,
)
ax.set_title('Movie Popularity vs Quality', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Votes (log scale)')
ax.set_ylabel('Average Rating')
# Log-scaled x axis, matching the axis label above.
ax.set_xscale('log')
ax.grid(True)
plt.show()

print("\nClean Visualization Completed Successfully (No Clustering, High Impact Graphs)!")


: 

In [None]:
import pandas as pd
import numpy as np

# 1. Load the title.basics.tsv.gz and title.ratings.tsv.gz datasets into pandas DataFrames.
# Quote handling is disabled for both reads: a title containing a lone '"'
# would otherwise be parsed as the start of a quoted field and absorb the
# following tabs and newlines, merging several rows into a single value.
import csv

title_basics_df = pd.read_csv(
    '/content/title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
    quoting=csv.QUOTE_NONE,
)
title_ratings_df = pd.read_csv(
    '/content/title.ratings.tsv.gz',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)

# 2. Keep only the 'movie' and 'tvMovie' title types. The explicit copy makes
#    the column assignment below act on an independent frame, not on a slice
#    of title_basics_df.
filtered_basics_df = title_basics_df.loc[
    lambda frame: frame['titleType'].isin(['movie', 'tvMovie'])
].copy()

# 3. Make runtimeMinutes numeric: the '\N' placeholder is swapped for NaN
#    first, then any other value that is not a number is coerced to NaN.
filtered_basics_df['runtimeMinutes'] = pd.to_numeric(
    filtered_basics_df['runtimeMinutes'].replace('\\N', np.nan),
    errors='coerce',
)

# 4. Inner-join the filtered basics with the ratings on 'tconst', so only
#    titles present in both tables remain.
merged_df = filtered_basics_df.merge(title_ratings_df, on='tconst', how='inner')

# 5. Discard rows missing any of the three columns used in the correlation.
cleaned_merged_df = merged_df.dropna(
    subset=['runtimeMinutes', 'averageRating', 'numVotes'],
).copy()

# 6. Pairwise correlation of runtime, rating and vote count, using the
#    default method of DataFrame.corr().
correlation_matrix = cleaned_merged_df.loc[
    :, ['runtimeMinutes', 'averageRating', 'numVotes']
].corr()

# 7. Show the resulting matrix.
print("Correlation matrix between runtimeMinutes, averageRating, and numVotes:")
display(correlation_matrix)

: 