# 🎬 Will I Like This Movie? - SOLUTIONS
## ⚠️ Spoiler Alert! Try the workshop first!

This notebook contains the complete solutions. Only look here if you're stuck!

---

## Checkpoint 1: 🔧 Environment Setup

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make plots look nice
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Show plots inline
%matplotlib inline

print("‚úÖ All libraries imported successfully!")
print(f"üìä Pandas version: {pd.__version__}")
print(f"üî¢ NumPy version: {np.__version__}")

## Checkpoint 2: 📂 Load the Data

In [None]:
# Read the movie dataset from its CSV file (path is relative to this notebook)
movies_csv = '../data/movies.csv'
df = pd.read_csv(movies_csv)

# One row per movie, so the row count is the number of movies loaded
print(f"üé¨ Loaded {len(df)} movies!")

In [None]:
# SOLUTION: preview the dataset by showing its first 5 rows
df.head(n=5)

In [None]:
# SOLUTION: Check the shape of the dataset
# `shape` is a (number_of_rows, number_of_columns) tuple; as the last
# expression in the cell it is displayed automatically.
df.shape

In [None]:
# SOLUTION: List all column names
# `columns` is the index of column labels; as the last expression in the cell
# it is displayed automatically.
df.columns

## Checkpoint 3: 🔍 Explore the Data

In [None]:
# Get summary statistics
# Called with default arguments, describe() reports count, mean, std, min,
# quartiles and max for the numeric columns of a mixed-type DataFrame.
df.describe()

In [None]:
# SOLUTION: mean of the 'vote_average' column over every movie
average_rating = df.loc[:, 'vote_average'].mean()

# Report the mean rounded to two decimals
print(f"‚≠ê Average movie rating: {average_rating:.2f}")

In [None]:
# SOLUTION: Highest-rated movie
# idxmax() gives the row label of the largest rating; .loc then fetches that
# whole row so both the title and the rating can be printed.
top_label = df['vote_average'].idxmax()
best_movie = df.loc[top_label]
print(f"üèÜ Highest rated movie: {best_movie['title']} ({best_movie['vote_average']})")

In [None]:
# SOLUTION: Lowest-rated movie
# idxmin() gives the row label of the smallest rating; .loc then fetches that
# whole row so both the title and the rating can be printed.
bottom_label = df['vote_average'].idxmin()
worst_movie = df.loc[bottom_label]
print(f"üëé Lowest rated movie: {worst_movie['title']} ({worst_movie['vote_average']})")

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()

print("Missing values per column:")
print(missing_per_column)

## Checkpoint 4: 📊 Visualize Patterns

In [None]:
# SOLUTION: Histogram of movie ratings
# Draw on an explicit Axes object instead of the implicit "current" figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['vote_average'], bins=20, edgecolor='black', alpha=0.7)
ax.set(
    xlabel='Rating',
    ylabel='Number of Movies',
    title='Distribution of Movie Ratings',
)
plt.show()

In [None]:
# SOLUTION: Scatter plot of budget vs revenue
# One semi-transparent dot per movie so that dense regions remain readable.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['budget'], df['revenue'], alpha=0.5)
ax.set(
    xlabel='Budget ($)',
    ylabel='Revenue ($)',
    title='Budget vs Revenue',
)
plt.show()

In [None]:
# SOLUTION: Bar chart of average rating by genre
fig, ax = plt.subplots(figsize=(12, 6))

# Mean rating per genre, best-rated genre first
genre_means = df.groupby('genre')['vote_average'].mean()
genre_means.sort_values(ascending=False).plot.bar(ax=ax)

ax.set_xlabel('Genre')
ax.set_ylabel('Average Rating')
ax.set_title('Average Rating by Genre')

# Slant the genre labels and re-fit the layout around them
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Average rating per release year, drawn as a line with a small marker per year
fig, ax = plt.subplots(figsize=(12, 6))
yearly_avg = df.groupby('release_year')['vote_average'].mean()
yearly_avg.plot.line(ax=ax, marker='o', markersize=3)
ax.set_xlabel('Year')
ax.set_ylabel('Average Rating')
ax.set_title('Average Movie Rating Over Time')
plt.show()

## Checkpoint 5: 🔗 Find Correlations

In [None]:
# Pairwise correlations between the numeric columns of interest
numeric_cols = ['budget', 'revenue', 'runtime', 'popularity', 'vote_average', 'vote_count']
correlation_matrix = df.loc[:, numeric_cols].corr()

# How strongly each column moves with the rating, strongest positive first
rating_correlations = correlation_matrix['vote_average'].sort_values(ascending=False)
print("Correlation with ratings (vote_average):")
print(rating_correlations)

In [None]:
# SOLUTION: Correlation heatmap
# Diverging colours centred on 0, with each coefficient annotated to 2 decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

## Checkpoint 6: 🤖 Build a Simple Predictor

In [None]:
# Import machine learning tools (all from scikit-learn)
from sklearn.model_selection import train_test_split  # splits X / y into training and testing parts
from sklearn.linear_model import LinearRegression  # the model trained later in this checkpoint
from sklearn.metrics import mean_squared_error, r2_score  # metrics computed in Checkpoint 7

# Confirmation that the imports above succeeded
print("‚úÖ ML libraries imported!")

In [None]:
# Prepare the data: the column to predict and the columns to predict it from
target = 'vote_average'
features = ['budget', 'runtime', 'popularity', 'vote_count']

# Keep only the modelling columns and drop every row where any of them is missing
model_columns = [*features, target]
df_clean = df[model_columns].dropna()

# Feature matrix (X) and target vector (y)
X, y = df_clean[features], df_clean[target]

print(f"üìä Using {len(X)} movies for training")
print(f"üéØ Features: {features}")
print(f"üéØ Target: {target}")

In [None]:
# SOLUTION: Split the data
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"üèãÔ∏è Training set: {len(X_train)} movies")
print(f"üß™ Testing set: {len(X_test)} movies")

In [None]:
# SOLUTION: Train the model
# fit() returns the estimator itself, so creation and training can be chained.
model = LinearRegression().fit(X_train, y_train)

print("‚úÖ Model trained!")

In [None]:
# SOLUTION: Make predictions
# Predict a rating for every movie in the held-out test set.
y_pred = model.predict(X_test)

print("üîÆ Predictions made!")
print(f"\nFirst 5 predictions: {y_pred[:5]}")
# y_test keeps the (shuffled) integer row labels of the original DataFrame, so
# select the first five entries explicitly by position with .iloc to line them
# up with y_pred[:5] rather than relying on how a plain slice is interpreted.
print(f"Actual values:       {y_test.iloc[:5].to_numpy()}")

## Checkpoint 7: 📈 Evaluate Your Model

In [None]:
# Calculate metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred)  # mean of the squared errors
rmse = np.sqrt(mse)                       # back in rating points
r2 = r2_score(y_test, y_pred)             # 1.0 is perfect; can be negative on test data

print("üìä Model Performance:")
print(f"   Root Mean Squared Error: {rmse:.2f}")
print(f"   R¬≤ Score: {r2:.3f}")
print(f"\nüí° Interpretation:")
# RMSE is not the plain average error (that would be the mean absolute error):
# errors are squared before averaging, so large misses weigh more.
print(f"   A typical prediction is off by about {rmse:.2f} rating points (RMSE; large misses count extra)")
if r2 >= 0:
    print(f"   Our model explains {r2*100:.1f}% of the variance in ratings")
else:
    # A negative score cannot be read as a percentage of explained variance.
    print("   The R-squared score is negative: the model does worse than always predicting the average rating")

In [None]:
# SOLUTION: Visualize predictions vs actual
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed diagonal: where every point would sit if predictions were exact
rating_range = [0, 10]
ax.plot(rating_range, rating_range, 'r--', label='Perfect Prediction')

ax.set(
    xlabel='Actual Rating',
    ylabel='Predicted Rating',
    title='Predicted vs Actual Ratings',
)
ax.legend()
plt.show()

In [None]:
# Feature importance
# A raw linear-regression coefficient is "change in predicted rating per ONE
# unit of the feature". The features are in different units (e.g. budget vs
# runtime), so raw magnitudes cannot be compared with each other. Multiplying
# each coefficient by the standard deviation of its feature in the training
# data gives the change in predicted rating for a one-standard-deviation change
# of that feature, which IS comparable across features, so rank by that.
feature_scale = X_train[features].std().to_numpy()
feature_importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_,
    'Effect per std': model.coef_ * feature_scale,
}).sort_values('Effect per std', key=abs, ascending=False)

print("üìä Feature Importance:")
print(feature_importance)

## 🎉 Congratulations!

In [None]:
# Closing banner: a 50-character rule above and below the headline
banner = "=" * 50
print(banner)
print("üéâ CONGRATULATIONS! üéâ")
print(banner)

print("\nYou've completed your first Data Science project!")
print("\nYou now know how to:")
# One line per skill covered by the workshop
for skill_line in (
    "  üìÇ Load and explore datasets",
    "  üìä Create insightful visualizations",
    "  üîó Analyze correlations",
    "  ü§ñ Build and evaluate ML models",
):
    print(skill_line)
print("\nWelcome to the world of Data Science & AI! üöÄ")