# MOVIE RATINGS,PROGRESS AND MORE

##### Imports

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

##### Data creation

In [2]:

def create_sample_movie_data():
    np.random.seed(42)
    n = 1000
    
    data = {
        'title': [f'Movie_{i}' for i in range(n)],
        'year': np.random.randint(1990, 2023, n),
        'genre': np.random.choice(['Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi', 'Romance'], n),
        'rating': np.round(np.random.normal(6.5, 1.5, n), 1),
        'budget': np.random.randint(1, 200, n) * 1000000,
        'revenue': np.random.randint(1, 500, n) * 1000000,
        'runtime': np.random.randint(80, 180, n),
        'production_company': np.random.choice(['Warner Bros', 'Disney', 'Universal', 'Paramount', 'Sony'], n),
        'votes': np.random.randint(1000, 1000000, n)
    }
    
    # Ensure revenue is generally higher than budget
    data['revenue'] = np.maximum(data['revenue'], data['budget'] * np.random.uniform(0.5, 5, n))
    
    # Create some correlation between rating and revenue
    data['rating'] = np.clip(data['rating'] + (data['revenue'] / 100000000) * 0.1, 1, 10)
    
    return pd.DataFrame(data)

# Load data
df = create_sample_movie_data()

# Basic exploration
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nBasic Statistics:")
print(df.describe())

Dataset Shape: (1000, 9)

First 5 rows:
     title  year   genre    rating     budget       revenue  runtime  \
0  Movie_0  2018  Sci-Fi  4.146000  126000000  4.460000e+08      106   
1  Movie_1  2004  Horror  7.798164  123000000  4.981639e+08      133   
2  Movie_2  1997   Drama  4.945000   47000000  1.450000e+08      145   
3  Movie_3  2010   Drama  3.698416  118000000  3.984162e+08      128   
4  Movie_4  2008  Action  7.347000   39000000  2.470000e+08       95   

  production_company   votes  
0        Warner Bros  213204  
1        Warner Bros  251694  
2               Sony  250836  
3               Sony  759849  
4          Paramount  886829  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               1000 non-null   object 
 1   year                1000 non-null   int32  
 2   genre               

#### Data cleaning

In [3]:
# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Create additional useful columns
df['profit'] = df['revenue'] - df['budget']
df['profit_margin'] = (df['profit'] / df['revenue']) * 100
df['roi'] = (df['profit'] / df['budget']) * 100

# Convert to millions for easier reading
df['budget_millions'] = df['budget'] / 1000000
df['revenue_millions'] = df['revenue'] / 1000000
df['profit_millions'] = df['profit'] / 1000000

print("\nData after feature engineering:")
print(df[['budget_millions', 'revenue_millions', 'profit_millions', 'profit_margin', 'roi']].describe())

Missing values:
title                 0
year                  0
genre                 0
rating                0
budget                0
revenue               0
runtime               0
production_company    0
votes                 0
dtype: int64

Data after feature engineering:
       budget_millions  revenue_millions  profit_millions  profit_margin  \
count      1000.000000       1000.000000      1000.000000    1000.000000   
mean        101.624000        372.977561       271.353561      69.630465   
std          56.284941        172.735231       149.152577      20.414774   
min           1.000000         20.548868       -81.000000     -83.505155   
25%          56.000000        247.092357       160.986545      62.512959   
50%         101.500000        368.203875       269.248400      74.451890   
75%         150.250000        469.505413       363.000000      80.090145   
max         199.000000        937.041648       748.041648      99.776286   

                roi  
count   1000.00