<div style="background-color: #333; padding: 40px; border: 2px solid #ffd700; border-radius: 10px; color: #ffd700; text-align: center; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">

<h1 style="font-size: 48px; font-weight: bold; color: #ffd700;">🌟 Most Popular Movies 🎬</h1>

<img src="https://media.tenor.com/eAlVU8Jd7GQAAAAd/memories-of-murder-bong-joon-ho.gif" alt="Movie Reel" style="width: 500px; margin: 20px auto; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">
    
</div>

<div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 180%; text-align: center; color: #ffd700; font-weight: bold;">🎬 Table of Contents 🍿
</div>

<ul class="list-group" id="list-tab" role="tablist">
    <li><a href="#1.-Import-Libraries">1. Import Libraries</a></li><br>
    <li><a href="#2.-Load-data">2. Load data</a></li><br>
    <li><a href="#3.-Exploratory-Data-Analysis">3. Exploratory Data Analysis</a></li><br>
    <li><a href="#4.-Predictive-Analysis">4. Predictive Analysis</a></li><br>
</ul>

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">1. Import Libraries</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

warnings.filterwarnings('ignore')

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">2. Load data</div>

In [None]:
# Location of the TMDB "popular movies" CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/top-10000-most-popular-movies-from-imdb/popular_10000_movies_tmdb.csv"

# Read the raw dataset and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print column names, dtypes and non-null counts to spot missing data and wrongly typed columns.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns pandas describes by default.
df.describe()

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">3. Exploratory Data Analysis</div>

## <div style="border-radius: 10px; border: 2px solid #333; padding: 15px; background-color: #ffd700; font-size: 120%; text-align: left; color: #333; font-weight: bold;">3.1 Data quality</div>

### I | Check duplicates

In [None]:
# Flag every row that is an exact repeat of an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(duplicates)

### II | Check null and missing values

In [None]:
# Number of missing entries per column.
missing_values = df.isnull().sum()
total_missing_values = missing_values.sum()

# `df.size` is rows * columns; it replaces `np.product(df.shape)`, which is
# deprecated and no longer available in NumPy 2.0.
total_cells = df.size
percent_missing_values = (total_missing_values / total_cells) * 100
print("Percent of data that is missing", percent_missing_values)
print(missing_values)

### III | Check unique values in each column

In [None]:
# Report how many distinct values each column holds.
# dropna=False keeps NaN counted as its own value, exactly like len(Series.unique()).
for column in df.columns:
    num_distinct_values = df[column].nunique(dropna=False)
    print(f"{column}: {num_distinct_values} distinct values")

### IV | Correlation Analysis

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops text columns and raises on them instead.
df.corr(numeric_only=True)

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
fig, ax = plt.subplots()
fig.set_size_inches(15, 10)

# numeric_only=True: pandas >= 2.0 raises if text columns are passed to corr().
# The target axes are passed explicitly instead of relying on the "current axes" state.
sns.heatmap(df.corr(numeric_only=True), vmax=.8, square=True, annot=True, cmap='YlGn', ax=ax)
ax.set_title('Correlation Matrix', fontsize=15);

## <div style="border-radius: 10px; border: 2px solid #333; padding: 15px; background-color: #ffd700; font-size: 120%; text-align: left; color: #333; font-weight: bold;">3.2 Univariate Analysis</div>

### V | Budget and Revenue Analysis

In [None]:
# Parse the release date and derive the release year used as the shared x-axis.
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_year'] = df['release_date'].dt.year

# Two stacked panels sharing the x-axis: budget on top, revenue below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8), sharex=True)

# Top panel: one bar per movie, positioned at its release year.
ax1.bar(df['release_year'], df['budget'], color='blue')
ax1.set(ylabel='Budget', title='Comparaison Budget vs Revenus')

# Bottom panel: same layout for revenue.
ax2.bar(df['release_year'], df['revenue'], color='red')
ax2.set(ylabel="Revenue", xlabel="Release date")

fig.tight_layout()
plt.show()

### VI | Popularity Distribution and Original Language Analysis

In [None]:
# 2-D histogram: popularity on the x-axis, one row per original language.
plt.figure(figsize=(16, 8))
sns.histplot(x="popularity", y="original_language", data=df)
plt.title("Popularity Distribution by Original Language")
plt.show()

### VII | Frequency of Movie Genres

In [None]:
def add_comma_before_uppercase(word):
    """Insert a comma before every uppercase letter except the first character.

    Used to break run-together genre names apart, e.g. ``"ScienceFiction"``
    becomes ``"Science,Fiction"``.

    Parameters
    ----------
    word : str
        The text to process. May be empty.

    Returns
    -------
    str
        ``word`` with ``','`` inserted before each uppercase character at
        index >= 1. An empty input yields an empty string (the previous
        implementation raised ``IndexError`` on ``word[0]``).
    """
    # word[:1] is the first character, or "" for empty input; it is never prefixed.
    head, tail = word[:1], word[1:]
    # Build the remainder in one join instead of repeated string concatenation.
    return head + ''.join(',' + char if char.isupper() else char for char in tail)

def extract_genres(df):
    """Collect every genre word found in the ``genres`` column.

    Each entry of ``df['genres']`` is scanned for runs of word characters
    (``\\w+``); each run is then split again before any uppercase letter after
    its first character, so ``"ScienceFiction"`` contributes ``"Science"`` and
    ``"Fiction"``.

    Parameters
    ----------
    df : mapping with a ``'genres'`` key (typically a pandas DataFrame)
        Entries may be strings, or lists/tuples/sets of strings. Anything
        else (e.g. NaN or None for a missing value) is skipped; previously
        such entries reached ``re.findall`` and raised ``TypeError``.

    Returns
    -------
    list of str
        All genre words in encounter order, with repeats, ready to be fed
        to ``collections.Counter``.
    """
    result = []

    for genres in df['genres']:
        if isinstance(genres, str):
            entries = [genres]
        elif isinstance(genres, (list, tuple, set)):
            # Already a collection of genres: keep only its string members.
            entries = [item for item in genres if isinstance(item, str)]
        else:
            # Missing value (NaN / None) or unsupported type: nothing to extract.
            continue

        for entry in entries:
            # \w+ never matches commas, so scanning the whole entry is
            # equivalent to splitting on ',' first and scanning each piece.
            for word in re.findall(r'\w+', entry):
                with_commas = add_comma_before_uppercase(word)
                # The inserted commas mark the split points between sub-words.
                result.extend(with_commas.replace(',', ' ').split())

    return result

In [None]:
# Tally how often each genre word occurs across the whole dataset.
genres_counter = Counter(extract_genres(df))

# most_common() returns (genre, count) pairs ordered by decreasing count,
# keeping first-seen order among ties; unzip them into two parallel tuples.
genres, frequences = zip(*genres_counter.most_common())

plt.figure(figsize=(10, 6))
plt.bar(genres, frequences)
plt.title('Frequency of Movie Genres')
plt.xlabel('Genres')
plt.ylabel('Frequences')
plt.xticks(rotation=45)  # Tilt the genre labels so they do not overlap
plt.show()

We can see that the most frequent movie genres are Drama, Comedy, and Action.

### VIII | Budget and Runtime Analysis

In [None]:
# Scatter plot of budget (y-axis) against runtime (x-axis).
plt.figure(figsize=(10, 6))
plt.scatter(df["runtime"], df["budget"])
# The labels were previously swapped: runtime is the x data, budget the y data.
plt.xlabel("Runtime")
plt.ylabel("Budget")
plt.title("Budget vs Duration of Movies")
plt.show()

### IX | Distribution of Movie Runtimes

In [None]:
# Histogram of movie runtimes; the column is divided by 60 to express it in hours.
plt.figure(figsize=(10, 6))
plt.hist(df["runtime"] / 60, bins=30, color="red", edgecolor='black')
plt.xlabel("Runtime(hours)")
plt.ylabel("Frequency")
# The title used to be a copy of the previous scatter plot's title
# ("Budget vs Duration of Movies"), which does not describe this figure.
plt.title("Distribution of Movie Runtimes")
plt.show()

### X | Top 10 Highest-Grossing Movies

In [None]:
# Order the movies from highest to lowest revenue and keep the first ten.
df_sorted = df.sort_values('revenue', ascending=False)
top_10_movies = df_sorted.iloc[:10]

plt.figure(figsize=(12, 6))
plt.bar(x=top_10_movies["title"], height=top_10_movies['revenue'], color="purple")
plt.xlabel('Movie')
plt.ylabel('Gross Revenue (in billions)')
plt.title('Top 10 Highest-Grossing Movies')
plt.xticks(rotation=45)  # Tilt the movie titles so they stay readable
plt.tight_layout()
plt.show()

### XI | Top 10 Movies with the Highest Vote Count

In [None]:
# Order the movies from most to fewest votes and keep the first ten.
df_sorted_vote = df.sort_values('vote_count', ascending=False)
top_10_voted_movies = df_sorted_vote.iloc[:10]

plt.figure(figsize=(12, 6))
plt.bar(x=top_10_voted_movies["title"], height=top_10_voted_movies['vote_count'], color="green")
plt.xlabel('Movies')
plt.ylabel('Number of vote')
plt.title('Top 10 movies with the highest vote')
plt.xticks(rotation=45)  # Tilt the movie titles so they stay readable
plt.tight_layout()
plt.show()

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">4. Predictive Analysis</div>

In [None]:
# Features: everything except the target itself and the columns not used by the model.
X = df.drop(columns=["revenue", "tagline", "id", "overview", "release_date"])

# Target: revenue expressed in millions, truncated to an integer.
y = df['revenue'].div(1000000).astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: (rows, columns) of the training and test feature sets.
print(X_train.shape,X_test.shape)

In [None]:
# Columns fed to the models, grouped by the preprocessing they need.
numeric_features = ['vote_average', 'vote_count', 'popularity', 'budget', 'runtime']
# NOTE(review): 'title' is presumably close to unique per movie, so one-hot
# encoding it adds roughly one column per row with little signal - confirm
# against the distinct-value counts printed earlier before keeping it.
categorical_features = ['title', 'genres', 'original_language', 'production_companies']
target = 'revenue'

# Create dictionaries to store RMSE and MAE results, keyed by model name
rmse_results = {}
mae_results = {}

# Numeric columns: fill missing values with the column median, then standardise.
# Without imputation, LinearRegression (and RandomForest on older scikit-learn
# releases) fails as soon as a numeric feature contains NaN.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: replace missing values with an explicit 'missing' label,
# then one-hot encode. Categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Create a ColumnTransformer to apply transformations to the respective feature groups.
# Columns of X that appear in neither list are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Candidate regressors. The stochastic ones are seeded (same seed as the
# train/test split) so that Restart & Run All reproduces the reported metrics.
models = [
    (XGBRegressor(random_state=42), "XGBoost"),
    (RandomForestRegressor(random_state=42), "RF"),
    (LinearRegression(), "LR"),
]

for model, model_name in models:
    # Full pipeline: preprocessing followed by the regressor under test.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)])

    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Errors are in the unit of y (revenue in millions).
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    rmse_results[model_name] = rmse
    mae_results[model_name] = mae

    print("Model :",model_name)
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print("--------")

In [None]:
# Compare the models on both error metrics, one panel per metric.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))

# (axes, results dict, panel title, y-axis label, bar colour) for each panel.
metric_panels = [
    (axes[0], rmse_results, 'Root Mean Squared Error (RMSE)', 'RMSE Value', 'skyblue'),
    (axes[1], mae_results, 'Mean Absolute Error (MAE)', 'MAE Value', 'lightcoral'),
]

for panel_ax, panel_results, panel_title, panel_ylabel, panel_color in metric_panels:
    # One bar per model name.
    panel_ax.bar(panel_results.keys(), panel_results.values(), color=panel_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()