In [1]:
import pandas as pd

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('/content/movies_cleaned_final.csv')

# Display the first 5 rows of the DataFrame
display(df.head())

Unnamed: 0,Title,Rating,Runtime,Directors,Writers,Stars,Genres,Popularity,Budget,Gross Worldwide,URL
0,Închisoarea îngerilor,9.3,2h 22m,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","Epic, Period Drama, Prison Drama, Drama",55.0,25000000.0,29334030.0,https://www.imdb.com/title/tt0111161/
1,Cavalerul negru,9.1,2h 32m,Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Goyer","Christian Bale, Heath Ledger, Aaron Eckhart, M...","Action Epic, Epic, Psychological Drama, Psycho...",92.0,185000000.0,1009243000.0,https://www.imdb.com/title/tt0468569/
2,Începutul,8.8,2h 28m,Christopher Nolan,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...","Action Epic, Adventure Epic, Epic, Psychologic...",84.0,160000000.0,839786500.0,https://www.imdb.com/title/tt1375666/
3,Fight Club - Sala de lupte,8.8,2h 19m,David Fincher,"Chuck Palahniuk, Jim Uhls","Edward Norton, Brad Pitt, Meat Loaf, Zach Gren...","Dark Comedy, Psychological Drama, Psychologica...",148.0,63000000.0,101321000.0,https://www.imdb.com/title/tt0137523/
4,Pulp Fiction,8.8,2h 34m,Quentin Tarantino,"Quentin Tarantino, Roger Avary","Tim Roth, Amanda Plummer, Laura Lovelace, John...","Dark Comedy, Drug Crime, Gangster, Crime, Drama",141.0,8000000.0,213928800.0,https://www.imdb.com/title/tt0110912/


In [5]:
import re

def convert_runtime_to_minutes(runtime_str):
    if pd.isna(runtime_str): # Handle NaN values
        return None

    # Initialize hours and minutes
    total_minutes = 0

    # Extract hours (e.g., '2h')
    hours_match = re.search(r'(\d+)h', runtime_str)
    if hours_match:
        total_minutes += int(hours_match.group(1)) * 60

    # Extract minutes (e.g., '22m')
    minutes_match = re.search(r'(\d+)m', runtime_str)
    if minutes_match:
        total_minutes += int(minutes_match.group(1))

    return total_minutes

# Apply the conversion function to create the new column
df['Runtime (minutes)'] = df['Runtime'].apply(convert_runtime_to_minutes)

# Display the first 5 rows with the new column
display(df[['Title', 'Runtime', 'Runtime (minutes)']].sample(n=5))

Unnamed: 0,Title,Runtime,Runtime (minutes)
3008,Yeopgijeogin geunyeo,2h 17m,137
6262,Foul Play,1h 56m,116
3123,Scanners,1h 43m,103
361,Scott Pilgrim împotriva tuturor,1h 52m,112
4459,Ultimul radio show,1h 45m,105


In [7]:
df['ROI'] = df['Gross Worldwide'] / df['Budget']

# Display a sample of the DataFrame with the new 'ROI' column
display(df[['Title', 'Budget', 'Gross Worldwide', 'ROI']].sample(n=5))

Unnamed: 0,Title,Budget,Gross Worldwide,ROI
3321,Tae Guk Gi - Frăția războiului,12800000.0,81407286.0,6.359944
6602,Munje!,20000000.0,1833.0,9.2e-05
8721,Soul Survivors,17000000.0,4299141.0,0.252891
122,O fată de milioane,30000000.0,216763646.0,7.225455
3366,Jucaria 2,13000000.0,35763605.0,2.751047


In [22]:
# Initialize an empty list to store all individual directors
all_directors = []

# Iterate through the 'Directors' column
for directors_list in df['Directors']:
    if pd.isna(directors_list):
        continue
    # Split the string by comma and strip whitespace from each name
    directors = [d.strip() for d in directors_list.split(',')]
    all_directors.extend(directors)

# Create a pandas Series from the list of all directors and count their frequencies
director_frequency = pd.Series(all_directors).value_counts().reset_index()
director_frequency.columns = ['Director', 'Frequency']

# Display the new DataFrame with director frequencies
display(director_frequency)
display(director_frequency["Director"].size)
# director_frequency.to_csv('director_frequency.csv', index=False)


Unnamed: 0,Director,Frequency
0,Woody Allen,47
1,Alfred Hitchcock,38
2,Clint Eastwood,34
3,Steven Spielberg,29
4,Martin Scorsese,24
...,...,...
4234,Alexandre de La Patellière,1
4235,Stéphane Foenkinos,1
4236,David Foenkinos,1
4237,Andrew Sipes,1


4239

In [24]:
director_freq_dict = director_frequency.set_index('Director')['Frequency'].to_dict()

def calculate_directors_freq(directors_str):
    if pd.isna(directors_str):
        return None

    individual_directors = [d.strip() for d in directors_str.split(',')]
    total_frequency = 0

    for director in individual_directors:
        total_frequency += director_freq_dict.get(director, 0) # Get frequency, default to 0 if not found

    # Calculate the average frequency for this movie
    return total_frequency / director_frequency["Director"].size

# Apply the function to create the new 'directors_freq' column
df['directors_freq'] = df['Directors'].apply(calculate_directors_freq)

# Display a sample of the DataFrame with the new column
display(df[['Title', 'Directors', 'directors_freq']].sample(n=5))

Unnamed: 0,Title,Directors,directors_freq
4895,Mecanismul,David Mamet,0.001887
9656,Contracted,Eric England,0.000236
1256,Scream 4: Coșmarul continuă,Wes Craven,0.00401
675,Kung Fu Panda 2,Jennifer Yuh Nelson,0.000472
6857,Shanghai Kiss,"Kern Konwiser, David Ren",0.000472


In [30]:
# Initialize an empty list to store all individual writers
all_writers = []

# Iterate through the 'Writers' column
for writers_list in df['Writers']:
    if pd.isna(writers_list):
        continue
    # Split the string by comma and strip whitespace from each name
    writers = [w.strip() for w in writers_list.split(',')]
    all_writers.extend(writers)

# Create a pandas Series from the list of all writers and count their frequencies
writer_frequency = pd.Series(all_writers).value_counts().reset_index()
writer_frequency.columns = ['Writer', 'Frequency']

# Display the new DataFrame with writer frequencies
display(writer_frequency)
display(writer_frequency["Writer"].size)
writer_frequency.to_csv('writer_frequency.csv', index=False)

Unnamed: 0,Writer,Frequency
0,Woody Allen,49
1,Luc Besson,40
2,Stephen King,40
3,William Shakespeare,31
4,John Hughes,29
...,...,...
10547,Manu Rishi Chadha,1
10548,Karl Vollmöller,1
10549,Carl Zuckmayer,1
10550,Heinrich Mann,1


10552

In [29]:
writer_freq_dict = writer_frequency.set_index('Writer')['Frequency'].to_dict()

def calculate_writers_freq(writers_str):
    if pd.isna(writers_str):
        return None

    individual_writers = [w.strip() for w in writers_str.split(',')]
    total_frequency = 0

    for writer in individual_writers:
        total_frequency += writer_freq_dict.get(writer, 0) # Get frequency, default to 0 if not found

    # Calculate the average frequency for this movie
    return total_frequency / writer_frequency["Writer"].size

# Apply the function to create the new 'writers_freq' column
df['writers_freq'] = df['Writers'].apply(calculate_writers_freq)

# Display a sample of the DataFrame with the new column
display(df[['Title', 'Writers', 'writers_freq']].sample(n=5))

Unnamed: 0,Title,Writers,writers_freq
3449,The Hunter,"Alice Addison, Wain Fimeri, Daniel Nettheim",0.000284
753,Marele maestru Ip Man,"Edmond Wong, Tai-Lee Chan",0.000569
6518,Howling II: Stirba - Werewolf Bitch,"Gary Brandner, Robert Sarno",0.000284
7643,Those Magnificent Men in Their Flying Machines...,"Jack Davies, Ken Annakin",0.00019
7086,Ivan cel Groaznic,Sergei Eisenstein,0.000474


In [34]:
# Initialize an empty list to store all individual stars
all_stars = []

# Iterate through the 'Stars' column
for stars_list in df['Stars']:
    if pd.isna(stars_list):
        continue
    # Split the string by comma and strip whitespace from each name
    stars = [s.strip() for s in stars_list.split(',')]
    all_stars.extend(stars)

# Create a pandas Series from the list of all stars and count their frequencies
star_frequency = pd.Series(all_stars).value_counts().reset_index()
star_frequency.columns = ['Star', 'Frequency']

# Display the new DataFrame with star frequencies
display(star_frequency)
display(star_frequency["Star"].size)
star_frequency.to_csv('star_frequency.csv', index=False)

Unnamed: 0,Star,Frequency
0,Robert De Niro,77
1,Samuel L. Jackson,69
2,Bruce Willis,62
3,Morgan Freeman,58
4,Nicolas Cage,57
...,...,...
43947,Toralv Maurstad,1
43948,Rolf Just Nilsen,1
43949,Helge Reiss,1
43950,Frank Robert,1


43952



In [35]:
star_freq_dict = star_frequency.set_index('Star')['Frequency'].to_dict()

def calculate_stars_freq(stars_str):
    if pd.isna(stars_str):
        return None

    individual_stars = [s.strip() for s in stars_str.split(',')]
    total_frequency = 0

    for star in individual_stars:
        total_frequency += star_freq_dict.get(star, 0) # Get frequency, default to 0 if not found

    # Calculate the average frequency for this movie
    return total_frequency / star_frequency["Star"].size

# Apply the function to create the new 'stars_freq' column
df['stars_freq'] = df['Stars'].apply(calculate_stars_freq)

# Display a sample of the DataFrame with the new column
display(df[['Title', 'Stars', 'stars_freq']].sample(n=5))

Unnamed: 0,Title,Stars,stars_freq
7668,Stan și Bran: Departe spre vest,"Stan Laurel, Oliver Hardy, Sharon Lynn, James ...",0.000364
162,În sălbăticie,"Emile Hirsch, Marcia Gay Harden, William Hurt,...",0.004346
6562,Staten Island,"Ethan Hawke, Vincent D'Onofrio, Seymour Cassel...",0.002161
3796,Doc Hollywood,"Michael J. Fox, Julie Warner, Barnard Hughes, ...",0.002776
9638,Fata din vale,"Nicolas Cage, Deborah Foreman, Elizabeth Daily...",0.002139


In [38]:
# Initialize an empty list to store all individual genres
all_genres = []

# Iterate through the 'Genres' column
for genres_list in df['Genres']:
    if pd.isna(genres_list):
        continue
    # Split the string by comma and strip whitespace from each name
    genres = [g.strip() for g in genres_list.split(',')]
    all_genres.extend(genres)

# Create a pandas Series from the list of all genres and count their frequencies
genre_frequency = pd.Series(all_genres).value_counts().reset_index()
genre_frequency.columns = ['Genre', 'Frequency']

# Display the new DataFrame with genre frequencies
display(genre_frequency)
display(genre_frequency["Genre"].size)
genre_frequency.to_csv('genre_frequency.csv', index=False)

Unnamed: 0,Genre,Frequency
0,Drama,5792
1,Comedy,3773
2,Thriller,2915
3,Romance,2461
4,Action,2133
...,...,...
198,Stand-Up,1
199,Documentary,1
200,Music Documentary,1
201,Punjabi,1


203

In [40]:
genre_freq_dict = genre_frequency.set_index('Genre')['Frequency'].to_dict()

def calculate_genres_freq(genres_str):
    if pd.isna(genres_str):
        return None

    individual_genres = [g.strip() for g in genres_str.split(',')]
    total_frequency = 0

    for genre in individual_genres:
        total_frequency += genre_freq_dict.get(genre, 0) # Get frequency, default to 0 if not found

    # Calculate the average frequency for this movie
    return total_frequency / genre_frequency["Genre"].size

# Apply the function to create the new 'genres_freq' column
df['genres_freq'] = df['Genres'].apply(calculate_genres_freq)

# Display a sample of the DataFrame with the new column
display(df[['Title', 'Genres', 'genres_freq']].sample(n=5))

Unnamed: 0,Title,Genres,genres_freq
8952,Branded,"Conspiracy Thriller, Drama, Fantasy, Sci-Fi, T...",52.975369
5106,My Blue Heaven,"Buddy Comedy, Comedy, Crime",29.970443
6550,Arabesque,"Screwball Comedy, Spy, Action, Adventure, Come...",108.699507
9745,Tarzan,"Jungle Adventure, Adventure, Animation, Drama,...",41.807882
4764,Tillsammans,"Swedish, Comedy, Drama, Romance",59.502463


In [41]:
display(df.head())

Unnamed: 0,Title,Rating,Runtime,Directors,Writers,Stars,Genres,Popularity,Budget,Gross Worldwide,URL,Runtime (minutes),ROI,directors_freq,writers_freq,stars_freq,genres_freq
0,Închisoarea îngerilor,9.3,2h 22m,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","Epic, Period Drama, Prison Drama, Drama",55.0,25000000.0,29334030.0,https://www.imdb.com/title/tt0111161/,142,1.173361,0.000944,0.004265,0.004004,33.359606
1,Cavalerul negru,9.1,2h 32m,Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Goyer","Christian Bale, Heath Ledger, Aaron Eckhart, M...","Action Epic, Epic, Psychological Drama, Psycho...",92.0,185000000.0,1009243000.0,https://www.imdb.com/title/tt0468569/,152,5.455367,0.002123,0.002559,0.005825,73.546798
2,Începutul,8.8,2h 28m,Christopher Nolan,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...","Action Epic, Adventure Epic, Epic, Psychologic...",84.0,160000000.0,839786500.0,https://www.imdb.com/title/tt1375666/,148,5.248665,0.002123,0.000853,0.003618,41.704433
3,Fight Club - Sala de lupte,8.8,2h 19m,David Fincher,"Chuck Palahniuk, Jim Uhls","Edward Norton, Brad Pitt, Meat Loaf, Zach Gren...","Dark Comedy, Psychological Drama, Psychologica...",148.0,63000000.0,101321000.0,https://www.imdb.com/title/tt0137523/,139,1.60827,0.002359,0.000379,0.002457,63.610837
4,Pulp Fiction,8.8,2h 34m,Quentin Tarantino,"Quentin Tarantino, Roger Avary","Tim Roth, Amanda Plummer, Laura Lovelace, John...","Dark Comedy, Drug Crime, Gangster, Crime, Drama",141.0,8000000.0,213928800.0,https://www.imdb.com/title/tt0110912/,154,26.741095,0.002595,0.001801,0.005734,45.305419


In [63]:
new_df = df.drop(columns=["Title","Runtime","Directors","Writers","Stars","Genres","Budget","Gross Worldwide","URL"])

In [66]:
import numpy as np

new_df.to_csv('freq_encoding_final.csv', index=False)



new_df["ROI"] = np.log1p(new_df["ROI"])
display(new_df.head())

Unnamed: 0,Rating,Popularity,Runtime (minutes),ROI,directors_freq,writers_freq,stars_freq,genres_freq
0,9.3,55.0,142,0.776275,0.000944,0.004265,0.004004,33.359606
1,9.1,92.0,152,1.864912,0.002123,0.002559,0.005825,73.546798
2,8.8,84.0,148,1.832368,0.002123,0.000853,0.003618,41.704433
3,8.8,148.0,139,0.958687,0.002359,0.000379,0.002457,63.610837
4,8.8,141.0,154,3.322915,0.002595,0.001801,0.005734,45.305419


# Task
Build a neural network model to predict 'ROI' using the features in the `new_df` DataFrame.

## Prepare Data for Neural Network

### Subtask:
Separate features (X) and target (y), handle any missing values, scale the features, and split the data into training and testing sets.


**Reasoning**:
The first step is to separate the target variable 'ROI' into `y` and the remaining columns of `new_df` into `X`. Then, I will handle any missing values in `X` and `y` by filling them with the mean of their respective columns.



In [67]:
y = new_df['ROI']
X = new_df.drop('ROI', axis=1)

# Identify and fill missing values in X
for col in X.columns:
    if X[col].isnull().sum() > 0:
        X[col] = X[col].fillna(X[col].mean())

# Identify and fill missing values in y
if y.isnull().sum() > 0:
    y = y.fillna(y.mean())

print("Missing values in X after handling:\n", X.isnull().sum().sum())
print("Missing values in y after handling:\n", y.isnull().sum())

Missing values in X after handling:
 0
Missing values in y after handling:
 0


**Reasoning**:
I will now scale the features in `X` using `StandardScaler` and then split the scaled features `X_scaled` and the target `y` into training and testing sets using `train_test_split`.



In [68]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (7798, 7)
Shape of X_test: (1950, 7)
Shape of y_train: (7798,)
Shape of y_test: (1950,)


## Define and Compile the Neural Network Model

### Subtask:
Create a TensorFlow/Keras sequential model with appropriate layers, activation functions, and compile it with an optimizer and loss function suitable for regression.


**Reasoning**:
I will import the necessary Keras modules, define a sequential neural network model with input, hidden, and output layers, and then compile it with an appropriate optimizer, loss function, and metrics for regression.



In [93]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 1. Initialize a Sequential model
model = Sequential()

# 2. Add the input layer
# input_dim should be the number of features in X_train
model.add(Dense(128, input_dim=X_train.shape[1], activation='tanh'))

# 3. Add one or more hidden layers
model.add(Dense(32, activation='tanh'))
# model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='tanh'))
model.add(Dense(8, activation='tanh'))

# 4. Add the output layer for regression (1 unit, no activation)
model.add(Dense(1))

# 5. Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Display the model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [94]:
from sklearn.metrics import r2_score

history = model.fit(
    X_train,
    y_train,
    epochs=100,  # You can adjust the number of epochs
    batch_size=128, # You can adjust the batch size
    validation_split=0.2, # Use a portion of training data for validation
    verbose=1
)

# Evaluate the model on the test data
loss, mae = model.evaluate(X_test, y_test, verbose=0)

# Get predictions on the test set
y_pred = model.predict(X_test)

# Calculate R2 score
r2 = r2_score(y_test, y_pred)

print(f"Test Loss (Mean Squared Error): {loss:.4f}")
print(f"Test Mean Absolute Error: {mae:.4f}")
print(f"Test R2 Score: {r2:.4f}")

Epoch 1/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 0.9989 - mean_absolute_error: 0.6741 - val_loss: 0.5559 - val_mean_absolute_error: 0.5622
Epoch 2/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6271 - mean_absolute_error: 0.5656 - val_loss: 0.5572 - val_mean_absolute_error: 0.5651
Epoch 3/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6255 - mean_absolute_error: 0.5592 - val_loss: 0.5179 - val_mean_absolute_error: 0.5442
Epoch 4/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.5657 - mean_absolute_error: 0.5402 - val_loss: 0.5244 - val_mean_absolute_error: 0.5570
Epoch 5/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.5852 - mean_absolute_error: 0.5482 - val_loss: 0.5010 - val_mean_absolute_error: 0.5400
Epoch 6/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 