In [None]:
import sys
# Put the parent directory first on the module search path, presumably so that the
# `utils.preprocess_util` import below can be resolved (confirm the package location).
# The path is relative, so it is interpreted against the kernel's working directory.
sys.path.insert(0, '../')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import utils.preprocess_util as preproc


In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [None]:
# Load the raw IMDb movie metadata from CSV and preview its first rows
movie_metadata_csv = '../../resources/movie_metadata.csv'
imdb_df = pd.read_csv(movie_metadata_csv)
imdb_df.head()

In [None]:
# Remove columns that are not useful, then discard rows that have no title_year
drop_columns = ['movie_imdb_link', 'aspect_ratio', 'plot_keywords']
imdb_df_filtered = (
    imdb_df
    .drop(columns=drop_columns)
    .dropna(subset='title_year')
)


In [None]:
# Copy of the filtered frame with its columns arranged in sorted (lexicographic) order
sorted_column_names = sorted(imdb_df_filtered.columns)
imdb_X_train_filtered = imdb_df_filtered[sorted_column_names]

In [None]:
# Keep only US titles released after 1994 whose gross and budget both exceed $1,000.
# Every condition is evaluated on imdb_df_filtered itself so the boolean mask carries
# exactly the frame's index. Taking the year condition from imdb_df instead would only
# work through implicit index alignment, because imdb_df still contains the rows that
# were dropped for lacking a title_year.
# NOTE: this cell overwrites imdb_df_filtered and removes 'country', so re-running it
# without first re-running the cell that builds imdb_df_filtered raises a KeyError.
over_min_gross = imdb_df_filtered['gross'] > 1_000
over_min_budget = imdb_df_filtered['budget'] > 1_000
made_in_usa = imdb_df_filtered['country'] == 'USA'
released_after_1994 = imdb_df_filtered['title_year'] > 1994

imdb_df_filtered = (
    imdb_df_filtered[over_min_gross & over_min_budget & made_in_usa & released_after_1994]
    .drop(columns='country')      # constant after the filter, so no longer informative
    .reset_index(drop=True)       # renumber the surviving rows 0..n-1
)
imdb_df_filtered

In [None]:
# Drop every row that still has a missing value in any column.
# Reassignment is used instead of inplace=True: it yields the same rows, keeps the step
# chainable, and does not mutate the object in place.
imdb_df_filtered = imdb_df_filtered.dropna()

In [None]:
# Separate the prediction target (imdb_score) from the feature columns
target_column = 'imdb_score'
y = imdb_df_filtered[target_column]
X = imdb_df_filtered.drop(columns=target_column)

In [None]:
# Split features and target into train/test parts; the fixed seed makes the shuffle
# reproducible from run to run
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=split_seed)

In [None]:
# Preview the training features before any of the preprocessing steps below run
X_train

In [None]:
# Column dtypes and non-null counts of the unprocessed training features
X_train.info()

In [None]:
# Run the director-frequency preprocessing helper on the training features.
# NOTE(review): the helper lives in utils.preprocess_util and its effect is not visible
# here; presumably it adds or replaces a director-frequency feature. Confirm there.
# It is a leading-underscore (private by convention) function called from outside its module.
# X_train is overwritten with the result, so re-running this cell feeds the already
# processed frame back into the helper.
X_train = preproc._director_frequency(X_train)
X_train


In [None]:
# Re-check dtypes and non-null counts after the director step
X_train.info()

In [None]:
# Run the genre preprocessing helper on the training features.
# NOTE(review): what the helper produces is not visible here (presumably derived genre
# columns); confirm in utils.preprocess_util.
# X_train is overwritten with the result, so the cell is not safe to run twice in a row
# unless the helper tolerates already-processed input.
X_train = preproc._process_genres(X_train)
X_train

In [None]:
# Re-check dtypes and non-null counts after the genre step
X_train.info()

In [None]:
# Bucket the content ratings. Unlike the earlier steps the result is stored under its
# own name rather than written back to X_train.
# The encoding cell further down reads a "rating_bin" column from this result.
# NOTE(review): whether the helper also modifies X_train in place cannot be told from
# here; confirm in utils.preprocess_util.
content_rating_replaced_df = preproc._bucket_contentRatings(X_train)
content_rating_replaced_df

In [None]:
# Dtypes and non-null counts of the frame returned by the content-rating bucketing
content_rating_replaced_df.info()

In [None]:
# X_train summary for comparison with the bucketed frame (X_train itself was not
# reassigned by the bucketing call)
X_train.info()

In [None]:
# One-hot encode the bucketed content rating and wrap the dense result in a DataFrame
# whose columns carry the encoder's generated feature names
encoder = OneHotEncoder(sparse_output=False)
rating_bin_frame = content_rating_replaced_df[["rating_bin"]]
rating_bin_dummies = encoder.fit_transform(rating_bin_frame)
content_rating_encoded = pd.DataFrame(
    rating_bin_dummies,
    columns=encoder.get_feature_names_out(["rating_bin"]),
)
content_rating_encoded

In [None]:
# Dtypes and non-null counts of the one-hot encoded rating columns
content_rating_encoded.info()

In [None]:
# X_train summary just before the encoded rating columns are attached
X_train.info()

In [None]:
# Attach the one-hot rating columns to the training features.
# concat(axis=1) aligns rows on index labels, so X_train's index is first replaced with
# 0..n-1 to match the default index of content_rating_encoded.
# NOTE(review): y_train keeps its original index labels in the code shown, so after this
# cell X_train and y_train correspond by position only.
# NOTE(review): re-running this cell appends the rating columns a second time.
X_train = pd.concat([X_train.reset_index(drop=True), content_rating_encoded], axis=1)
X_train

In [None]:
# Run the actor-frequency preprocessing helper on the training features.
# NOTE(review): the helper's output is not visible here (presumably per-actor
# frequency/experience features); confirm in utils.preprocess_util.
# X_train is overwritten with the result, so re-running the cell passes the already
# processed frame back into the helper.
X_train = preproc._actor_frequency(X_train)
X_train

In [None]:
# Add up the three per-actor Facebook like counts into a single feature column
likes_actor_1 = X_train['actor_1_facebook_likes']
likes_actor_2 = X_train['actor_2_facebook_likes']
likes_actor_3 = X_train['actor_3_facebook_likes']
X_train['total_facebook_likes'] = likes_actor_1.add(likes_actor_2).add(likes_actor_3)
X_train

In [None]:
# Remove the per-actor like counts (already summed into total_facebook_likes) together
# with the color, language and movie_title columns
drop_columns = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'color',
    'language',
    'movie_title',
]
X_train = X_train.drop(drop_columns, axis=1)
X_train

In [None]:
# Distribution of the target: histogram (left panel) and boxplot (right panel)
imdb_scores = imdb_df_filtered['imdb_score']
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 7))

# Left panel: 10-bin histogram of the scores
ax1.hist(imdb_scores, bins=10, edgecolor='black')
ax1.set(title='Histogram', xlabel='IMDB score', ylabel='Frequency')

# Right panel: boxplot of the same scores
ax2.boxplot(imdb_scores)
ax2.set(title='Boxplot', ylabel='IMDB score')

# Render the figure
plt.show()

In [None]:
# Side-by-side histograms: gross on the first subplot, budget on the second
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# (axes, column to plot, number of bins, x-axis label) for each panel
histogram_specs = [
    (ax1, 'gross', 100, 'Gross'),
    (ax2, 'budget', 30, 'Budget'),
]
for panel, column, n_bins, x_label in histogram_specs:
    panel.hist(imdb_df_filtered[column], bins=n_bins, edgecolor='black')
    panel.set_title('Histogram')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
# Toy example of mean (target) encoding on a small hand-made frame
data = {
    'Category': list('ABACBCADEC'),
    'Target': [1, 0, 1, 0, 1, 0, 1, 1, 0, 0],
}
df = pd.DataFrame(data)

# Average of Target within each Category (a Series indexed by category label)
target_mean = df['Target'].groupby(df['Category']).mean()

# Replace each row's category label with that category's average target
df['Category_Encoded'] = df['Category'].map(target_mean)