In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import utils.fetcher_utils as fetcher


In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [None]:
# Load the IMDb data through the project's fetcher helper and preview the first rows.
# NOTE(review): the helper presumably returns a pandas DataFrame (it is used as one below) — confirm.
imdb_df = fetcher.aquireIMDbDataFrame()
imdb_df.head()

In [None]:
# Columns removed up front: the IMDb link, the aspect ratio and the plot keywords
drop_columns = ['movie_imdb_link', 'aspect_ratio', 'plot_keywords']

# Build the working frame: drop those columns, then discard rows without a release year
imdb_df_filtered = (
    imdb_df
    .drop(columns=drop_columns)
    .dropna(subset=['title_year'])
)


In [None]:
# Put the columns in alphabetical order so related fields sit next to each other
imdb_df_filtered = imdb_df_filtered.sort_index(axis=1)

In [None]:
# Keep only rows whose gross and budget both exceed 1,000, whose country is 'USA'
# and whose title_year is after 1994. Comparisons against NaN evaluate to False,
# so rows with a missing gross or budget are removed as well.
# Every condition is built from imdb_df_filtered itself, so the boolean mask shares
# the frame's index and pandas never has to re-align it against another frame.
row_filter = (
    (imdb_df_filtered['gross'] > 1_000)
    & (imdb_df_filtered['budget'] > 1_000)
    & (imdb_df_filtered['country'] == 'USA')
    & (imdb_df_filtered['title_year'] > 1994)
)

# 'country' holds a single value after the filter, so it carries no information
imdb_df_filtered = imdb_df_filtered[row_filter].drop(columns='country')
imdb_df_filtered.head(5)

In [None]:
# Tidy every object-dtype column: missing values become 'unknown' and
# surrounding whitespace is trimmed. The name of each cleaned column is printed.
object_columns = [
    name for name in imdb_df_filtered.columns
    if imdb_df_filtered[name].dtype == 'object'
]
for name in object_columns:
    print(name)
    filled = imdb_df_filtered[name].fillna('unknown')
    imdb_df_filtered[name] = filled.apply(lambda value: value.strip())

In [None]:
# Replace every remaining missing value in the frame with the sentinel -1
imdb_df_filtered = imdb_df_filtered.fillna(-1)

In [None]:
# Sanity check: number of missing values per column
imdb_df_filtered.isna().sum()

In [None]:
# Gather the three actor-name columns, and the three matching like counts,
# into one Python list per row
actor_name_columns = ['actor_1_name', 'actor_2_name', 'actor_3_name']
actor_likes_columns = ['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']
imdb_df_filtered['actors'] = imdb_df_filtered[actor_name_columns].agg(list, axis=1)
imdb_df_filtered['actor_facebook_likes'] = imdb_df_filtered[actor_likes_columns].agg(list, axis=1)

# Explode both list columns in lockstep: each original row becomes one row per
# list element, and the result gets a fresh 0..n-1 index
imdb_df_filtered = imdb_df_filtered.explode(
    column=['actors', 'actor_facebook_likes'],
    ignore_index=True,
)


imdb_df_filtered

In [None]:
# Frequency-encode the actor and director names: each name is replaced by the
# number of rows in which it occurs in the current frame.
frequency_targets = [
    ('actors', 'actors_Encoded'),
    ('director_name', 'directors_Encoded'),
]
for source_column, encoded_column in frequency_targets:
    # Row count per distinct name
    frequency = imdb_df_filtered[source_column].value_counts()
    # Look each row's name up in the counts
    imdb_df_filtered[encoded_column] = imdb_df_filtered[source_column].map(frequency)
imdb_df_filtered

In [None]:
# Alias, not a copy: processed_df and imdb_df_filtered refer to the same DataFrame object
processed_df = imdb_df_filtered

In [None]:
# Separate the prediction target (the IMDb score) from the feature columns
target_column = 'imdb_score'
y = processed_df[target_column]
X = processed_df.drop(columns=[target_column])

In [None]:
# After the explode step every movie occupies several rows (one per actor) that
# share the same imdb_score and nearly all feature values. A plain row-wise random
# split would scatter those copies over both sides and leak the target from train
# into test, so the split is done by movie title: all rows of a title land on the
# same side. test_size=0.25 mirrors train_test_split's default proportion (applied
# to groups rather than rows); random_state keeps the split reproducible.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_positions, test_positions = next(
    group_splitter.split(X, y, groups=X['movie_title'])
)
X_train, X_test = X.iloc[train_positions], X.iloc[test_positions]
y_train, y_test = y.iloc[train_positions], y.iloc[test_positions]

In [None]:
# One-hot encode the low-cardinality categorical columns of the training features.
# Each source column is replaced by integer indicator columns named <column>_<value>,
# appended at the end of the frame.
dummy_columns = ['content_rating', 'color', 'language']
for col in dummy_columns:
    print(col)
    indicators = pd.get_dummies(X_train[col], prefix=col, prefix_sep='_', dtype='int')
    X_train = pd.concat([X_train.drop(columns=[col]), indicators], axis=1)
X_train

In [None]:
# X_train.info()

In [None]:
# X_train.describe()

In [None]:
# Raw columns to remove from the training features: the per-actor names and like
# counts, the director name, the exploded actor name and the movie title
drop_columns = [
    'actor_1_facebook_likes', 'actor_1_name',
    'actor_2_facebook_likes', 'actor_2_name',
    'actor_3_facebook_likes', 'actor_3_name',
    'director_name',
    'actors',
    'movie_title',
]

In [None]:
# Remove the columns listed in drop_columns and renumber the rows 0..n-1
X_train = X_train.drop(columns=drop_columns).reset_index(drop=True)
# Renumber the target the same way: the rows are still in the same order, so this
# keeps X_train and y_train aligned by index label and not only by position
y_train = y_train.reset_index(drop=True)
X_train

In [None]:
# Multi-hot encode the pipe-separated genre strings: one 0/1 indicator column per
# distinct genre, then swap the raw 'genres' column for those indicators
genre_dummies = X_train['genres'].str.get_dummies(sep='|')
X_train = pd.concat([X_train.drop(columns=['genres']), genre_dummies], axis=1)
X_train

In [None]:
# Distribution of the actor frequency encoding: how many rows carry each count value
imdb_df_filtered['actors_Encoded'].value_counts()

# Label Encoder

In [None]:
# TODO: decide whether encoding should happen before or after splitting the data.
# FIXME: this cell still needs to be fixed.

# label_encoder = LabelEncoder()
# object_columns = imdb_df_filtered.select_dtypes(include='object').columns

# for column in object_columns:
#     encoded_column = f"{column}_encoded"  
#     imdb_df_filtered[encoded_column] = label_encoder.fit_transform(imdb_df_filtered[column])

In [None]:
# imdb_df_filtered['director_name'].value_counts().head(50)

In [None]:
# imdb_df_numeric = imdb_df_filtered.select_dtypes(include='number')
# imdb_df_numeric

In [None]:
# Distribution of the IMDb score: histogram on the left, boxplot on the right
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 7))
scores = imdb_df_filtered['imdb_score']

# Left panel: histogram with 10 bins
hist_ax.hist(scores, bins=10, edgecolor='black')
hist_ax.set_title('Histogram')
hist_ax.set_xlabel('IMDB score')
hist_ax.set_ylabel('Frequency')

# Right panel: boxplot of the same values
box_ax.boxplot(scores)
box_ax.set_title('Boxplot')
box_ax.set_ylabel('IMDB score')

# Render the figure
plt.show()

In [None]:
# Side-by-side histograms: gross on the left (100 bins), budget on the right (30 bins)
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
histogram_panels = [
    ('gross', 100, 'Gross'),
    ('budget', 30, 'Budget'),
]
for panel_ax, (source_column, bin_count, axis_label) in zip(axes, histogram_panels):
    panel_ax.hist(imdb_df_filtered[source_column], bins=bin_count, edgecolor='black')
    panel_ax.set_title('Histogram')
    panel_ax.set_xlabel(axis_label)
    panel_ax.set_ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
# gross_na_rows = imdb_df_filtered.loc[imdb_df_filtered['gross'].isna()]
# movie_list = gross_na_rows['movie_title'].apply(lambda x: x.strip('\xa0')).unique()


# million_df = tmdb_df_filtered.loc[tmdb_df_filtered['original_title'].isin(movie_list)].sort_values(by='original_title')
# million_df

# million_df = tmdb_df_filtered[tmdb_df_filtered['original_title'].str.lower().isin([x.lower() for x in movie_list])]
# million_df


In [None]:
# Toy example of target (mean) encoding on a small hand-made frame
data = {
    'Category': list('ABACBCADEC'),
    'Target': [1, 0, 1, 0, 1, 0, 1, 1, 0, 0],
}
df = pd.DataFrame(data)

# Average target value per category
target_mean = df['Target'].groupby(df['Category']).mean()

# Attach each row's category average as a new feature column
df = df.assign(Category_Encoded=df['Category'].map(target_mean))

In [None]:
# Number of distinct values among the per-actor_1_name mean IMDb scores,
# i.e. how many different values a mean encoding of actor_1_name would take
len(set(imdb_df_filtered.groupby('actor_1_name')['imdb_score'].mean().values))

In [None]:
# Display the per-category target means from the toy example
target_mean

In [None]:
# Display the toy frame, including its Category_Encoded column
df