In [None]:
import utils.fetcher_utils as fetcher
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import utils.preprocess_util as preproc


In [None]:
# Show every column when a DataFrame is rendered (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# Load the IMDb data through the project's fetcher helper and preview the first rows.
# NOTE(review): the helper's data source and return schema are not visible here —
# presumably it returns a pandas DataFrame; confirm against utils.fetcher_utils.
imdb_df = fetcher.aquireIMDbDataFrame()
imdb_df.head()

In [None]:
# Columns excluded from the analysis frame
drop_columns = [
    'movie_imdb_link',
    'aspect_ratio',
    'plot_keywords',
]

# Remove those columns, then discard the rows that have no title_year
imdb_df_filtered = (
    imdb_df
    .drop(columns=drop_columns)
    .dropna(subset='title_year')
)


In [None]:
# The filtered frame with its columns re-ordered by sorted column name.
# NOTE(review): imdb_X_train_filtered is not referenced again in the visible
# notebook — confirm whether it is still needed.
imdb_X_train_filtered = imdb_df_filtered[sorted(imdb_df_filtered.columns)]

In [None]:
# Keep rows with gross and budget above 1,000, country 'USA' and title_year
# after 1994, then drop the now-constant 'country' column and renumber the rows.
# Every condition is built from imdb_df_filtered itself so the boolean mask has
# exactly the same index as the frame it filters (a mask built from the
# unfiltered imdb_df has extra rows and relies on implicit reindexing).
# NOTE: this cell overwrites imdb_df_filtered and removes 'country', so it can
# only be run once after the cell that creates imdb_df_filtered.
keep_rows = (
    (imdb_df_filtered['gross'] > 1_000)
    & (imdb_df_filtered['budget'] > 1_000)
    & (imdb_df_filtered['country'] == 'USA')
    & (imdb_df_filtered['title_year'] > 1994)
)
imdb_df_filtered = (
    imdb_df_filtered[keep_rows]
    .drop(columns='country')
    .reset_index(drop=True)
)
imdb_df_filtered

In [None]:
# imdb_df_filtered.dropna(inplace=True)

In [None]:
# The IMDb score is the regression target; every remaining column is a candidate feature
y = imdb_df_filtered['imdb_score']
X = imdb_df_filtered.drop(columns=['imdb_score'])

In [None]:
# Hold out a test set; random_state fixes the shuffle so the split is reproducible.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Preprocess the train and test feature frames with the same pipeline.
# The one-hot encoder and the scaler are fitted on the training split only
# (first pass) and then re-used, already fitted, on the test split, so no
# test-set statistics leak into the features and both splits share one scale.
# NOTE: the loop overwrites X_train / X_test with their processed versions, so
# re-run the train_test_split cell before running this cell a second time.

# Raw columns removed once the derived features exist
drop_columns = ['actor_1_facebook_likes', 'actor_2_facebook_likes','actor_3_facebook_likes','color','language','movie_title']

# Numeric columns that are standardized
columns_to_standardize = ['num_critic_for_reviews', 'duration', 'director_facebook_likes',
                          'gross', 'num_voted_users', 'cast_total_facebook_likes',
                          'facenumber_in_poster', 'num_user_for_reviews', 'budget', 'title_year',
                          'movie_facebook_likes', 'director_frequency', 'total_actor_frequency',
                          'total_facebook_likes']

for counter, df in enumerate([X_train, X_test], start=1):
    is_train = counter == 1

    # Project helpers from utils.preprocess_util.
    # NOTE(review): their implementations are not visible here — presumably they
    # derive director-frequency and genre features; confirm, and confirm whether
    # they compute statistics per split.
    df = preproc._director_frequence(df)
    df = preproc._process_genres(df)

    content_rating_replaced_df = preproc._bucket_contentRatings(df)

    if is_train:
        print('1')
        # handle_unknown='ignore': a rating bin seen only in the test split is
        # encoded as all zeros instead of raising during transform.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoder.fit(content_rating_replaced_df[["rating_bin"]])
    else:
        print('2')

    content_rating_encoded = pd.DataFrame(
        encoder.transform(content_rating_replaced_df[["rating_bin"]]),
        columns=encoder.get_feature_names_out(["rating_bin"]),
    )

    # df's index is reset so it matches the encoded frame's default RangeIndex
    df = pd.concat([df.reset_index(drop=True), content_rating_encoded], axis=1)

    df = preproc._actor_frequency(df)

    df['total_facebook_likes'] = df['actor_1_facebook_likes'] + df['actor_2_facebook_likes'] + df['actor_3_facebook_likes']

    df = df.drop(columns=drop_columns)

    if is_train:
        # Learn the mean / standard deviation from the training split only
        scaler = StandardScaler()
        scaler.fit(df[columns_to_standardize])

    # Apply the training-split scaling to the current split
    scaled_features = scaler.transform(df[columns_to_standardize])

    # Carry df's index so the column-wise concat below lines rows up exactly
    scaled_df = pd.DataFrame(scaled_features, columns=columns_to_standardize, index=df.index)

    df = pd.concat([df.drop(columns=columns_to_standardize), scaled_df], axis=1)

    # Remaining missing values are replaced with the sentinel -1
    df = df.fillna(-1)

    if is_train:
        X_train = df[sorted(df.columns)]
    else:
        X_test = df[sorted(df.columns)]

    print(sorted(df.columns))

    display(df)

In [None]:
# Inspect the processed training features
X_train

In [None]:
# Give the test features exactly the training columns, in the same order.
# Columns that occur only in the test split are dropped; columns that occur
# only in the training split are added and filled with 0.
# NOTE(review): 0 assumes such columns are indicator-style features (e.g. a
# genre flag absent from the split) — confirm against preproc._process_genres.
# Unlike dropping one hard-coded column name, this does not depend on the
# particular random split and is safe to re-run.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test

In [None]:
# Column dtypes and non-null counts of the training features
X_train.info()

In [None]:
# Attach the target (index reset before the column-wise concat) to the training
# features, then rank every column by its correlation with the IMDb score
target_aligned = y_train.reset_index(drop=True)
corr_test = pd.concat([X_train, target_aligned], axis=1)
score_correlations = corr_test.corr()['imdb_score']
score_correlations.sort_values(ascending=False)

In [None]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Return a table of variance inflation factors, one row per column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. ``X.values`` is handed to statsmodels'
        ``variance_inflation_factor``, so the columns are presumably all
        numeric — confirm at the call site.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"variables"`` (the column names of ``X``) and
        ``"VIF"`` (the factor computed for the column at the same position).
    """
    design_matrix = X.values
    vif_scores = [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(X.shape[1])
    ]
    return pd.DataFrame({"variables": X.columns, "VIF": vif_scores})

In [None]:
# VIF of every training feature, highest first
calc_vif(X_train).sort_values("VIF", ascending=False)

In [None]:
# Remove the same two features from both splits (presumably picked from the VIF
# table), then recompute VIF on what is left of the training features
collinear_columns = ['cast_total_facebook_likes', 'rating_bin_R']
X_train_filtered = X_train.drop(columns=collinear_columns)
X_test_filtered = X_test.drop(columns=collinear_columns)
calc_vif(X_train_filtered).sort_values("VIF", ascending=False)

In [None]:
import statsmodels.api as sm

# Fit an ordinary least squares regression with statsmodels; its per-column
# p-values are read from `lr.pvalues` afterwards.
# y_train's index is reset, presumably so it lines up with X_train_filtered's index.
# NOTE(review): sm.OLS does not add an intercept on its own and no constant
# column is added here — confirm that a no-intercept fit is intended.
lr = sm.OLS(y_train.reset_index(drop=True), X_train_filtered).fit()

In [None]:
# Hold the p-values of all columns, sorted in descending order (largest first)
p_values = lr.pvalues.sort_values(ascending=False)
p_values

In [None]:
# A boolean mask keeps only the entries whose p-value is under 0.05
is_significant = p_values < 0.05
select_cols = p_values[is_significant]

# The index carries the names of the retained columns
select_cols.index

In [None]:
# Number of columns that passed the p-value filter
len(select_cols.index)

In [None]:
# Narrow the training features to the columns selected by p-value
X_train_filtered = X_train_filtered.loc[:, select_cols.index]
X_train_filtered

In [None]:
# Narrow the test features to the same p-value-selected columns
X_test_filtered = X_test_filtered.loc[:, select_cols.index]
X_test_filtered

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Fit a linear regression on the selected training features
model = LinearRegression().fit(X_train_filtered, y_train)

# Predict the held-out rows and score them: mean squared error and R^2
y_pred = model.predict(X_test_filtered)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

In [None]:
# Report the linear model's MSE and R^2, one value per line
print(mse, r2, sep='\n')

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression (alpha=1) fitted on the full processed training feature set
lasso_model = Lasso(alpha=1).fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error
y_predicted_lasso = lasso_model.predict(X_test)
lasso_mse = mean_squared_error(y_test, y_predicted_lasso)
print(lasso_mse)
# print(r2_score(y_test, y_predicted_lasso))

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
# Cross-validated ridge regression over a small grid of candidate alphas
candidate_alphas = [0.001, 0.01, 0.1, 1, 10]
model_cv = RidgeCV(alphas=candidate_alphas).fit(X_train, y_train)

In [None]:
# Display the alpha selected by RidgeCV from the candidate list
model_cv.alpha_

In [None]:
# Ridge regression using the alpha selected by RidgeCV
model2 = Ridge(alpha=model_cv.alpha_)

# Train the model on the full processed training feature set
model2.fit(X_train, y_train)

# Predict the held-out rows and report both metrics.
# The MSE is printed explicitly: a bare expression in the middle of a cell is
# evaluated and then discarded, so it would never be shown.
y_predicted2 = model2.predict(X_test)
print(mean_squared_error(y_test, y_predicted2))
print(r2_score(y_test, y_predicted2))

# Label Encoder

In [None]:
# Distribution of the IMDb score: histogram (left panel) and boxplot (right panel)
scores = imdb_df_filtered['imdb_score']
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 7))

# Left panel: 10-bin histogram
ax1.hist(scores, bins=10, edgecolor='black')
ax1.set(title='Histogram', xlabel='IMDB score', ylabel='Frequency')

# Right panel: boxplot of the same values
ax2.boxplot(scores)
ax2.set(title='Boxplot', ylabel='IMDB score')

# Render the figure
plt.show()

In [None]:
# Side-by-side histograms of gross (left panel) and budget (right panel)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# One entry per panel: (axes, column to plot, number of bins, x-axis label)
panels = [
    (ax1, 'gross', 100, 'Gross'),
    (ax2, 'budget', 30, 'Budget'),
]
for ax, column, n_bins, x_label in panels:
    ax.hist(imdb_df_filtered[column], bins=n_bins, edgecolor='black')
    ax.set_title('Histogram')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Render the figure
plt.show()