<a href="https://colab.research.google.com/github/BLayus/imdb_score_prediction/blob/main/Streamlit_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bibliotecas

In [1]:
!pip install streamlit

!pip install --upgrade scikit-learn



In [23]:
# Import Streamlit
import streamlit as st

# Import libraries
import pandas as pd
import numpy as np
import re
from datetime import datetime
import datetime as dt
import requests

# Machine Learning Libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

import pickle



## Dataset

In [24]:
# Page setup: use the 'wide' layout for the app
st.set_page_config(layout='wide')


@st.cache_data
def get_data(path):
  """Read the CSV at `path` with pandas and return it as a DataFrame.

  The function is wrapped in `st.cache_data`, so Streamlit caches its result.
  """
  return pd.read_csv(path)

2024-07-02 19:41:50.117 No runtime found, using MemoryCacheStorageManager


In [25]:
# Load the dataset CSV from the project's GitHub repository

path = (
    'https://raw.githubusercontent.com/BLayus/imdb_score_prediction'
    '/main/dataset/Dataset_imdb.csv'
)
df = get_data(path)

## Pre-processing

In [26]:
# Rows whose Released_Year holds the string 'PG' have no usable year (per the original
# note this is the 'Apollo 13' entry, released in 1995 according to its official website).
# Patch those rows in place with .loc.

df.loc[df['Released_Year'].eq('PG'), 'Released_Year'] = 1995

In [27]:
# Convert Released_Year to datetime year only

def convert_datetime(df):
  """Replace `Released_Year` with the year parsed from it and return the same frame.

  Values are parsed with the '%Y' format; values that cannot be parsed are coerced
  to missing. The column is overwritten in place on `df`.
  """
  parsed_dates = pd.to_datetime(df['Released_Year'], format='%Y', errors='coerce')
  df['Released_Year'] = parsed_dates.dt.year
  return df

In [28]:
# Converting strings in column "Runtime" to int 64 and removing substring 'min'

def convert_runtime(df):
  """Convert `Runtime` from text such as '142 min' to int64 minutes, in place.

  The first run of digits of every value is kept and cast to int64. A column that
  is already numeric is left untouched. Returns the same DataFrame.
  """
  runtime = df['Runtime']

  # Treat every non-numeric column as text: this covers `object` columns as well as
  # pandas' dedicated string dtypes, which a plain `dtype == 'object'` check misses.
  if not pd.api.types.is_numeric_dtype(runtime):
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    df['Runtime'] = runtime.str.extract(r'(\d+)', expand=False).astype('int64')
  return df

In [29]:
# Normalise Gross to int64: strip separators from text values, treat missing as 0

def convert_gross(df):
  """Convert the `Gross` column to int64 in place and return the same DataFrame.

  Text values have every character that is not a word character or whitespace
  removed (e.g. the commas in '28,341,469') before being parsed. Values that are
  missing or cannot be parsed become 0.
  """
  gross = df['Gross']

  # Only text needs cleaning. Numeric input must not be stringified: stripping
  # punctuation would also remove a float's decimal point (1000.0 -> '10000').
  if not pd.api.types.is_numeric_dtype(gross):
    gross = gross.astype(str).str.replace(r'[^\w\s]', '', regex=True)

  # Converting gross type to numerical and fill NaN
  df['Gross'] = pd.to_numeric(gross, errors='coerce').fillna(0).astype('int64')
  return df

In [30]:
# Impute missing values: Certificate with its mode, Meta_score with its median

def simple_imputer(df):
  """Fill NaN values in `Certificate` and `Meta_score` in place; return the same frame.

  Each column gets its own `SimpleImputer`, fitted on the column itself:
  'most_frequent' for `Certificate`, 'median' for `Meta_score`.
  """
  strategy_by_column = {
      'Certificate': 'most_frequent',
      'Meta_score': 'median',
  }

  for column, strategy in strategy_by_column.items():
    column_frame = df[[column]]  # 2-D selection, as the imputer is fitted on a frame
    fitted = SimpleImputer(missing_values=np.nan, strategy=strategy).fit(column_frame)
    df[column] = fitted.transform(column_frame).flatten()

  return df

In [31]:
# Certificate grouping and converting to numerical info
# As this grouping has low cardinality, we can use One Hot Encoder

def certificate_groups(df):
  """Collapse the raw `Certificate` values into five age groups, in place.

  Certificates that are not listed below (including missing values) fall into
  'adult_group'. Values that already are one of the group labels are kept, so
  calling the function twice on the same frame does not change the result.
  Returns the same DataFrame.

  NOTE(review): the previous version compared each value to a whole list
  (`x == [...]`), which is never true, so every row ended up in 'adult_group'.
  A model trained on that collapsed feature should be re-checked or retrained.
  """
  group_by_certificate = {
      'U': 'all_age_group',
      'G': 'all_age_group',
      'Passed': 'all_age_group',
      'Approved': 'all_age_group',
      'PG': 'accompanied_age_group',
      'TV-PG': 'accompanied_age_group',
      'U/A': 'accompanied_age_group',
      'GP': 'accompanied_age_group',
      'PG-13': '14_years_group',
      'TV-14': '14_years_group',
      '16': '16_years_group',
      'R': '16_years_group',
  }
  fallback_group = 'adult_group'
  group_labels = set(group_by_certificate.values()) | {fallback_group}

  def to_group(certificate):
    # Already grouped (e.g. the cell was re-run): leave the label as it is.
    if certificate in group_labels:
      return certificate
    return group_by_certificate.get(certificate, fallback_group)

  df['Certificate'] = df['Certificate'].apply(to_group)
  return df

In [32]:
# Convert column dtypes

def convert_dtypes(df):
  """Cast `Released_Year` and `Gross` to int in place and return the same DataFrame."""
  for column in ('Released_Year', 'Gross'):
    df[column] = df[column].astype(int)
  return df

In [33]:
# Drop Unnecessary columns

def drop_cols(df):
  """Remove the `Series_Title` and `Overview` columns from `df` in place.

  Columns that are not present are skipped, so the function can be applied to a
  frame that was already processed or that never had them. Returns the same
  DataFrame.
  """
  # Local name differs from the function name to avoid shadowing it.
  unused_columns = ['Series_Title', 'Overview']

  # inplace is kept on purpose: pre_encoder relies on the frame being mutated.
  df.drop(columns=unused_columns, inplace=True, errors='ignore')
  return df

In [34]:
# Convert column dtypes

def convert_dtypes(df):
  """Cast `Released_Year` and `Gross` to int in place and return the same DataFrame.

  NOTE(review): an identical `convert_dtypes` is defined in an earlier cell of this
  notebook; this later definition is the one that stays in effect.
  """
  for column in ('Released_Year', 'Gross'):
    df[column] = df[column].astype(int)
  return df

In [35]:
# Drop Unnecessary columns

def drop_cols(df):
  """Remove the `Series_Title` and `Overview` columns from `df` in place.

  Columns that are not present are skipped, so the function can be applied to a
  frame that was already processed or that never had them. Returns the same
  DataFrame.

  NOTE(review): `drop_cols` is also defined in an earlier cell of this notebook;
  this later definition is the one that stays in effect.
  """
  # Local name differs from the function name to avoid shadowing it.
  unused_columns = ['Series_Title', 'Overview']

  # inplace is kept on purpose: pre_encoder relies on the frame being mutated.
  df.drop(columns=unused_columns, inplace=True, errors='ignore')
  return df

In [36]:
# Bundle all the pre-processing steps that run before encoding

def pre_encoder(df):
  """Apply every pre-encoding step to `df`, in order, and return the same frame.

  Each step mutates `df` in place; their return values are not used.
  """
  steps = (
      convert_datetime,
      convert_runtime,
      convert_gross,
      simple_imputer,
      certificate_groups,
      drop_cols,
      convert_dtypes,
  )
  for step in steps:
    step(df)

  return df


In [37]:
# Run the pre-processing steps on the dataset (pre_encoder returns the frame it was
# given), then preview three random rows

df = pre_encoder(df)

df.sample(3)

Unnamed: 0.1,Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
744,745,2013,adult_group,91,"Drama, Sci-Fi, Thriller",7.7,96.0,Alfonso Cuarón,Sandra Bullock,George Clooney,Ed Harris,Orto Ignatiussen,769145,274092705
793,794,2001,adult_group,95,"Comedy, Drama, Music",7.7,85.0,John Cameron Mitchell,John Cameron Mitchell,Miriam Shor,Stephen Trask,Theodore Liscinski,31957,3029081
419,420,1972,adult_group,138,"Mystery, Thriller",8.0,79.0,Joseph L. Mankiewicz,Laurence Olivier,Michael Caine,Alec Cawthorne,John Matthews,44748,4081254


In [38]:
# Columns handled by each encoder

ohe_cols = ['Certificate']

te_cols = [
    'Released_Year',
    'Genre',
    'Director',
    'Star1',
    'Star2',
    'Star3',
    'Star4',
]

# Instantiate the encoders

ohe_enc = OneHotEncoder(handle_unknown='ignore')

target_enc = TargetEncoder(smooth='auto', target_type='continuous')

In [39]:
# Column transformer: one-hot encoding for `ohe_cols`, target encoding for `te_cols`;
# every other column is passed through

col_trans = make_column_transformer(
    (ohe_enc, ohe_cols),
    (target_enc, te_cols),
    remainder='passthrough',
)



In [40]:
# Creating a pipeline to encode

def passthrough_func(X):
    """Turn the column transformer output into an all-float DataFrame.

    Parameters
    ----------
    X : array-like or sparse matrix
        Anything `pd.DataFrame` accepts. Objects exposing `toarray()` (sparse
        matrices) are densified first.

    Returns
    -------
    pd.DataFrame
        Same shape as `X`, default integer column labels, every column cast to float.
    """
    # A column transformer can emit a sparse matrix when one-hot columns dominate;
    # pd.DataFrame cannot be built column-wise from that, so densify first.
    if hasattr(X, 'toarray'):
        X = X.toarray()

    # Single vectorised cast instead of a per-column loop (debug print removed).
    return pd.DataFrame(X).astype(float)

# Two-step pipeline: encode the columns, then cast the result to a float DataFrame.
# The step names are the ones make_pipeline derives from the class names.
enc_pipeline = Pipeline(steps=[
    ('columntransformer', col_trans),
    ('functiontransformer', FunctionTransformer(passthrough_func, validate=False)),
])

## Streamlit app

In [41]:
# App header and a one-line instruction for the user

st.title('IMDB Rating Prediction API')
st.write('Enter the movie details to predict the IMDB rating:')

In [42]:
# Create a form to input movie data

with st.form('Preencha os Dados'):
    Series_Title = st.text_input('Title')
    # Integer bounds/steps make the number inputs return ints. The bare default is the
    # float 0.0, which cannot be parsed as a '%Y' year by convert_datetime.
    # NOTE(review): bounds and start values below are chosen defaults - adjust as needed.
    Released_Year = st.number_input('Year (YYYY format)',
                                    min_value=1900, max_value=2100, value=2000, step=1)
    Certificate = st.text_input('Certificate')
    Runtime = st.number_input('Runtime (in minutes)', min_value=1, value=100, step=1)
    Genre = st.text_input('Genre')
    Meta_Score = st.number_input('Meta Score', min_value=0, max_value=100, value=50, step=1)
    Director = st.selectbox('Director', df['Director'].unique())
    Star_1 = st.selectbox('Star 1', df['Star1'].unique())
    Star_2 = st.selectbox('Star 2', df['Star2'].unique())
    Star_3 = st.selectbox('Star 3', df['Star3'].unique())
    Star_4 = st.selectbox('Star 4', df['Star4'].unique())
    No_of_Votes = st.number_input('No of Votes', min_value=0, value=0, step=1)
    Gross = st.number_input('Gross', min_value=0, value=0, step=1)

    # A form only sends its widget values to the script when a submit button placed
    # inside it is pressed; without one the inputs never reach `df_pred`.
    details_submitted = st.form_submit_button('Save movie details')

# Single-row frame for the prediction. Column names must match the training frame
# (`Meta_score`, `Star1` ... `Star4`); `Series_Title` and `Overview` are included
# because drop_cols removes exactly those two columns during pre-processing.
df_pred = pd.DataFrame({'Series_Title': [Series_Title],
                        'Released_Year': [Released_Year],
                        'Certificate': [Certificate],
                        'Runtime': [Runtime],
                        'Genre': [Genre],
                        'Overview': [''],
                        'Meta_score': [Meta_Score],
                        'Director': [Director],
                        'Star1': [Star_1],
                        'Star2': [Star_2],
                        'Star3': [Star_3],
                        'Star4': [Star_4],
                        'No_of_Votes': [No_of_Votes],
                        'Gross': [Gross]
                        })


2024-07-02 19:42:16.740 Session state does not function when running a script without `streamlit run`


In [43]:
# Download and open the saved pickle file

pickle_url = 'https://github.com/BLayus/imdb_score_prediction/raw/main/Model/model_pickle'


@st.cache_resource
def load_model(url, local_path='model_pickle'):
    """Download the pickled model from `url`, store it at `local_path` and load it.

    Wrapped in `st.cache_resource` so the download and unpickling are not repeated
    on every rerun of the script.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=30)
    # Fail on an HTTP error instead of trying to unpickle an error page.
    response.raise_for_status()

    with open(local_path, 'wb') as f:
        f.write(response.content)

    # SECURITY: unpickling executes code contained in the file. Only load this
    # from a source you control.
    with open(local_path, 'rb') as f:
        return pickle.load(f)


mp = load_model(pickle_url)



In [44]:
# Create a button to make the prediction

if st.button("Predict IMDB Rating"):

  # Every column of the training frame that the form did not provide (the target
  # among them) is added as a placeholder so the input row has the same schema.
  for col in df.columns:
    if col not in df_pred.columns:
      df_pred[col] = 1

  # Pre process input data point
  pre_encoder(df_pred)

  # The encoding pipeline is not fitted anywhere else in this script, so learn the
  # encodings from the (already pre-processed) training frame before transforming.
  # NOTE(review): assumes the pickled model was trained on features encoded this way.
  target_col = 'IMDB_Rating'
  x_train = df.drop(columns=[target_col])
  enc_pipeline.fit(x_train, df[target_col])

  # Feature row in training column order; this also drops the target placeholder
  x_p = df_pred[x_train.columns]

  # Applying pipeline
  x_p = enc_pipeline.transform(x_p)

  # Make prediction with pickle and show it to the user
  # (presumably `predict` returns one value per input row - confirm for the pickled model)
  prediction = mp.predict(x_p)
  st.success(f'Predicted IMDB rating: {float(prediction[0]):.2f}')



