# Prediction model

1) Data wrangling (null management, number formatting)
2) Train data for the model (we only manage to catch 20k movies with the info inside the playlists, therefore this is the new length of the model)
3) Model for single suggestion and for multiple suggestions.
4) Streamlit prototype

## Pre-processing 

In [1]:
import pandas as pd

In [29]:
# prediction_model = pd.read_csv('C:/Users/benja/Downloads/Test_48_2.csv', index_col=False)

In [6]:
# the length of some playlsit was smaller than 48 so it fill automatically with null values.
prediction_model = prediction_model.fillna(0)

In [8]:
# prediction_model.iloc[:, 1:] = prediction_model.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')
# Drop rows where all values (except the first column) are zeros
pred_model_shrink = prediction_model.loc[~(prediction_model.iloc[:, 1:].sum(axis=1) == 0)]

print(f"Remaining rows after dropping rows with only zero values: {len(pred_model_shrink)}")


Remaining rows after dropping rows with only zero values: 21103


In [30]:
pred_model_shrink.head(60)

Unnamed: 0,Titles,https://mubi.com/en/lists/the-top-1000,https://mubi.com/en/lists/101-directors-essential-films,https://mubi.com/en/lists/the-best-films-of-every-year,https://mubi.com/en/lists/edgar-wrights-favorite-movies,https://mubi.com/en/lists/hysterical-in-a-floral-dress,https://mubi.com/en/lists/_zyx-erotica,https://mubi.com/en/lists/essential-films-by-women,https://mubi.com/en/lists/forget-filmschool-learn-from-this,https://mubi.com/en/lists/fox-and-his-queer-friends,...,https://mubi.com/en/lists/inner-city-blues,https://mubi.com/en/lists/lgbt--6,https://mubi.com/en/lists/viva-chile-mierda,https://mubi.com/en/lists/best-horror-movies-ever,https://mubi.com/en/lists/palme-dor-winners,https://mubi.com/en/lists/poland-b61f3ad1-9c0d-4631-9270-fad57f8e4b22,https://mubi.com/en/lists/the-cult-canon,https://mubi.com/en/lists/mubi-2018-ee2fc5cb-5928-4e13-b246-a7936f502e5b,https://mubi.com/en/lists/ladies-almanack-cinema-lesbians,https://mubi.com/en/lists/explore-czech-cinema
0,LA ANTENA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,IT'S WINTER,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,THE PERFUME OF THE LADY IN BLACK,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RIVIERA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,THE RETURN,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,IL GRIDO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,THE GENERAL,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,LES BONNES FEMMES,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,COPS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,THE WIND OF THE NIGHT,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
pred_model_shrink.shape

(21103, 1001)

## Single Suggestion Model

In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similar_movies(pred_model_shrink):
    # Step 0: Ask for user input
    input_title = input("Please enter a movie title that you like: ")
    
    # Step 1: Check for title similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(pred_model_shrink['Titles'])
    input_tfidf = vectorizer.transform([input_title])
    similarities = cosine_similarity(input_tfidf, tfidf_matrix)
    
    # Find the index of the movie title that is most similar to the input
    similar_index = np.argmax(similarities)
    
    # Check if similarity is above 90%
    if similarities[0][similar_index] < 0.9:
        return "Sorry, I don't know that movie at the moment."
    
    # Step 2: Filter DataFrame
    # Get the columns where the input title has a '1'
    input_movie_features = pred_model_shrink.iloc[similar_index, 1:]
    columns_with_ones = input_movie_features[input_movie_features == 1].index
    
    # Filter out movies that don't have at least one '1' in the same columns
    filtered_df = pred_model_shrink.loc[(pred_model_shrink[columns_with_ones] == 1).any(axis=1)]
    
    # If the filtered DataFrame is empty, return the message
    if filtered_df.empty:
        return "I am sorry I can't help at the moment. Try with another movie."
    
    # Step 3: Run suggestion model
    # Sum the features for each movie in the filtered DataFrame
    sum_of_features = filtered_df.iloc[:, 1:].sum(axis=1)
    
    # Find the movie(s) with the maximum sum of features
    max_feature_value = np.max(sum_of_features)
    movies_with_max_features = filtered_df[sum_of_features == max_feature_value]['Titles']
    
    # Step 4: Return results
    # If there's more than one, return all movies
    suggested_movies = movies_with_max_features.tolist()
    
    return suggested_movies

# Example usage:
# suggestions = find_similar_movies(pred_model_shrink)
# print(suggestions)


In [33]:
suggestion = find_similar_movies(pred_model_shrink)

Please enter a movie title that you like: Godfather


In [34]:
print('for sure you will like', suggestion)

for sure you will like ['MAN WITH A MOVIE CAMERA']


## Multiple Suggestion Model

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similar_movies(pred_model_shrink):
    # Step 0: Ask for user input
    input_title = input("Please enter a movie title that you like: ")
    
    # Step 1: Check for title similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(pred_model_shrink['Titles'])
    input_tfidf = vectorizer.transform([input_title])
    similarities = cosine_similarity(input_tfidf, tfidf_matrix)
    
    # Find the index of the movie title that is most similar to the input
    similar_index = np.argmax(similarities)
    
    # Check if similarity is above 90%
    if similarities[0][similar_index] < 0.9:
        return "Sorry, I don't know that movie at the moment."
    
    # Step 2: Filter DataFrame
    # Get the columns where the input title has a '1'
    input_movie_features = pred_model_shrink.iloc[similar_index, 1:]
    columns_with_ones = input_movie_features[input_movie_features == 1].index
    
    # Filter out movies that don't have at least one '1' in the same columns
    filtered_df = pred_model_shrink.loc[(pred_model_shrink[columns_with_ones] == 1).any(axis=1)]
    
    # If the filtered DataFrame is empty, return the message
    if filtered_df.empty:
        return "I am sorry I can't help at the moment. Try with another movie."
    
    # Step 3: Run suggestion model
    # Sum the features for each movie in the filtered DataFrame
    filtered_df['sum_of_features'] = filtered_df.iloc[:, 1:].sum(axis=1)
    
    # Sort the DataFrame based on the sum of features in descending order
    sorted_df = filtered_df.sort_values(by='sum_of_features', ascending=False)
    
    # Step 4: Return results
    # Return the top three movies
    top_movies = sorted_df.head(3)['Titles'].tolist()
    
    return top_movies

# Example usage:
# suggestions = find_similar_movies(pred_model_shrink)
# print(suggestions)


In [39]:
suggestion = find_similar_movies(pred_model_shrink)

Please enter a movie title that you like: lake tahoe


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['sum_of_features'] = filtered_df.iloc[:, 1:].sum(axis=1)


In [40]:
print('for sure you will like', suggestion)

for sure you will like ['PERSONA', 'ELEPHANT', 'DONNIE DARKO']


## Streamlit Prototype

In [42]:
# pip install streamlit

Collecting streamlitNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/0e/86/69fdac2ec6852304bda08e5af5b72dfa6e74dc0b3cef0d7c1e19994388ae/streamlit-1.35.0-py2.py3-none-any.whl.metadata
  Downloading streamlit-1.35.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Obtaining dependency information for altair<6,>=4.0 from https://files.pythonhosted.org/packages/46/30/2118537233fa72c1d91a81f5908a7e843a6601ccc68b76838ebc4951505f/altair-5.3.0-py3-none-any.whl.metadata
  Downloading altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Obtaining dependency information for blinker<2,>=1.0.0 from https://files.pythonhosted.org/packages/bb/2a/10164ed1f31196a2f7f3799368a821765c62851ead0e630ab52b8e14b4d0/blinker-1.8.2-py3-none-any.whl.metadata
  Downloading blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Collect

In [43]:
import streamlit as st
# Streamlit interface
st.title('Movie Suggester App')

# User input
input_title = st.text_input("Please enter a movie title that you like: ")

# Button to run the suggestion model
if st.button('Find Similar Movies'):
    if input_title:  # Check if the input is not empty
        suggestions = find_similar_movies(pred_model_shrink, input_title)
        if isinstance(suggestions, list):
            st.write('For sure you will like')
            for movie in suggestions:
                st.write(movie)
        else:
            st.write(suggestions)
    else:
        st.write("Please enter a movie title to get suggestions.")

2024-05-31 10:37:41.032 
  command:

    streamlit run C:\Users\benja\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-05-31 10:37:41.033 Session state does not function when running a script without `streamlit run`
