# Tarantino Movie Query App
This notebook contains code to create a simple PyQt5 app that allows users to query information about Tarantino movies using OpenAI's GPT model and precomputed embeddings of movie scripts.

In [1]:
import pandas as pd
import ast
import openai
from scipy import spatial
import tiktoken
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QPushButton, QComboBox, QTextBrowser, QLineEdit, QLabel

## Constants and Data Loading
Define constants and load the Tarantino embeddings dataset. Ensure to replace the API key and file path with your actual values.

In [2]:
# Define constants
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
FILE_PATH = '/Users/charlieevert/Desktop/ai_projects/structured_data_querying/tarantino/tarantino_embeddings.csv'
openai.api_key = "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" 

# Load the Tarantino embeddings dataset
tarantino_df = pd.read_csv(FILE_PATH)

# Convert embeddings from CSV string type back to list type
tarantino_df['Embedding'] = tarantino_df['Embedding'].apply(ast.literal_eval)

In [3]:
print(tarantino_df)

      Unnamed: 0               Name  Year  Page  \
0              0        Death Proof  2007     1   
1              1        Death Proof  2007     2   
2              2        Death Proof  2007     3   
3              3        Death Proof  2007     4   
4              4        Death Proof  2007     5   
...          ...                ...   ...   ...   
1609        1609  the-hateful-eight  2014   164   
1610        1610  the-hateful-eight  2014   165   
1611        1611  the-hateful-eight  2014   166   
1612        1612  the-hateful-eight  2014   167   
1613        1613  the-hateful-eight  2014   168   

                                                   Text  \
0     . __ ,, . -- .. :. ::. ' -.. . --., .. ~ ~. . ...   
1     ..  .:, .....,; . This script is dedicated to ...   
2     ... .-.,,-..~---''.~--.,.:-.-. . ... . ---~,. ...   
3     . . .. , . :. , . . : .... : -~----.-.- - ' ,,...   
4     . . --~..:.., ~. ., :.:-  .. . ... . --.., ......   
...                              

## Helper Functions
Define helper functions that will be used to interact with OpenAI API and calculate relatedness between strings.

In [4]:
def num_tokens(text: str, df: pd.DataFrame = tarantino_df) -> int:
    """
    Retrieve the number of tokens for a text from the DataFrame.
    """
    # Find the row in the DataFrame that matches the text and return its token count
    matching_row = df[df['Text'] == text]
    if not matching_row.empty:
        return int(matching_row['Total Tokens'].values[0])
    return 0

def strings_ranked_by_relatedness(query: str, df: pd.DataFrame, top_n: int = 5) -> tuple[list[str], list[float], list[int]]:
    """
    Return a list of strings, relatednesses, and page numbers, sorted from most related to least.
    """
    # Get the embedding for the query
    query_embedding_response = openai.Embedding.create(model=EMBEDDING_MODEL, input=query)
    query_embedding = query_embedding_response["data"][0]["embedding"]
    
    # Calculate the relatedness of each string in the DataFrame to the query
    strings_and_relatednesses_and_pages = [
        (f"{row['Text']}",
         1 - spatial.distance.cosine(query_embedding, row["Embedding"]),
         row['Page'])
        for i, row in df.iterrows()
    ]
    
    # Sort by relatedness and return the top_n results
    strings_and_relatednesses_and_pages.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses, pages = zip(*strings_and_relatednesses_and_pages)
    return strings[:top_n], relatednesses[:top_n], pages[:top_n]

def query_message(query: str, df: pd.DataFrame = tarantino_df, model: str = GPT_MODEL, token_budget: int = 3596) -> str:
    """
    Construct a message for GPT with relevant source texts pulled from a DataFrame.
    """
    strings, _, pages = strings_ranked_by_relatedness(query, df)
    introduction = ('Use the below movie scripts to answer the subsequent question. '
                    'If the answer cannot be found in the scripts, write "I could not find an answer." '
                    'Cite all sources in brackets with [Movie Name: <Name>, Year: <Year>, Page: <Page>]. '
                    'Do not simply state that a script section is the answer, cite the information for each '
                    'script section used in the results. Explain why the source answers the question.')
    question = f"\n\nQuestion: {query}"
    message = introduction
    
    # Add relevant script sections to the message until the token budget is reached
    for i, string in enumerate(strings):
        next_script = f'\n\nMovie script section:\n"""\n{string} [Name: {df[df["Text"] == string]["Name"].values[0]}, Year: {df[df["Text"] == string]["Year"].values[0]}, Page: {pages[i]}]\n"""'
        if num_tokens(message + next_script + question, df=df) + num_tokens(question, df=df) > token_budget:
            break
        else:
            message += next_script
    return message + question

def ask(query: str, df: pd.DataFrame = tarantino_df, model: str = GPT_MODEL, token_budget: int = 3596, print_message: bool = False) -> tuple[str, list[str]]:
    """
    Answer a query using GPT and a DataFrame of relevant texts and embeddings.
    """
    strings, _, pages = strings_ranked_by_relatedness(query, df)  # Unpack all three values
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    
    # Construct messages and get the response from GPT
    messages = [
        {"role": "system", "content": "You answer questions using the Tarantino movie scripts."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(model=model, messages=messages, temperature=0)
    response_message = response["choices"][0]["message"]["content"]
    
    # Construct the citations with the correct page numbers
    citations = [f"{string} [Name: {df[df['Text'] == string]['Name'].values[0]}, Year: {df[df['Text'] == string]['Year'].values[0]}, Page: {page}]" for string, page in zip(strings, pages)]
    
    return response_message, citations

## Relevant Questions
List of relevant questions that can be used for predefined queries in the app.

In [5]:
relevant_questions = [
    "Can you describe a memorable dialogue exchange between Vincent and Mia in \"Pulp Fiction\"?",
    "How is the character of Hans Landa depicted in \"Inglourious Basterds\"?",
    "What is the significance of the gold watch in \"Pulp Fiction\", and which character does it belong to?",
    "Can you find a dialogue in \"Django Unchained\" where the character Dr. King Schultz talks about bounty hunting?",
    "Which character in \"Jackie Brown\" works as a flight attendant and gets involved in smuggling money?",
    "In \"Once Upon a Time in Hollywood\", who plays the character of Sharon Tate, and how is she portrayed in the film?",
    "Can you find a scene in \"The Hateful Eight\" where the character Major Marquis Warren confronts General Sandy Smithers?",
    "What is the famous line said by Jules Winnfield in \"Pulp Fiction\" before he executes someone, and what does he claim it to be?",
    "What genre of movie is Django?"
]

## App GUI
Define the PyQt5 app GUI that allows users to select or type a question and display the answer from GPT.

In [6]:
class App(QWidget):
    def __init__(self):
        super().__init__()
        self.title = 'Chadbot'
        self.initUI()

    def initUI(self):
        layout = QVBoxLayout()

        # Styling
        self.setStyleSheet("""
            QWidget {
                background-color: black;
            }
            QLabel, QComboBox, QTextBrowser, QLineEdit {
                color: white;
            }
            QPushButton {
                background-color: red;
                color: white;
                border: none;
                padding: 5px 15px;
                border-radius: 5px;
            }
            QPushButton:hover {
                background-color: #ff3333;
            }
            QTextBrowser {
                background-color: #1a1a1a;
                border: 2px solid red;
                padding: 10px;
            }
            QLineEdit, QComboBox {
                background-color: #1a1a1a;
                border: 2px solid red;
                padding: 5px;
                border-radius: 5px;
            }
        """)

        # Predefined questions dropdown
        self.comboBox = QComboBox(self)
        for question in relevant_questions:
            self.comboBox.addItem(question)
        layout.addWidget(self.comboBox)

        # User input question
        self.label = QLabel("Or type your own question:")
        layout.addWidget(self.label)
        self.userInput = QLineEdit(self)
        layout.addWidget(self.userInput)

        self.button = QPushButton('Ask', self)
        self.button.clicked.connect(self.on_click)
        layout.addWidget(self.button)

        self.textBrowser = QTextBrowser(self)
        layout.addWidget(self.textBrowser)

        self.setLayout(layout)
        self.setWindowTitle(self.title)
        self.show()

    def on_click(self):
        # Check if user has typed a question, otherwise use the dropdown
        if self.userInput.text():
            question = self.userInput.text()
        else:
            question = self.comboBox.currentText()
        
        response, citations = ask(question)
        display_text = f"Question:\n{question}\n\nAnswer:\n{response}\n\nMost Relevant Citations:\n" + "\n".join(citations)
        self.textBrowser.setText(display_text)

if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = App()
    sys.exit(app.exec_())

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
