In [3]:
import numpy as np
import pandas as pd

from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

import dotenv
# Load variables from the nearest .env file. override=True lets values from the
# file replace variables that are already set in the process environment.
# NOTE(review): presumably this is where the OpenAI credentials come from - confirm.
dotenv_path = dotenv.find_dotenv()
dotenv.load_dotenv(dotenv_path, override=True)

# Chat model shared by the summarization chains and by the token counter.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)

In [4]:
# Prompt text for summarizing one chunk of a longer text.
# Written line by line so the leading and trailing newlines are explicit.
map_prompt = (
    "\n"
    "Write a short and concise summary of the following:\n"
    "Text: '{text}'\n"
    "CONCISE SUMMARY:\n"
)

# Template wrapper around the map prompt; "text" is its only placeholder.
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Prompt text that produces the final teaser-style movie summary, ending with a
# "Genre: ..., Age group: ..." line.
# Written line by line so the leading and trailing newlines are explicit.
combine_prompt = (
    "\n"
    "Write a very short and condensed movie summary that describes in a gripping manner\n"
    "the movie plot to get the reader to watch the movie. At the end of the summary\n"
    "mention movie genre and the appropriate age group in the format 'Genre: GENRE, Age group: AGE GROUP'.\n"
    "Text: '{text}'\n"
)

# Template wrapper around the combine prompt; "text" is its only placeholder.
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

# Chain for texts split into several chunks: the map prompt is applied to each
# chunk and the combine prompt merges the partial results.
summary_chain_multi = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=False,
)

# Chain for texts that fit into a single chunk: the combine prompt is applied
# directly to the text.
summary_chain_single = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=combine_prompt_template,
    verbose=False,
)

def _token_length(text):
    """Return how many tokens ``llm`` counts in ``text`` (``llm`` is looked up at call time)."""
    return llm.get_num_tokens(text)


# Chunk length is measured with the model's token counter rather than in characters.
# NOTE(review): 3496 presumably leaves headroom below the model's context window
# for the prompt and the reply - confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3496,
    length_function=_token_length,
)


In [38]:
import dask.dataframe as dd
import time

# Load the movie table and attach an empty "summary" column for later steps to fill.
# To work on a tail of the data only, slice the loaded frame, e.g. ``.iloc[4000:]``.
movie_df = pd.read_parquet("movies.parquet").assign(summary="")

def get_summary(df):
    """Replace each row's ``summary`` text with a generated summary.

    For every row, the current ``summary`` value is split into chunks with
    ``text_splitter``. Exactly one chunk is sent through
    ``summary_chain_single``; any other chunk count is sent through
    ``summary_chain_multi``. The generated text is written back into the same
    ``summary`` cell, so ``df`` is modified in place.

    If splitting or summarizing a row raises, the failure is printed together
    with the row's index label and that row keeps its original text; the
    remaining rows are still processed. After the loop the whole frame is
    written to a timestamped CSV file under ``./mySummaries`` (the directory is
    created when missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``summary`` column holding the text to summarize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Imported here so this function does not depend on another cell's imports.
    import os

    # Rows are addressed by position: with duplicated index labels a
    # label-based write would update every row sharing that label.
    for pos, (i, row) in enumerate(df.iterrows()):
        text = row["summary"]
        try:
            # Splitting and the chain calls are the steps that can actually
            # fail, so they are the ones guarded here.
            chunks = text_splitter.create_documents([text])
            if len(chunks) == 1:
                output = summary_chain_single.run(chunks)
            else:
                output = summary_chain_multi(chunks)
            # Calling a chain directly returns a dict of outputs; ``run``
            # returns the text itself.
            if isinstance(output, dict):
                output = output["output_text"]
        except Exception as exc:
            # Best effort: report the row and move on instead of losing the
            # summaries already produced for this frame.
            print(f"Something went wrong with index {i}: {exc!r}")
            continue
        df.iat[pos, df.columns.get_loc("summary")] = output

    out_dir = "./mySummaries"
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): the timestamp has one-second resolution, so two frames
    # finishing within the same second would write to the same file name.
    df.to_csv(f"{out_dir}/sum_{time.strftime('%Y%m%d-%H%M%S')}.csv")

    return df

def preprocess(df):
    """Fill the ``summary`` column with a text block describing each movie.

    For every row the values of the columns ``Plot``, ``Title``,
    ``Release Year``, ``Director``, ``Cast`` and ``genres`` are rendered as
    ``"<column>: <value>"`` lines and joined with newlines. The result is
    stored in ``summary``, so ``df`` is modified in place.

    The column is assigned in one step and by row position, so rows that share
    an index label each receive their own text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If any of the six source columns is missing.
    """
    merge_cols = ["Plot", "Title", "Release Year", "Director", "Cast", "genres"]
    # Plain tuples (index=False, name=None) keep the values in merge_cols
    # order, which lets them be paired with the column names via zip.
    df["summary"] = [
        "\n".join(f"{col}: {value}" for col, value in zip(merge_cols, values))
        for values in df[merge_cols].itertuples(index=False, name=None)
    ]
    return df
    
# Stage 1: build the merged movie text for every row, spread over 100
# partitions, and collect the result back into a single pandas frame.
movies_ddf = dd.from_pandas(movie_df, npartitions=100)
movies_prepared = movies_ddf.map_partitions(preprocess, meta=movies_ddf._meta).compute()

# Stage 2: re-partition the prepared frame into 5 parts and summarize each
# part; get_summary also writes one CSV file per part.
summaries_ddf = dd.from_pandas(movies_prepared, npartitions=5)
summaries_ddf = summaries_ddf.map_partitions(get_summary, meta=summaries_ddf._meta)

# After compute() ``ddf`` holds the collected result rather than a lazy dask frame.
ddf = summaries_ddf.compute()