In [None]:
# Imports, then read the data from the CSV file into a pandas DataFrame
import pandas as pd
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
import constants
import os
from langchain_community.document_loaders.csv_loader import CSVLoader
import csv

# Location of the input CSV; the following cells expect it to provide a
# "Date" column.
CLEANED_MESSAGES_CSV = "../data/Mail/Important/cleaned_messages.csv"

# Load the whole file into the working DataFrame `df`.
df = pd.read_csv(CLEANED_MESSAGES_CSV)

In [None]:
# Parse the "Date" column into datetimes, then look up the ISO-8601 calendar
# once so that the week number and the year come from the same source.
df["Date"] = pd.to_datetime(df["Date"])
iso_calendar = df["Date"].dt.isocalendar()

# Add a new column to the dataframe named "Week"
# This column will contain the ISO week number for each date in the "Date" column
df["Week"] = iso_calendar.week

# Add a new column to the dataframe named "Year"
# This column will contain the ISO year that the week belongs to, NOT the
# calendar year. Pairing an ISO week with the calendar year mislabels dates
# around New Year: 2021-01-01 lies in ISO week 53 of 2020, so (2021, 53) would
# be sorted and grouped at the end of 2021 instead of next to the last days
# of December 2020.
df["Year"] = iso_calendar.year

df.head()

In [None]:
# Build a separate DataFrame that carries two extra columns derived from
# "Date": a readable "<weekday name>, YYYY-MM-DD" string and the numeric day
# of the week. `assign` returns a new frame, so `df` is not modified.
df_formatted = df.assign(
    FormattedDate=df["Date"].dt.strftime("%A, %Y-%m-%d"),
    DayOfWeek=df["Date"].dt.dayofweek,
)

# Order the rows by year first, then week, then the date itself
df_sorted = df_formatted.sort_values(by=["Year", "Week", "Date"])

df_sorted.head()

In [None]:
# Make the OpenAI API key from the local `constants` module available through
# the environment.
os.environ["OPENAI_API_KEY"] = constants.APIKEY

# Prompt text for the map step; the documents are substituted for
# {page_content}.
map_template = """Compose a detailed summary based on the provided list of business correspondence emails exchanged between me and my customers during the given week. Your mission is to give a report on everything that happened in the week, include what happened and who was involved.
Emails: {page_content}
Helpful Answer:"""

# Prompt text for the reduce step; the individual summaries are substituted
# for {doc_summaries}.
reduce_template = """Generate a comprehensive summary by significant events of the individual summaries created in the previous task. Ensure the summary remains concise, insightful, and suitable for a weekly review, with a target length of around 3000 characters.
Individual Summaries: {doc_summaries}
Helpful Answer:
"""

# Turn both texts into prompt templates
map_prompt = PromptTemplate.from_template(map_template)
reduce_prompt = PromptTemplate.from_template(reduce_template)

# A single chat model (temperature 0) is shared by both chains
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Build the map chain and the reduce chain; nothing is executed here
map_chain = LLMChain(prompt=map_prompt, llm=llm)
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

In [None]:
# Create the directory that will receive the per-week CSV files
dir_name = "Emails"
os.makedirs(dir_name, exist_ok=True)

# Load the entire DataFrame
emails = df_sorted

# Save the DataFrame to a CSV file. The encoding is spelled out so that the
# writer and the loader below are guaranteed to agree.
emails.to_csv("emails.csv", index=False, encoding="utf-8")

# Read the file back as UTF-8. Without an explicit encoding the loader falls
# back to the platform default, which is not UTF-8 on every system and would
# then fail on (or garble) non-ASCII characters in the mail text.
loader = CSVLoader("emails.csv", encoding="utf-8")
docs = loader.load()

# Print confirmation of load
print(f"Loaded {len(docs)} documents")

# Splitter applied to the loaded documents before they reach the map step:
# chunks of 2000 characters (measured with `len`) that overlap by 100, using
# plain-string rather than regex separators.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=100,
    chunk_size=2000,
)

# Joins a list of documents into one string and hands it to `reduce_chain`
# under the prompt variable "doc_summaries"
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="doc_summaries",
    llm_chain=reduce_chain,
)

# Combines the mapped documents, reducing them iteratively
reduce_documents_chain = ReduceDocumentsChain(
    # The maximum number of tokens to group documents into
    token_max=4000,
    # Used when the documents exceed the context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The chain that is called last
    combine_documents_chain=combine_documents_chain,
)

# Maps `map_chain` over the documents and then combines the results
map_reduce_chain = MapReduceDocumentsChain(
    # Also return the results of the individual map steps
    return_intermediate_steps=True,
    # Name of the variable in `map_chain`'s prompt that receives the documents
    document_variable_name="page_content",
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # Map chain
    llm_chain=map_chain,
)

# Group the dataframe by Year and Week
grouped = df_sorted.groupby(["Year", "Week"])

# Column layout of the results table that is written to all_results.csv
result_columns = ["Year", "Week", "Output Text", "Intermediate Steps"]

# One dict per processed week. Rows are collected in a plain list and the
# DataFrame is rebuilt from it, instead of concatenating single-row frames
# onto an initially empty DataFrame inside the loop.
result_rows = []

# Stays an empty DataFrame when there are no groups to process
all_results = pd.DataFrame()

# Process each group with the map reduce chain
for (year, week), group in grouped:
    filename = os.path.join(dir_name, f"Year_{year}_Week_{week}.csv")

    # Save the group to a CSV file; the encoding is explicit so that it
    # matches the loader below.
    group.to_csv(filename, index=False, quoting=csv.QUOTE_ALL, encoding="utf-8")

    # Read the file back as UTF-8. Without an explicit encoding the loader
    # uses the platform default, which is not UTF-8 on every system and would
    # then fail on (or garble) non-ASCII characters in the mail text.
    loader = CSVLoader(filename, encoding="utf-8")
    docs = loader.load()

    # Split the documents
    split_docs = text_splitter.split_documents(docs)

    # Apply the map reduce chain
    result = map_reduce_chain(split_docs)
    output_text = result.get("output_text", None)
    intermediate_steps = result.get("intermediate_steps", None)

    # Remove line breaks so every result occupies a single CSV line. A
    # missing summary is kept as-is and ends up as an empty cell.
    if isinstance(output_text, str):
        flat_output_text = output_text.replace("\n", " ")
    else:
        flat_output_text = output_text

    # The intermediate steps are stored as the string form of the whole list
    flat_intermediate_steps = str(intermediate_steps).replace("\n", " ")

    result_rows.append(
        {
            "Year": year,
            "Week": week,
            "Output Text": flat_output_text,
            "Intermediate Steps": flat_intermediate_steps,
        }
    )

    # Rebuild the table and save it after every week, so the results gathered
    # so far are on disk even if a later week fails.
    all_results = pd.DataFrame(result_rows, columns=result_columns)
    all_results.to_csv("all_results.csv", index=False, quoting=csv.QUOTE_ALL)

    if output_text:
        print(f"Output Text for Year {year} Week {week}:")
        print(output_text)

    if intermediate_steps:
        print(f"Intermediate Steps for Year {year} Week {week}:")
        for step in intermediate_steps:
            print(step)

In [None]:
# Read the per-week results back from the CSV file
df = pd.read_csv("all_results.csv")

dir_name = "Summaries"

# Create a directory for all files (no error if it already exists)
os.makedirs(dir_name, exist_ok=True)

# Write one Markdown file per row, i.e. per (Year, Week)
for _, row in df.iterrows():
    year = row["Year"]
    week = row["Week"]
    output_text = row["Output Text"]  # the summary text for that week

    # Create a directory for each year
    year_dir = os.path.join(dir_name, f"{year}")
    os.makedirs(year_dir, exist_ok=True)

    # Define the filename with the year directory
    filename = os.path.join(year_dir, f"Week_{week}.md")

    # read_csv turns an empty "Output Text" cell into NaN, a float that is
    # truthy; a plain truthiness test would let it through and file.write()
    # would raise TypeError. Only a non-empty string counts as a summary.
    has_summary = isinstance(output_text, str) and bool(output_text)

    # UTF-8 is used explicitly so non-ASCII characters in a summary can be
    # written regardless of the platform's default encoding. The file is
    # created even when there is no summary, in which case it stays empty.
    with open(filename, "w", encoding="utf-8") as file:
        if has_summary:
            file.write(f"# Email Summary Week {week}:\n\n")
            file.write(output_text)
            file.write("\n\n")