## Clean up the transcripts

Use this only if many sentences in the transcript are cut off midway. The script can cause problems for transcripts that do not have many such breaks.

In [None]:
import pandas as pd
import re

In [None]:
# Load the transcript chunk that the clean-up functions below operate on.
df = pd.read_csv(filepath_or_buffer="output/cut_4.csv")

In [None]:
def split_into_sentences(text: str):
    """Split ``text`` at every ``.``, ``!`` or ``?`` and return the pieces.

    Each piece is stripped of surrounding whitespace, the punctuation that
    ended it is discarded, and pieces that are blank after stripping are
    left out of the returned list.
    """
    pieces = []
    for fragment in re.split("[.!?]", text):
        trimmed = fragment.strip()
        if trimmed != "":
            pieces.append(trimmed)
    return pieces


def correct_df(df: pd.DataFrame) -> pd.DataFrame:
    """Merge transcript rows whose text was split in the middle of a sentence.

    Adjacent rows are examined pairwise. The second row is appended to the
    first when the first row's text ends with a comma, or when the second
    row's text starts with a lowercase letter, the word "and", a ``$`` or a
    digit. A row whose whole text is the word "and" instead absorbs the row
    before it.

    Rows whose ``Text`` is missing, not a string, or blank are left untouched
    instead of raising, so a single empty cell cannot abort the clean-up.

    Args:
        df: Frame with ``Text`` and ``Speaker`` columns. It is modified in
            place, and its index is reset every time two rows are merged.

    Returns:
        The same ``df`` object after merging.
    """
    # Column positions stay fixed while rows are dropped, so look them up once.
    text_col = df.columns.get_loc("Text")
    speaker_col = df.columns.get_loc("Speaker")

    def _stripped(value) -> str:
        # NaN/None/non-string cells are treated as empty text.
        return value.strip() if isinstance(value, str) else ""

    i = 0
    while i < len(df) - 1:
        current_text = _stripped(df.iloc[i, text_col])
        next_text = _stripped(df.iloc[i + 1, text_col])

        # The checks below index into both texts; an empty side has nothing
        # to inspect, so skip the pair rather than raise IndexError.
        if not current_text or not next_text:
            i += 1
            continue

        next_starts_with_and = next_text.split()[0].lower() == "and"

        # The next row is only "And": fold the current row into it (the
        # combined row keeps the next row's speaker) and re-examine position i.
        if next_text.lower() == "and":
            df.iloc[i + 1, text_col] = current_text + " " + next_text
            df.drop(df.index[i], inplace=True)
            df.reset_index(drop=True, inplace=True)
            continue

        merge_condition = (
            current_text.endswith(",")
            or next_text[0].islower()
            or next_starts_with_and
            or next_text[0] == "$"
            or next_text[0].isdigit()
        )
        if not merge_condition:
            i += 1
            continue

        # Speakers are not compared before merging. When the next text has no
        # closing punctuation, or starts with "and", the merged row takes the
        # next row's speaker.
        if not re.search(r"[.!?]$", next_text) or next_starts_with_and:
            df.iloc[i, speaker_col] = df.iloc[i + 1, speaker_col]
        df.iloc[i, text_col] += " " + next_text
        df.drop(df.index[i + 1], inplace=True)
        df.reset_index(drop=True, inplace=True)
        # i is not advanced: the merged row is compared with its new neighbour.

    return df


def correct_speakers(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse consecutive rows that share a speaker into a single row.

    The text of each following row with the same ``Speaker`` is appended,
    separated by a space, to the first row of the run, and the following row
    is removed. Comparison starts at the very first row, and rows are
    addressed by position, so the frame does not need a 0..n-1 index on entry.

    Args:
        df: Frame with ``Text`` and ``Speaker`` columns. It is modified in
            place, and its index is reset every time two rows are merged.

    Returns:
        The same ``df`` object after merging.
    """
    text_col = df.columns.get_loc("Text")
    speaker_col = df.columns.get_loc("Speaker")

    # Start at 0 so the first two rows are compared as well.
    i = 0
    while i < len(df) - 1:
        if df.iloc[i, speaker_col] == df.iloc[i + 1, speaker_col]:
            df.iloc[i, text_col] += " " + df.iloc[i + 1, text_col]
            df.drop(df.index[i + 1], inplace=True)
            df.reset_index(drop=True, inplace=True)
            # Stay on i: the row now at i + 1 may belong to the same speaker.
        else:
            i += 1

    return df

In [None]:
# correct_df edits `df` in place and returns that same frame.
df_corrected_text = correct_df(df)
# Peek at the first rows; the result is not assigned or used further.
df_corrected_text.head()
# correct_speakers also edits its argument in place, so it is given a copy
# and df_corrected_text keeps the text-only corrections.
df_corrected_speakers = correct_speakers(df_corrected_text.copy())
df_corrected_speakers.to_csv(path_or_buf="part_4.csv", index=False)

## Prepare for finetuning

In [None]:
# Number of transcript rows immediately before a BUFFETT row that are joined
# to form the prompt for that row.
CONTEXT_ROWS = 5

df = pd.read_csv("output/full_transcripts.csv")
# Drop rows with missing values, then renumber the remaining rows from 0.
df = df.dropna()
df = df.reset_index(drop=True)

prompt_response = {"prompt": [], "completion": []}

texts = df["Text"].tolist()
for i, speaker in enumerate(df["Speaker"]):
    if speaker != "BUFFETT":
        continue
    # Clamp the window start at 0: a negative start would count from the end
    # of the transcript, giving the first rows an empty or wrapped-around
    # prompt instead of the rows that actually precede them.
    context = texts[max(i - CONTEXT_ROWS, 0) : i]
    prompt_response["prompt"].append(" ".join(context))
    prompt_response["completion"].append(texts[i])


# shift(-2) moves every pair up two rows, so the first two pairs are lost and
# the two emptied trailing rows are then removed by dropna().
# NOTE(review): presumably the first two examples are unwanted — confirm.
df_clean = pd.DataFrame(prompt_response).shift(-2)
df_clean = df_clean.dropna()
df_clean

In [None]:
# Save the prompt/completion pairs used for fine-tuning.
# NOTE(review): index=False is not passed here, unlike the part_4.csv export —
# confirm that an index column is wanted in train.csv.
df_clean.to_csv(path_or_buf="output/train.csv")