# Elon Dialog Dataset

Adapted from Rostyslav Neskorozhenyi's great [Medium article](https://towardsdatascience.com/make-your-own-rick-sanchez-bot-with-transformers-and-dialogpt-fine-tuning-f85e6d1f4e30)

## Initial Model Configuration

## Prepare Dataset

In [None]:
import glob
import logging
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the `lex` transcript file. There is no header row, and the long run of
# spaces is the column separator (the python engine is needed for a
# multi-character separator).
lex = pd.read_csv(
    'data/lex.txt',
    sep="                                           ",
    engine="python",
    header=None,
)

In [None]:
# Preview the first rows of the freshly loaded `lex` frame.
lex.head()

In [None]:
# Label the columns so later cells can refer to them as `Speaker` and `Text`.
lex.columns = ['Speaker', 'Text']

In [None]:
# Remove any trailing ':' characters from the speaker labels.
lex['Speaker'] = lex['Speaker'].str.rstrip(':')

In [None]:
# Load the prompt/completion records, stored one JSON document per line, and
# expose them as an instruction/output frame.
import io

with open('data/joe_elon_qa.txt', 'r', encoding='utf-8') as f:
    # Keep every non-blank line. Blindly dropping the last element would lose
    # the final record whenever the file lacks a trailing newline, and a blank
    # line would otherwise produce invalid JSON once joined with commas.
    data = [line for line in f.read().split('\n') if line.strip()]

# Join the lines into a single JSON array. The text is wrapped in StringIO
# because passing a literal JSON string to read_json is deprecated.
df = pd.read_json(io.StringIO('[' + ','.join(data) + ']'))

# Rename the fields to the instruction/output naming used by later cells.
joe = pd.DataFrame({
    'instruction': df['prompt'],
    'output': df['completion'],
})
joe.head()

In [None]:
# Read the clubhouse conversation. Every line is split on a dd:dd:dd pattern
# (presumably a timestamp), and the file has no header row.
clubhouse = pd.read_csv(
    'data/elon_clubhouse.txt',
    sep=r"\d\d:\d\d:\d\d",
    engine="python",
    header=None,
)

In [None]:
# Preview the first rows of the raw clubhouse frame.
clubhouse.head()

In [None]:
# Split column 1 once, at the first speaker tag. The pattern matches only the
# tail of each tag, so the text before the match (e.g. "Elon ") lands in
# `Speaker` and everything after it lands in `Text`.
# `n` and `expand` are passed by keyword: pandas 2.0 made every argument of
# str.split except `pat` keyword-only, so a positional `1` raises TypeError.
clubhouse[['Speaker', 'Text']] = clubhouse[1].str.split(
    'Musk:|riram:|arc:|arthi:|lad:|even:', n=1, expand=True
)

In [None]:
def _restore_full_name(speaker):
    """Append "Musk" to the exact label "Elon "; return any other value unchanged."""
    if speaker == "Elon ":
        return speaker + "Musk"
    return speaker


# The split pattern 'Musk:' leaves "Elon " in the Speaker column; put the
# surname back on those rows.
clubhouse['Speaker'] = clubhouse['Speaker'].apply(_restore_full_name)

In [None]:
# Discard the first two columns in place, then show the first ten rows of
# what remains.
raw_columns = clubhouse.columns[[0, 1]]
clubhouse.drop(columns=raw_columns, inplace=True)
clubhouse.head(10)

In [None]:
# Stack the two transcripts into one frame and renumber the rows from 0, so
# later cells can address them by a single running index.
transcripts = [lex, clubhouse]
convo = pd.concat(transcripts, ignore_index=True)

In [None]:
# Inspect the last ten rows of the combined conversation frame.
convo.tail(10)

In [None]:
# Speaker label whose rows are selected as responses; it is compared against
# `convo.Speaker` when the context/response pairs are built.
CHARACTER_NAME = 'Elon Musk'

In [None]:
# Number of preceding utterances kept as context for each response.
n = 1

# Build one record per CHARACTER_NAME utterance: the response itself followed
# by the `n` rows immediately before it, most recent first.
contexted = []
character_rows = convo[convo.Speaker == CHARACTER_NAME].index
for i in character_rows:
    # Skip rows that do not have `n` earlier rows to use as context.
    if i < n:
        continue
    # Walk backwards from row i down to row i - n inclusive (n + 1 texts).
    # `convo.Text[j]` looks rows up by index label, which matches the row
    # position because `convo` was built with ignore_index=True.
    contexted.append([convo.Text[j] for j in range(i, i - n - 1, -1)])

# One column for the response, one for the nearest context row, and one
# 'context/<k>' column for each additional context row when n > 1.
columns = ['response', 'context'] + ['context/' + str(k) for k in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)

In [None]:
# Spot-check five randomly chosen rows of `df`.
df.sample(5)

In [None]:
def _context_to_example(row):
    """Build the instruction/input/output record for one context/response row."""
    return {"instruction": row['context'], "input": '', "output": row['response']}


# Turn every row into an example record, then store its string form.
# NOTE(review): str() of a dict is a Python repr (single-quoted), not JSON.
# Confirm that the consumer of these examples expects strings rather than
# objects.
examples = df.apply(_context_to_example, axis=1)
df['example'] = examples.apply(str)

# Collect the stringified examples into a plain list.
new_df = df['example'].to_list()

In [None]:
def _joe_to_example(row):
    """Build the instruction/input/output record for one `joe` row."""
    return {"instruction": row['instruction'], "input": '', "output": row['output']}


# Turn every row into an example record, then store its string form.
# NOTE(review): str() of a dict is a Python repr (single-quoted), not JSON.
# Confirm that the consumer of these examples expects strings rather than
# objects.
joe_examples = joe.apply(_joe_to_example, axis=1)
joe['example'] = joe_examples.apply(str)

# Collect the stringified examples into a plain list.
joe_df = joe['example'].to_list()

In [None]:
# Combine the two example lists (both are plain Python lists, not dataframes),
# keeping the context-derived examples first.
trn_df = [*new_df, *joe_df]


In [None]:
# Export the combined example list as a single JSON array.
import json

# NOTE(review): every element of `trn_df` was converted with str() in an
# earlier cell, so this writes an array of strings rather than an array of
# objects. Confirm that this is the format the consumer expects.
with open('data.json', 'w') as f:
    json.dump(trn_df, f)