# PersonaChat by Amazon data collection

In [None]:
!pip install pytorch-pretrained-bert

import json
from pytorch_pretrained_bert import cached_path

url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

In [None]:
# Collect one utterance history per dialogue from BOTH splits.
# For each dialogue only the 'history' of its last 'utterances' entry is read
# (presumably the longest history of that dialogue - TODO confirm), and the
# silence placeholder is filtered out.
stop_sentence = '__ SILENCE __'

all_dialogues = []
for split in ('train', 'valid'):
    # Index into the split being iterated; reading dataset['train'] while
    # looping over 'valid' would duplicate training dialogues and skip
    # the validation data entirely.
    for dialogue in dataset[split]:
        history = dialogue['utterances'][-1]['history']
        all_dialogues.append([sent for sent in history if sent != stop_sentence])
    
len(all_dialogues)

18878

In [None]:
all_dialogues

[["hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
  'you must be very fast . hunting is one of my favorite hobbies .',
  'i am ! for my hobby i like to do canning or some whittling .',
  'i also remodel homes when i am not out bow hunting .',
  "that's neat . when i was in high school i placed 6th in 100m dash !",
  "that's awesome . do you have a favorite season or time of year ?",
  'i do not . but i do have a favorite meat since that is all i eat exclusively .',
  'what is your favorite meat to eat ?',
  'i would have to say its prime rib . do you have any favorite foods ?',
  'i like chicken or macaroni and cheese .',
  'do you have anything planned for today ? i think i am going to do some canning .',
  'i am going to watch football . what are you canning ?',
  'i think i will can some jam . do you also play footfall for fun ?'],
 ['hi , how are you doing today ?',
  'i am spending time with my 4 sisters what are you up to',
  'wow , fou

In [None]:
# Flatten every dialogue into consecutive [utterance, reply] pairs.
_all_pairs = []
for dialogue in all_dialogues:
    for prompt, reply in zip(dialogue, dialogue[1:]):
        _all_pairs.append([prompt, reply])

In [None]:
_all_pairs

[["hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
  'you must be very fast . hunting is one of my favorite hobbies .'],
 ['you must be very fast . hunting is one of my favorite hobbies .',
  'i am ! for my hobby i like to do canning or some whittling .'],
 ['i am ! for my hobby i like to do canning or some whittling .',
  'i also remodel homes when i am not out bow hunting .'],
 ['i also remodel homes when i am not out bow hunting .',
  "that's neat . when i was in high school i placed 6th in 100m dash !"],
 ["that's neat . when i was in high school i placed 6th in 100m dash !",
  "that's awesome . do you have a favorite season or time of year ?"],
 ["that's awesome . do you have a favorite season or time of year ?",
  'i do not . but i do have a favorite meat since that is all i eat exclusively .'],
 ['i do not . but i do have a favorite meat since that is all i eat exclusively .',
  'what is your favorite meat to eat ?'],
 ['what is your fa

In [None]:
import pickle

with open('personaChat_pairs.txt', 'wb') as fp:
    pickle.dump(_all_pairs, fp)

### http://convai.io/data/?ref=hackernoon.com Data collection

In [None]:
!wget http://convai.io/data/data_tolokers.json

In [None]:
!wget http://convai.io/data/data_intermediate.json

In [None]:
!wget http://convai.io/data/data_volunteers.json

In [None]:
import json

with open('/content/data_volunteers.json') as f:
  data = json.load(f)

# Print the parsed JSON: a list of dialogue records, each with a 'dialog' list of messages
print(data)

[{'dialog': [{'id': 0, 'sender': 'participant1', 'text': 'hi there', 'evaluation_score': None, 'sender_class': 'Human'}], 'start_time': '2018-10-29 03:32:08.296000', 'end_time': '2018-10-29 03:32:08.296000', 'bot_profile': ['i like to talk but people have a hard time understanding.', 'i like to look at blocks and sing about letters.', 'i like to eat chocolate candy.', 'when i grow up i want to be a dog.'], 'user_profile': ['i am a clean eater.', 'my parents were both very athletic.', 'i love running and preparing for marathons.', 'i am a cancer survivor.'], 'eval_score': None, 'profile_match': '', 'participant1_id': {'class': 'User', 'user_id': 'User 00172'}, 'participant2_id': {'class': 'Bot', 'user_id': 'Bot 004'}}, {'dialog': [{'id': 0, 'sender': 'participant1', 'text': 'Hello!', 'evaluation_score': None, 'sender_class': 'Human'}, {'id': 1, 'sender': 'participant2', 'text': 'Hi! How are you?', 'evaluation_score': None, 'sender_class': 'Bot'}, {'id': 2, 'sender': 'participant1', 'tex

In [None]:
### Noise strings seen in the raw dialogues (filtered out below): "/start", "/test", "Text is not given. Please try to type /end and /test to reset the state and get text."

In [None]:
# Keep only the message texts: one list of strings per dialogue record.
all_diags = [
    [message['text'] for message in record['dialog']]
    for record in data
]

def del_string_with_word(all_diags, word):
    """Remove every sentence that contains ``word`` from every dialogue.

    Args:
        all_diags: list of dialogues, each a list of sentence strings.
        word: substring to look for (case-sensitive ``in`` test).

    Returns:
        A new list with one (possibly empty) list per input dialogue, in the
        original order, holding the sentences that were not flagged. The
        input lists are not modified.
    """
    # Pass 1: gather the distinct sentences that mention the word.
    flagged = {
        sentence
        for dialogue in all_diags
        for sentence in dialogue
        if word in sentence
    }
    # Pass 2: rebuild each dialogue without the flagged sentences.
    return [
        [sentence for sentence in dialogue if sentence not in flagged]
        for dialogue in all_diags
    ]

def empty_strings(docs):
    """Return the documents that contain at most one item.

    Args:
        docs: iterable of sized objects (e.g. lists of sentences).

    Returns:
        A list, in input order, of every ``doc`` with ``len(doc) <= 1``.
    """
    return [doc for doc in docs if len(doc) <= 1]

def clean_corpus(all_diags):
    """Strip known noise sentences and drop dialogues that become too short.

    Every sentence containing one of the marker substrings below (bot
    commands and fragments of a Python traceback) is removed via
    ``del_string_with_word``; afterwards only dialogues with more than one
    sentence are kept.

    Args:
        all_diags: list of dialogues, each a list of sentence strings.

    Returns:
        A new list of cleaned dialogues, each with at least two sentences.
    """
    noise_markers = (
        "/start",
        "/Start",
        "/test",
        "/Test",
        'Traceback (most recent call last):',
        'interactive(opt)',
        'return self.agents[0].num_examples()',
        'if self.max_exs > 0 or self.num_examples():',
        'self.update_counters()',
        'world.parley()',
        '/finish',
    )
    for marker in noise_markers:
        all_diags = del_string_with_word(all_diags, marker)

    # A dialogue with 0 or 1 sentences cannot form a pair. A direct length
    # test selects the same dialogues as the former
    # `doc not in empty_strings(all_diags)` check, without rebuilding that
    # list once per dialogue.
    return [doc for doc in all_diags if len(doc) > 1]

all_dialogues = clean_corpus(all_diags)

def check(all_dialogues, word):
    """Print each dialogue that has at least one sentence containing ``word``.

    Args:
        all_dialogues: iterable of dialogues, each an iterable of strings.
        word: substring to search for (case-sensitive).

    Returns:
        None. Matching dialogues are written to stdout, one per line.
    """
    for dialogue in all_dialogues:
        for sentence in dialogue:
            if word in sentence:
                print(dialogue)
                break  # one hit is enough; print each dialogue only once

_ = check(all_dialogues, "/test")
_ = check(all_dialogues, "/Test")
_ = check(all_dialogues, "/start")
_ = check(all_dialogues, "/Start")

In [None]:
all_dialogues

[['Hello!',
  'Hi! How are you?',
  'Not bad! And You?',
  "I'm doing well. Just got engaged to my high school sweetheart.",
  'Wowowowow! Congratulations! Is she pretty?',
  "She 's pretty cute. She invited me to dinner tonight. 🙂",
  'Cool! Have a good time you both! And what is your hobby?',
  'I love music! I love Taylor swift. 😉',
  'Me too. And what about Iggy Pop?',
  'I love Ziggy! He is my favorite. Are you and your wife millennial too?',
  "I have no wife. And I'm not millenial, I'm X generation.",
  'Hey? Where are you?',
  'I am sorry to hear that. What do you do for fun?',
  "I'm playing pipe organ.",
  'That sounds impressive. I like to go out to eat with my friends.',
  'Cool! See ya!'],
 ['Hi!', 'Hello!'],
 ['Hi!', 'Hello!'],
 ['Hi!',
  'Hello, how are you today? ',
  "Cool! I'm going to finish with my homework and watch some Disney cartoons. And you",
  'I am doing great. I just got home from work.',
  'Where are you working?',
  'Bro?',
  'Where are you from? ',
  "I'

In [None]:
### ['Traceback (most recent call last):', 'Hi!'],
#  ['Hi!', 'interactive(opt)'],
#  ['interactive(opt)', 'What'],
#  ['What', 'world.parley()'],
#  ['world.parley()', 'What'],
#  ['What', 'self.update_counters()'],
#  ['self.update_counters()', 'And?'],
#  ['And?', 'if self.max_exs > 0 or self.num_examples():'],
#  ['if self.max_exs > 0 or self.num_examples():', 'And?'],
#  ['And?', 'return self.agents[0].num_examples()'],
#  ['return self.agents[0].num_examples()', 'Ok'],

In [None]:
# Flatten every cleaned dialogue into consecutive [utterance, reply] pairs.
_all_pairs = []
for dialogue in all_dialogues:
    for prompt, reply in zip(dialogue, dialogue[1:]):
        _all_pairs.append([prompt, reply])

In [None]:
def remove_dots(s):
    """Delete every ``...`` and then every ``,,,`` from ``s``.

    Runs are matched left to right in non-overlapping groups of three, so
    ``"...."`` becomes ``"."``.

    Plain ``str.replace`` is used rather than ``re.sub``: the patterns are
    fixed literals, and this keeps the function usable in cells that run
    before ``re`` is imported.

    Args:
        s: input string.

    Returns:
        The string with the triple-dot and triple-comma groups removed.
    """
    return s.replace("...", "").replace(",,,", "")

In [None]:
_all_pairs = [[remove_dots(s) for s in pair] for pair in _all_pairs]
len(_all_pairs)

13508

In [None]:
import pickle

with open('data_volunteers.txt', 'wb') as fp:
    pickle.dump(_all_pairs, fp)

# ConvoKit data collection

In [None]:
!pip install convokit

Collecting convokit
[?25l  Downloading https://files.pythonhosted.org/packages/65/40/c4f9e73856b50487ef8887f9b1356970fc3cafb47c7b0308e58c500de29b/convokit-2.4.5.tar.gz (144kB)
[K     |██▎                             | 10kB 15.3MB/s eta 0:00:01[K     |████▌                           | 20kB 21.9MB/s eta 0:00:01[K     |██████▉                         | 30kB 15.4MB/s eta 0:00:01[K     |█████████                       | 40kB 11.1MB/s eta 0:00:01[K     |███████████▎                    | 51kB 8.6MB/s eta 0:00:01[K     |█████████████▋                  | 61kB 8.7MB/s eta 0:00:01[K     |███████████████▉                | 71kB 8.8MB/s eta 0:00:01[K     |██████████████████▏             | 81kB 8.9MB/s eta 0:00:01[K     |████████████████████▍           | 92kB 8.6MB/s eta 0:00:01[K     |██████████████████████▋         | 102kB 9.1MB/s eta 0:00:01[K     |█████████████████████████       | 112kB 9.1MB/s eta 0:00:01[K     |███████████████████████████▏    | 122kB 9.1MB/s eta 0:00:01

In [None]:
from convokit import Corpus, download
# corpus = Corpus(filename=download("friends-corpus"))#good
# corpus = Corpus(filename=download("persuasionforgood-corpus"))
# corpus = Corpus(filename=download("gap-corpus"))
# corpus = Corpus(filename=download("parliament-corpus")) #good
# corpus = Corpus(filename=download("winning-args-corpus")) #might be good
corpus = Corpus(filename=download("persuasionforgood-corpus")) 
corpus.print_summary_stats()

Downloading persuasionforgood-corpus to /root/.convokit/downloads/persuasionforgood-corpus
Downloading persuasionforgood-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/persuasionforgood-corpus/persuasionforgood.zip (2.7MB)... Done
Number of Speakers: 1285
Number of Utterances: 20932
Number of Conversations: 1017


In [None]:
convo = corpus.random_conversation()
convo.print_conversation_structure(lambda utt: utt.text[:80])

Hi! Are you interested in making a donation to Save the Children?
    Hi, Yes I am
        That's great to hear! Donations make a huge difference in the lives of these kid
            Yes, I agree. Helping children and pets is one of the donations I enjoy making a
                Do you have an amount in mind that you'd like to donate?
                    I was thinking to donate around 10 cents for save the children. I hope that is a
                        Any amount we can collect for them is welcome. Would it convince you to donate m
                            ok how about 20 cents. I do want to help in anyway that I can.
                                Thank you so much. It's really appreciated. The amount of suffering that childre
                                    yeah I heard a little bit about on how children are suffering. I want to give a 
                                        It will make a big difference to Save the Children. The organization has done wo
              

In [None]:
# Texts of the sampled conversation's utterances, in iteration order.
sentences = [utt.text for utt in convo.iter_utterances()]

sentences

['Hi! Are you interested in making a donation to Save the Children?',
 'Hi, Yes I am',
 "That's great to hear! Donations make a huge difference in the lives of these kids. Your help will give them access to food, healthcare, education, and safety, among other things.",
 'Yes, I agree. Helping children and pets is one of the donations I enjoy making and helping at the same time.',
 "Do you have an amount in mind that you'd like to donate?",
 'I was thinking to donate around 10 cents for save the children. I hope that is adequate and helps them for food and shelter.',
 'Any amount we can collect for them is welcome. Would it convince you to donate more if I can assure you that the donation is absolutely secure and will be completely given to a trustworthy, verified fund within Save the Children?',
 'ok how about 20 cents. I do want to help in anyway that I can.',
 "Thank you so much. It's really appreciated. The amount of suffering that children around the world face is terrible. For exa

In [None]:
# One list of utterance texts per conversation in the loaded corpus.
all_dialogues = [
    [utt.text for utt in conv.iter_utterances()]
    for conv in corpus.iter_conversations()
]

In [None]:
all_dialogues[0]

['Good morning. How are you doing today?',
 'Hi. I am doing good. How about you?',
 "I'm doing pretty good for a Tuesday morning. ",
 'Haha. Same here, but it really feels like a Monday.',
 'Ugh yes it does!',
 'I can not believe how warm it is already.',
 'Where are you from? ',
 'I am from the Midwest. What about you?',
 "I'm from the South East. It's always warm here. ",
 'Oh, yep. You are definitely in for warm weather, which is great as far as I am concerned.',
 "We're about to get hit by a tropical storm.",
 'I heard that some bad weather was going to be coming. I hope it is not too severe.',
 "Me too. It's just part of living on the Gulf. You have to be prepared for it.",
 'Yes, I am sure you get a lot of storms.',
 'We do. I guess I should get into what this chat is supposed to be about. Have you heard of the Charity Save The Children?',
 'I have heard about them. What do you like about them?',
 "I like that they're committed to helping children in need. They're very transparen

In [None]:
# Flatten every conversation into consecutive [utterance, reply] pairs.
_all_pairs = []
for dialogue in all_dialogues:
    for prompt, reply in zip(dialogue, dialogue[1:]):
        _all_pairs.append([prompt, reply])

In [None]:
_all_pairs

[['Good morning. How are you doing today?',
  'Hi. I am doing good. How about you?'],
 ['Hi. I am doing good. How about you?',
  "I'm doing pretty good for a Tuesday morning. "],
 ["I'm doing pretty good for a Tuesday morning. ",
  'Haha. Same here, but it really feels like a Monday.'],
 ['Haha. Same here, but it really feels like a Monday.', 'Ugh yes it does!'],
 ['Ugh yes it does!', 'I can not believe how warm it is already.'],
 ['I can not believe how warm it is already.', 'Where are you from? '],
 ['Where are you from? ', 'I am from the Midwest. What about you?'],
 ['I am from the Midwest. What about you?',
  "I'm from the South East. It's always warm here. "],
 ["I'm from the South East. It's always warm here. ",
  'Oh, yep. You are definitely in for warm weather, which is great as far as I am concerned.'],
 ['Oh, yep. You are definitely in for warm weather, which is great as far as I am concerned.',
  "We're about to get hit by a tropical storm."],
 ["We're about to get hit by a 

In [None]:
import re
_all_pairs = [[remove_dots(s) for s in pair] for pair in _all_pairs]
_all_pairs

[['Good morning. How are you doing today?',
  'Hi. I am doing good. How about you?'],
 ['Hi. I am doing good. How about you?',
  "I'm doing pretty good for a Tuesday morning. "],
 ["I'm doing pretty good for a Tuesday morning. ",
  'Haha. Same here, but it really feels like a Monday.'],
 ['Haha. Same here, but it really feels like a Monday.', 'Ugh yes it does!'],
 ['Ugh yes it does!', 'I can not believe how warm it is already.'],
 ['I can not believe how warm it is already.', 'Where are you from? '],
 ['Where are you from? ', 'I am from the Midwest. What about you?'],
 ['I am from the Midwest. What about you?',
  "I'm from the South East. It's always warm here. "],
 ["I'm from the South East. It's always warm here. ",
  'Oh, yep. You are definitely in for warm weather, which is great as far as I am concerned.'],
 ['Oh, yep. You are definitely in for warm weather, which is great as far as I am concerned.',
  "We're about to get hit by a tropical storm."],
 ["We're about to get hit by a 

In [None]:
# import pickle

# with open('persuasionforgood_corpus.txt', 'wb') as fp:
#     pickle.dump(_all_pairs, fp)

# TODO: Split the utterances in persuasionforgood_corpus.txt into shorter sentences

In [None]:
# Shorten each pair:
#   * prompt -> the text before its first '?'
#   * reply  -> its first sentence, keeping the '.' only when the reply really
#     contains one. Appending '.' unconditionally turned replies such as
#     'Ugh yes it does!' into 'Ugh yes it does!.'.
persuasionforgood_pairs_mod1 = [
    [diag[0].split('?')[0], ''.join(diag[1].partition('.')[:2])]
    for diag in _all_pairs
]
persuasionforgood_pairs_mod1

[['Good morning. How are you doing today', 'Hi.'],
 ['Hi. I am doing good. How about you',
  "I'm doing pretty good for a Tuesday morning."],
 ["I'm doing pretty good for a Tuesday morning. ", 'Haha.'],
 ['Haha. Same here, but it really feels like a Monday.', 'Ugh yes it does!.'],
 ['Ugh yes it does!', 'I can not believe how warm it is already.'],
 ['I can not believe how warm it is already.', 'Where are you from? .'],
 ['Where are you from', 'I am from the Midwest.'],
 ['I am from the Midwest. What about you', "I'm from the South East."],
 ["I'm from the South East. It's always warm here. ", 'Oh, yep.'],
 ['Oh, yep. You are definitely in for warm weather, which is great as far as I am concerned.',
  "We're about to get hit by a tropical storm."],
 ["We're about to get hit by a tropical storm.",
  'I heard that some bad weather was going to be coming.'],
 ['I heard that some bad weather was going to be coming. I hope it is not too severe.',
  'Me too.'],
 ["Me too. It's just part of li

In [None]:
persuasionforgood_pairs_mod2 = [[diag[0].split('.')[0], diag[1].split('.')[0]] for diag in _all_pairs]
persuasionforgood_pairs_mod2

[['Good morning', 'Hi'],
 ['Hi', "I'm doing pretty good for a Tuesday morning"],
 ["I'm doing pretty good for a Tuesday morning", 'Haha'],
 ['Haha', 'Ugh yes it does!'],
 ['Ugh yes it does!', 'I can not believe how warm it is already'],
 ['I can not believe how warm it is already', 'Where are you from? '],
 ['Where are you from? ', 'I am from the Midwest'],
 ['I am from the Midwest', "I'm from the South East"],
 ["I'm from the South East", 'Oh, yep'],
 ['Oh, yep', "We're about to get hit by a tropical storm"],
 ["We're about to get hit by a tropical storm",
  'I heard that some bad weather was going to be coming'],
 ['I heard that some bad weather was going to be coming', 'Me too'],
 ['Me too', 'Yes, I am sure you get a lot of storms'],
 ['Yes, I am sure you get a lot of storms', 'We do'],
 ['We do', 'I have heard about them'],
 ['I have heard about them',
  "I like that they're committed to helping children in need"],
 ["I like that they're committed to helping children in need",
  'Y

In [None]:
import pickle

with open('persuasionforgood_pairs_mod1.txt', 'wb') as fp:
    pickle.dump(persuasionforgood_pairs_mod1, fp)

with open('persuasionforgood_pairs_mod2.txt', 'wb') as fp:
    pickle.dump(persuasionforgood_pairs_mod2, fp)

# Friends pairs extraction

In [None]:
def get_indices(sentences):
    """Return the boundary positions used to cut ``sentences`` into dialogues.

    The boundaries are the index of every empty-string entry, followed by
    the index of the last element.

    The last index is computed from the length. Looking it up with
    ``sentences.index(sentences[-1])`` returns the FIRST occurrence of that
    text, which is wrong whenever the final sentence also appears earlier
    (e.g. a repeated "Yeah.").

    Args:
        sentences: list of sentence strings.

    Returns:
        List of integer positions; empty when ``sentences`` is empty.
    """
    if not sentences:
        return []

    idxs = [idx for idx, text in enumerate(sentences) if text == '']
    idxs.append(len(sentences) - 1)
    return idxs

def get_dialogues(sentences):
    """Cut ``sentences`` into dialogues at the positions from ``get_indices``.

    When ``get_indices`` yields a single position, the whole input is
    returned as one dialogue. Otherwise each dialogue holds the items lying
    strictly between two consecutive positions.

    NOTE(review): the items at the boundary positions themselves are never
    included, and nothing before the first position is returned - confirm
    that dropping them is intended.

    Args:
        sentences: list of sentence strings.

    Returns:
        List of dialogues, each a list of sentences.
    """
    boundaries = get_indices(sentences)
    if len(boundaries) == 1:
        return [list(sentences)]
    return [
        list(sentences[start + 1:stop])
        for start, stop in zip(boundaries, boundaries[1:])
    ]

In [None]:
# Load the Friends corpus explicitly. Without this the loop below reuses
# whatever `corpus` an earlier cell left behind (persuasionforgood on a
# fresh top-to-bottom run), and that data would be saved as friends_corpus.txt.
corpus = Corpus(filename=download("friends-corpus"))

all_dialogues = []   # per conversation: list of dialogue segments
all_sentences = []   # per conversation: flat list of utterance texts
for conv in corpus.iter_conversations():
    sentences = [utt.text for utt in conv.iter_utterances()]
    all_sentences.append(sentences)
    all_dialogues.append(get_dialogues(sentences))

In [None]:
len(all_sentences)

3107

In [None]:
_all_sent = []
for doc in all_sentences:
    _all_sent += doc

len(_all_sent)

67373

In [None]:
# Pair consecutive sentences of each conversation's first dialogue segment.
# NOTE(review): only segment 0 of every conversation is used here; any later
# segments produced by get_dialogues are ignored - confirm this is intended.
_all_pairs = []
for segments in all_dialogues:
    opening = segments[0]
    _all_pairs.extend([first, second] for first, second in zip(opening, opening[1:]))

In [None]:
_all_pairs

[["Just, 'cause, I don't want her to go through what I went through with Carl- oh!",
  "Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex."],
 ["Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.",
  'Sounds like a date to me.'],
 ['If I let go of my hair, my head will fall off.',
  'Ooh, she should not be wearing those pants.'],
 ['Ooh, she should not be wearing those pants.',
  'I say push her down the stairs.'],
 ['I say push her down the stairs.',
  'Push her down the stairs! Push her down the stairs! Push her down the stairs!'],
 ["I'm thinking we've got a bookcase here.", "It's a beautiful thing."],
 ["It's a beautiful thing.", "What's this?"],
 ["What's this?", "I would have to say that is an 'L'-shaped bracket."],
 ["I would have to say that is an 'L'-shaped bracket.", 'Which goes where?'],
 ['Which goes where?', 'I have no idea.'],
 ['Oh my God!',
  "I know, I 

In [None]:
len(_all_pairs)

37383

In [None]:
_all_pairs = [[remove_dots(s) for s in pair] for pair in _all_pairs]
len(_all_pairs)

37383

In [None]:
import pickle

with open('friends_corpus.txt', 'wb') as fp:
    pickle.dump(_all_pairs, fp)

# Tennis pairs extraction

In [None]:
from convokit import Corpus, download
# corpus = Corpus(filename=download("friends-corpus"))#good
# corpus = Corpus(filename=download("persuasionforgood-corpus"))
# corpus = Corpus(filename=download("gap-corpus"))
# corpus = Corpus(filename=download("parliament-corpus")) #good
# corpus = Corpus(filename=download("winning-args-corpus")) #might be good
corpus = Corpus(filename=download("tennis-corpus")) 
corpus.print_summary_stats()

# One list of utterance texts per conversation in the tennis corpus.
tennis_pairs = [
    [utt.text for utt in conv.iter_utterances()]
    for conv in corpus.iter_conversations()
]

Dataset already exists at /root/.convokit/downloads/tennis-corpus
Number of Speakers: 359
Number of Utterances: 163948
Number of Conversations: 81974


In [None]:
len(tennis_pairs)

81974

In [None]:
def remove_dash(s):
    """Delete every ``--`` (double hyphen) from ``s``.

    Matches are taken left to right without overlap, so ``"---"`` becomes
    ``"-"``. A literal ``str.replace`` is enough here and avoids the
    invalid ``"\\-"`` escape of the former non-raw regex pattern.

    Args:
        s: input string.

    Returns:
        The string with all double hyphens removed; surrounding whitespace
        is left untouched.
    """
    return s.replace("--", "")

In [None]:
tennis_pairs = [[remove_dots(s) for s in pair] for pair in tennis_pairs]
tennis_pairs = [[remove_dash(s) for s in pair] for pair in tennis_pairs]

len(tennis_pairs)

81974

In [None]:
tennis_pairs

[['I think this is your biggest success right now, first Grand Slam, third round.',
  'Yeah.'],
 ['How would you describe it? Is it fantastic for you?',
  "Yeah, I'm pretty happy, but it was  I wasn't playing well today, but he retired, and I was just lucky, guess."],
 ['Do you know why he has retired?', "No, no. I didn't know, no."],
 ['Could you tell us a little bit yourself? You practiced at the Bollettieri Academy. Why did you choose that academy?',
  'Um, because there is more players, top players. Many pros are practicing there, and many coach.'],
 ['But you could choose, also, another one. Why did you choose Bollettieri?',
  'Oh, because the one, the guy helps me, his name is Mr. Morita. He send me there, yeah.'],
 ['How old have you been when you went',
  'When I was 13, so almost five years now.'],
 ['Okay. And how is the contact with Mr. Bollettieri at the moment, for example?',
  'What do you mean, contract?'],
 ['Contact with him. I mean, do you speak with him after such a 

In [None]:
# Shorten each pair to its first question / first sentence.
# str.partition keeps the delimiter only when it is actually present, so a
# statement is no longer given a fake '?' ('third round.?') and a reply that
# ends in '?' is no longer given a fake '.' ('contract?.').
tennis_pairs_mod1 = [
    [''.join(diag[0].partition('?')[:2]), ''.join(diag[1].partition('.')[:2])]
    for diag in tennis_pairs
]
tennis_pairs_mod1

[['I think this is your biggest success right now, first Grand Slam, third round.?',
  'Yeah.'],
 ['How would you describe it?',
  "Yeah, I'm pretty happy, but it was  I wasn't playing well today, but he retired, and I was just lucky, guess."],
 ['Do you know why he has retired?', 'No, no.'],
 ['Could you tell us a little bit yourself?',
  'Um, because there is more players, top players.'],
 ['But you could choose, also, another one. Why did you choose Bollettieri?',
  'Oh, because the one, the guy helps me, his name is Mr.'],
 ['How old have you been when you went?',
  'When I was 13, so almost five years now.'],
 ['Okay. And how is the contact with Mr. Bollettieri at the moment, for example?',
  'What do you mean, contract?.'],
 ['Contact with him. I mean, do you speak with him after such a big victory, for example?',
  'Yeah.'],
 ['What does he say?', 'Congrats.'],
 ['Did you get any sort of sense of why he retired?',
  "I don't know why he retired, still."],
 ['Can you just talk ab

In [None]:
# Prompt: text before its first '.'. Reply: its first sentence, keeping the
# '.' only when the reply really contains one; appending it unconditionally
# produced artifacts such as 'What do you mean, contract?.'.
tennis_pairs_mod2 = [
    [diag[0].split('.')[0], ''.join(diag[1].partition('.')[:2])]
    for diag in tennis_pairs
]
tennis_pairs_mod2

[['I think this is your biggest success right now, first Grand Slam, third round',
  'Yeah.'],
 ['How would you describe it? Is it fantastic for you?',
  "Yeah, I'm pretty happy, but it was  I wasn't playing well today, but he retired, and I was just lucky, guess."],
 ['Do you know why he has retired?', 'No, no.'],
 ['Could you tell us a little bit yourself? You practiced at the Bollettieri Academy',
  'Um, because there is more players, top players.'],
 ['But you could choose, also, another one',
  'Oh, because the one, the guy helps me, his name is Mr.'],
 ['How old have you been when you went',
  'When I was 13, so almost five years now.'],
 ['Okay', 'What do you mean, contract?.'],
 ['Contact with him', 'Yeah.'],
 ['What does he say?', 'Congrats.'],
 ['Did you get any sort of sense of why he retired? Did he seem to have an injury, or do you know what was in his mind?',
  "I don't know why he retired, still."],
 ['Can you just talk about your match today, how you played',
  'I tried

In [None]:
import pickle

with open('tennis_pairs_mod2.txt', 'wb') as fp:
    pickle.dump(tennis_pairs_mod2, fp)