# Fetch Data - Relations Prediction
In this notebook, data is fetched from Reddit in order to construct an unlabelled dataset.

### Fetch & Clean

In [None]:
def clean_text(text):
    """Convert curly quotation marks in ``text`` to their ASCII equivalents.

    Left/right single quotes (U+2018, U+2019) become ``'`` and left/right
    double quotes (U+201C, U+201D) become ``"``. Every other character is
    returned unchanged.
    """
    # (pattern, replacement) pairs, applied in the order listed
    substitutions = (
        (r"(\u2018|\u2019)", "'"),
        (r"(\u201c|\u201d)", '"'),
    )
    for pattern, ascii_quote in substitutions:
        text = re.sub(pattern, ascii_quote, text)

    return text

def clean_sentences(sentences):
    """Normalise, split, strip and de-duplicate a collection of sentences.

    Each input string has its curly quotes converted to ASCII by
    ``clean_text``, is split on line boundaries, and has surrounding
    whitespace removed. Strings that end up empty are dropped.

    Returns a list of the unique sentences in order of first appearance.
    """
    cleaned = []
    for sentence in sentences:
        # Convert common utf-8 punctuation to ascii, then treat every line
        # of a multi-line string as a sentence of its own
        for line in clean_text(sentence).splitlines():
            line = line.strip()
            if line:
                cleaned.append(line)

    # dict keys keep insertion order, so duplicates are removed without the
    # arbitrary, run-dependent reordering that list(set(...)) introduces
    return list(dict.fromkeys(cleaned))

In [None]:
# Retrieve data from r/changemyview using Reddit's API through the PRAW library.
# Randomly pair each sentence of a comment with a sentence taken from that comment's replies.
# The resulting pairs are saved to a file for labelling in a later cell.

import json
import re
import praw
import nltk
import random

# Fetch the 'punkt' resource that nltk's sentence tokenizer relies on
nltk.download('punkt')

# Reddit client configured from the 'arg-mining' site in praw.ini
reddit = praw.Reddit('arg-mining')

pairs = []

ap = ArgumentPredictor()

# Walk the hot submissions and build comment/reply sentence pairs
for submission in reddit.subreddit('changemyview').hot(limit=200):

    # Expand "load more comments" placeholders so the comment tree is as
    # complete as the limit allows
    submission.comments.replace_more(limit=50)

    # Visit every comment of the flattened comment tree
    for comment in submission.comments.list():

        # Sentences of this comment that the predictor accepts as arguments
        comment_sentences = clean_sentences(nltk.sent_tokenize(comment.body))
        comment_sentences = [
            candidate for candidate in comment_sentences if ap.is_arg(candidate)
        ]

        # Same treatment for the sentences of everything under this comment
        reply_sentences = []
        for reply in comment.replies.list():
            reply_sentences.extend(nltk.sent_tokenize(reply.body))
        reply_sentences = [
            candidate
            for candidate in clean_sentences(reply_sentences)
            if ap.is_arg(candidate)
        ]

        # Nothing to pair with when the replies hold no argument sentence
        if not reply_sentences:
            continue

        # Pair each comment sentence with one randomly chosen reply sentence
        for comment_sentence in comment_sentences:
            reply_sentence = random.choice(reply_sentences)
            pairs.append(comment_sentence + '£££££££' + reply_sentence)

pairs


In [None]:
# Number of sentence pairs collected (shown as the cell output in a notebook)
len(pairs)

In [None]:
# Shuffle the pairs by taking a sample of the entire list.
# random.sample returns a new list, so `pairs` itself keeps its original order.
pairs_shuffled = random.sample(pairs, len(pairs))

### Store

In [None]:
# Write one pair per line. The encoding is stated explicitly because the pairs
# contain non-ASCII text (the '£' separator plus whatever the Reddit comments
# contained); relying on the platform default codec can raise
# UnicodeEncodeError or yield a file that is later read with the wrong codec.
with open('./unlabelled_data/unlabelled_relations_sentences.txt', 'w', encoding='utf-8') as write_file:
    for pair in pairs_shuffled:
        write_file.write(pair + '\n')