In [1]:
import numpy as np
import json

In [2]:
from pymongo import MongoClient

In [3]:
# Module-level handles: a MongoClient built with default connection settings,
# the `cmv` database, and the three collections the notebook works with.
client = MongoClient()
db = client['cmv']
posts_collection = db['posts']
tl_comments_collection = db['tl_comments']
deltad_replies_collection = db['deltad_replies']

In [None]:
class CMV:
    """Load labelled documents from the `cmv` MongoDB database and split them.

    Each document is a dict ``{'id': ..., 'text': ..., 'label': ...}`` where
    label 1 means "deltad" and label 0 means "undeltad".  Documents are either
    posts or top-level comments, depending on ``doc_type``.

    Attributes set by ``__init__``:
        deltad_docs / undeltad_docs: the two labelled document lists.
        doc_type, topic_model, classification_model: the constructor
            arguments, stored as given.
        rand_state: seed used by ``test_split`` (``None`` = unseeded).

    Attributes set by ``test_split``:
        train_docs / val_docs / test_docs: lists of document dicts.
        {train,val,test}_{ids,texts,labels}: parallel tuples extracted from
            the corresponding ``*_docs`` list.
    """

    _DOC_TYPES = ('post', 'comment')
    _TOPIC_MODELS = ('lda', 'lsa', 'pytextrank')

    def __init__(self, doc_type='comment', topic_model='pytextrank', classification_model='xgboost', random_state=None):
        """Connect to MongoDB and load the deltad / undeltad documents.

        Args:
            doc_type: ``'post'`` or ``'comment'``; selects which collection
                the documents are read from.
            topic_model: default model name used by ``topic_extraction``.
            classification_model: stored on the instance; not used by any
                method visible in this class yet.
            random_state: seed for the shuffle in ``test_split``.  ``None``
                (the default) gives a different shuffle on every call.

        Raises:
            ValueError: if ``doc_type`` is not ``'post'`` or ``'comment'``.
        """
        # Fail before touching the database: an unknown doc_type would
        # otherwise silently yield an empty dataset.
        if doc_type not in self._DOC_TYPES:
            raise ValueError(
                f"doc_type must be one of {self._DOC_TYPES}, got {doc_type!r}"
            )

        client = MongoClient()
        db = client.cmv
        self.posts_collection = db.posts
        self.tl_comments_collection = db.tl_comments
        self.deltad_replies_collection = db.deltad_replies

        self.doc_type = doc_type
        self.topic_model = topic_model
        self.classification_model = classification_model
        # Stored unconditionally so that a seed of 0 is honoured and the
        # attribute always exists.
        self.rand_state = random_state

        self.deltad_docs = []
        self.undeltad_docs = []
        self.test_docs = []
        self.train_docs = []
        self.val_docs = []

        self.train_ids = []
        self.train_texts = []
        self.train_labels = []
        self.test_ids = []
        self.test_texts = []
        self.test_labels = []
        self.val_ids = []
        self.val_texts = []
        self.val_labels = []

        self.reduced_train_texts = []
        self.reduced_test_texts = []
        self.reduced_val_texts = []

        if doc_type == 'post':
            self._load_post_docs()
        else:
            self._load_comment_docs()

    def _as_docs(self, records, label):
        """Convert raw MongoDB records into ``{'id', 'text', 'label'}`` dicts."""
        id_key = f'{self.doc_type}_id'
        text_key = f'{self.doc_type}_text'
        return [
            {'id': record[id_key], 'text': record[text_key], 'label': label}
            for record in records
        ]

    def _load_post_docs(self):
        """Load posts; a post is "deltad" if it has `tl_comment_delta_parents`."""
        deltad_posts = self.posts_collection.find(
            {'tl_comment_delta_parents': {"$exists": True}})
        undeltad_posts = self.posts_collection.find(
            {'tl_comment_delta_parents': {"$exists": False}})

        self.deltad_docs = self._as_docs(deltad_posts, 1)
        self.undeltad_docs = self._as_docs(undeltad_posts, 0)

    def _find_undeleted_comments(self, comment_ids):
        """Query top-level comments by id, skipping those whose text is '[deleted]'."""
        # Original author's note: the '[deleted]' filter could be dropped if
        # the data is re-imported.
        return self.tl_comments_collection.find(
            {'$and': [{'comment_id': {"$in": list(comment_ids)}},
                      {'comment_text': {"$ne": '[deleted]'}}]})

    def _load_comment_docs(self):
        """Load top-level comments from posts where some delta was awarded.

        Comments listed in a post's `tl_comment_delta_parents` are labelled 1;
        every other id in that post's `comment_ids` is labelled 0.
        """
        deltad_posts = self.posts_collection.find(
            {'tl_comment_delta_parents': {"$exists": True}})

        # Accumulate into sets so that zero matching posts is handled
        # (the previous zip(*...) unpacking raised on an empty result).
        deltad_ids = set()
        all_ids = set()
        for post in deltad_posts:
            deltad_ids.update(post['tl_comment_delta_parents'])
            all_ids.update(post['comment_ids'])
        undeltad_ids = all_ids - deltad_ids

        self.deltad_docs = self._as_docs(
            self._find_undeleted_comments(deltad_ids), 1)
        self.undeltad_docs = self._as_docs(
            self._find_undeleted_comments(undeltad_ids), 0)

    @staticmethod
    def _unzip_docs(docs):
        """Return ``(ids, texts, labels)`` tuples for *docs*; empty tuples if none."""
        if not docs:
            return (), (), ()
        ids, texts, labels = zip(
            *((doc['id'], doc['text'], doc['label']) for doc in docs))
        return ids, texts, labels

    def test_split(self, test_ratio=0.2, val_set=True, val_ratio=0.2):
        """Split the documents into train / (validation) / test sets.

        The split is stratified: ``deltad_docs`` and ``undeltad_docs`` are
        shuffled and cut separately with the same ratios, then concatenated
        (deltad first).  Shuffling is done on copies with a private RNG seeded
        from ``self.rand_state``, so the stored document lists and NumPy's
        global random state are left untouched and repeated calls with the
        same seed return the same split.

        Args:
            test_ratio: fraction of each class assigned to the test set.
            val_set: whether to carve out a validation set as well.
            val_ratio: fraction of each class assigned to the validation set;
                ignored when ``val_set`` is false.

        Returns:
            ``(train_docs, val_docs, test_docs)`` when ``val_set`` is true,
            otherwise ``(train_docs, test_docs)``.

        Raises:
            ValueError: if a ratio is outside ``[0, 1]`` or the ratios sum to
                more than 1.
        """
        held_out_val = val_ratio if val_set else 0.0
        total_ratio = test_ratio + held_out_val
        if not (0 <= test_ratio <= 1 and 0 <= held_out_val <= 1 and total_ratio <= 1):
            raise ValueError(
                "test_ratio and val_ratio must each be in [0, 1] and sum to at most 1"
            )

        # RandomState(seed) yields the same stream as np.random.seed(seed)
        # followed by the module-level functions, without the global side effect.
        rng = np.random.RandomState(self.rand_state)
        deltad = list(self.deltad_docs)
        undeltad = list(self.undeltad_docs)
        rng.shuffle(deltad)
        rng.shuffle(undeltad)

        val_split_d = int(held_out_val * len(deltad))
        val_split_u = int(held_out_val * len(undeltad))
        test_split_d = int(total_ratio * len(deltad))
        test_split_u = int(total_ratio * len(undeltad))

        # Layout of each shuffled class list: [ validation | test | train ].
        self.val_docs = deltad[:val_split_d] + undeltad[:val_split_u]
        self.test_docs = (deltad[val_split_d:test_split_d]
                          + undeltad[val_split_u:test_split_u])
        self.train_docs = deltad[test_split_d:] + undeltad[test_split_u:]

        (self.val_ids, self.val_texts, self.val_labels) = self._unzip_docs(self.val_docs)
        (self.train_ids, self.train_texts, self.train_labels) = self._unzip_docs(self.train_docs)
        (self.test_ids, self.test_texts, self.test_labels) = self._unzip_docs(self.test_docs)

        if val_set:
            return self.train_docs, self.val_docs, self.test_docs
        return self.train_docs, self.test_docs

    def clean_text(self, something):
        """Placeholder for text cleaning; not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("clean_text is not implemented yet")

    def topic_extraction(self, topic_model=None):
        """Placeholder for topic extraction; validates the model name only.

        Args:
            topic_model: ``'lda'``, ``'lsa'`` or ``'pytextrank'``.  When
                omitted (or falsy), ``self.topic_model`` is used.

        Raises:
            ValueError: if the selected model name is not recognised.
            NotImplementedError: for every recognised model, until the
                extraction itself is written.
        """
        use_model = topic_model if topic_model else self.topic_model

        if use_model not in self._TOPIC_MODELS:
            raise ValueError(
                f"topic_model must be one of {self._TOPIC_MODELS}, got {use_model!r}"
            )

        # TODO: implement each model and populate self.reduced_train_texts,
        # self.reduced_test_texts and self.reduced_val_texts.
        raise NotImplementedError(
            f"topic extraction with {use_model!r} is not implemented yet"
        )