In [209]:
import numpy as np
import json

In [210]:
from pymongo import MongoClient

In [211]:
client = MongoClient()
db = client.cmv
posts_collection = db.posts
tl_comments_collection = db.tl_comments
deltad_replies_collection = db.deltad_replies

In [212]:
rand_state = 42

In [213]:
# establish the set of texts to use (ie posts vs comments)
doctype = 'comment'

In [214]:
# getting posts with (or without) deltas awarded
deltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": True}})
if doctype == 'post':
    undeltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": False}})

In [215]:
if doctype == 'comment':
    # get ids of Top Level Comments that resulted in deltas AND list of all TL Comment IDs for posts where some delta was awarded by OP
    post_comment_ids = [(post['tl_comment_delta_parents'], post['comment_ids']) for post in deltad_post_gen]
    (deltad_tl_comment_ids, all_tl_comment_ids) = zip(*post_comment_ids)

    # flatten lists of lists
    deltad_tl_comment_ids = [item for sublist in deltad_tl_comment_ids for item in sublist]
    all_tl_comment_ids = [item for sublist in all_tl_comment_ids for item in sublist]

    # get ids of TL Comments that did not result in deltas from posts where OP did award deltas
    undeltad_tl_comment_ids = list(set(all_tl_comment_ids) - set(deltad_tl_comment_ids))


In [221]:
if doctype == 'comment':
    # retrieve TL comments resulting in deltas by id
    # if I reimport: deltad_tl_comment_gen = tl_comments_collection.find( {'comment_id': {"$in": deltad_tl_comment_ids}})
    deltad_tl_comment_gen = tl_comments_collection.find( {'$and': [{'comment_id': {"$in": deltad_tl_comment_ids}},{'key_phrases': {"$exists": True}}]})
    # retrieve TL comments NOT resulting in deltas
    undeltad_tl_comment_gen = tl_comments_collection.find( {'$and': [{'comment_id': {"$in": undeltad_tl_comment_ids}},{'key_phrases': {"$exists": True}}]})

In [222]:
# establish the set of texts to use (ie posts vs comments)
if doctype == 'comment':
    deltad_docs = [{'id': comment[f'{doctype}_id'], 'text': comment[f'{doctype}_text'], 'label': 1} for comment in deltad_tl_comment_gen]
    undeltad_docs = [{'id': comment[f'{doctype}_id'], 'text': comment[f'{doctype}_text'], 'label': 0} for comment in undeltad_tl_comment_gen]
    
elif doctype == 'post':
    deltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": True}})
    undeltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": False}})
    
    deltad_docs = [{'id': post[f'{doctype}_id'], 'text': post[f'{doctype}_text'], 'label': 1} for post in deltad_post_gen]
    undeltad_docs = [{'id': post[f'{doctype}_id'], 'text': post[f'{doctype}_text'], 'label': 0} for post in undeltad_post_gen]

In [225]:
len(deltad_docs)

3850

In [223]:
len(undeltad_docs)

29578

In [226]:
#train test split
test_split_d = int(0.2*len(deltad_docs))
test_split_u = int(0.2*len(undeltad_docs))

np.random.seed(seed=rand_state)
np.random.shuffle(deltad_docs)
np.random.shuffle(undeltad_docs)

test_docs = deltad_docs[0:test_split_d]
test_docs.extend(undeltad_docs[0:test_split_u])

train_docs= deltad_docs[test_split_d::]
train_docs.extend(undeltad_docs[test_split_u::])

In [227]:
train_tuples = [(doc['id'],doc['text'],doc['label']) for doc in train_docs]
test_tuples = [(doc['id'],doc['text'],doc['label']) for doc in test_docs]

#train_docs = []
#test_docs = []

(train_ids, train_texts, train_labels) = zip(*train_tuples)
(test_ids, test_texts, test_labels) = zip(*test_tuples)

train_tuples = []
test_tuples = []


In [228]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import string

In [229]:
def clean_text(texts, tokenizer, stemmer):
    stemmer_inst = stemmer()
    tokenizer_inst = tokenizer()
    cleaned_texts = []
    for text in texts:
        #cleaned_words = []
        
        #strip punctuation and digits from whole post
        to_replace = [punc for punc in string.punctuation+string.digits if punc!="'"]
        translate_dict = {key: ' ' for key in to_replace}
        translate_dict["'"] = ''
        replacement_table = str.maketrans(translate_dict)
        stripped_text = text.translate(replacement_table)
        
        #lower case post
        lowered_text = stripped_text.lower()
        cleaned_texts.append(lowered_text)
        #tokenized_post = tokenizer_inst.tokenize(stripped_post) 
        #for word in tokenized_post:
            #low_word = stemmer_inst.stem(word.lower())
            #cleaned_words.append(low_word)
            #remove stopwords?
            #if low_word not in stopwords:
            #    cleaned_words.append(low_word)
        # cleaned_posts.append(' '.join(cleaned_words))
    return cleaned_texts

In [230]:
stemmer = PorterStemmer
tokenizer = WhitespaceTokenizer

In [231]:
cleaned_train_texts = clean_text(train_texts, tokenizer, stemmer)
cleaned_test_texts = clean_text(test_texts, tokenizer, stemmer)

In [207]:
len(cleaned_train_texts)

4280

In [208]:
len(cleaned_test_texts)

1069

In [59]:
print(train_texts[0])
print(cleaned_train_texts[0])

I am an organ donor.  I want my organs to go to people who need my organs. I do not want my donation of my organs to get involved in notion of social justice, of "deserving" and so on.  The organ I donate goes to someone who is sick, because being sick is a really shitty thing.  It's not _less shitty_ when it happens to an asshole, or even to someone who doesn't want to donate their own organs.  My gift is not contingent.

I agree with you that it should not be taken for granted, I believe that it should be wildly promoted and that we should change our systems of becoming an organ donor.  As you note, this is about saving lives.  I believe it to be bad to create a system that implicitly values one life greater than the other, and that is what your proposal does.  It says that you deserve to live because you are an organ donor and you don't because you aren't.

And...Get your shit together Canada!
i am an organ donor   i want my organs to go to people who need my organs  i do not want m

In [19]:
my_vectorizer = CountVectorizer(max_df=0.85, min_df=25,
                                max_features=1000,
                                stop_words='english')

In [54]:
vectorized_train_texts = my_vectorizer.fit_transform(cleaned_train_texts)
vectorized_test_texts = my_vectorizer.transform(cleaned_test_texts)

In [26]:
df_cv = pd.DataFrame(vectorized_train_texts.toarray(), columns=my_vectorizer.get_feature_names())

## Unsupervised Topic Extraction

In [27]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [28]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### LSA

In [241]:
lsa_model = TruncatedSVD(n_components=100, random_state=rand_state)
lsa_model.fit(vectorized_train_texts)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=42, tol=0.0)

In [242]:
lsa_train_texts = lsa_model.transform(vectorized_train_texts)
lsa_test_texts = lsa_model.transform(vectorized_test_texts)

In [243]:
display_topics(lsa_model,my_vectorizer.get_feature_names(),20) # We have to look at the topics before hand and then add the labels afterwards


Topic  0
peoples, key, driving, life, thing, majority, ways, wanted, theyve, thousands, reality, imagine, saying, knowledge, gone, doing, th, youve, going, personal

Topic  1
peoples, books, win, general, trade, quite, gender, greater, questions, complex, ignore, policies, sound, race, hate, data, identity, popular, currently, safety

Topic  2
got, money, truly, voted, wages, mind, talking, paying, majority, creating, stop, increasing, taught, workers, effect, status, pass, commit, save, powerful

Topic  3
truly, voted, driving, commit, life, hes, hillary, pass, thing, president, car, politicians, save, entire, vs, republican, imagine, republicans, deserve, drive

Topic  4
life, games, key, gay, play, peoples, thousands, assuming, playing, plenty, congress, gone, view, players, taxes, new, future, money, theyll, workers

Topic  5
general, wont, key, meet, trade, wages, mind, money, life, got, talking, womens, share, ignore, room, making, paying, workers, field, increasing

Topic  6
bo

#### LDA

In [59]:
n_topics = 150
n_iter = 13
lda_model = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=n_iter,
                                random_state=rand_state,
                               learning_method='online')
lda_train_texts = lda_model.fit_transform(vectorized_train_texts)
lda_test_texts = lda_model.transform(vectorized_test_texts)

In [60]:
display_topics(lda_model,my_vectorizer.get_feature_names(),20) # We have to look at the topics before hand and then add the labels afterwards


Topic  0
real, lives, problems, relationship, long, healthy, global, relationships, term, experiences, thousands, lack, think, way, going, levels, totally, exists, lots, working

Topic  1
general, actions, argue, experience, certain, given, claim, example, argument, does, way, second, particular, fact, choose, think, define, role, present, generally

Topic  2
police, individuals, race, single, racist, minority, equally, random, related, exists, certainly, did, capable, personally, just, saying, say, like, comes, people

Topic  3
thing, debt, national, essentially, university, honestly, lot, takes, choices, policy, systems, mind, legal, high, havent, question, goes, elected, vote, solution

Topic  4
like, great, just, im, kind, lot, guy, friends, think, really, pretty, didnt, got, stuff, sure, know, thats, follow, shit, little

Topic  5
state, issue, illegal, eu, impossible, federal, deal, consent, essentially, totally, laws, isnt, allowing, want, happen, completely, used, nation, situ

#### PyTextRank for graph-based feature extraction

In [160]:
train_docs[0]

{'id': '6nvmki',
 'text': "I have a solipsistic worldview, which means that I don't believe that it is possible to know anything outside of my own mind. For all I know, the reality that I perceive could be an illusion, and there is no reason to trust any of my senses or memories. It's also possible that my senses are giving me a perfectly accurate representation of the reality around me. I don't really see how I could know fore sure one way or the other. Other than the thoughts in my mind, there's no way to truly be sure about anything. \n_____\n\n> *This is a footnote from the CMV moderators. We'd like to remind you of a couple of things. Firstly, please* ***[read through our rules](http://www.reddit.com/r/changemyview/wiki/rules)***. *If you see a comment that has broken one, it is more effective to report it than downvote it. Speaking of which,* ***[downvotes don't change views](http://www.reddit.com/r/changemyview/wiki/guidelines#wiki_upvoting.2Fdownvoting)****! Any questions or co

In [161]:
replacement_table = str.maketrans({'\n': ' ', "'": '', '-': '', '/': ''})
train_dicts = [{'id': doc['id'], 'text': doc['text'].translate(replacement_table)} for doc in train_docs]
test_dicts = [{'id': doc['id'], 'text': doc['text'].translate(replacement_table)} for doc in test_docs]

In [162]:
len(train_dicts)

12123

In [163]:
train_docs[20]

{'id': '77d02i',
 'text': "First of all, most definitions of prejudice define it as a preconceived notion of something (IE an assumption). Making assumptions was an evolutionary must, as an animal's (And a human's) brain is unable to process and understand every piece of information the senses give it at the same time as making rational decisions about it. Thus, the brain 'fills in the blanks' by rationally guessing facts and evidence. This way, the brain is able to concentrate on more important matters. \n\nFor example, primitive humans had to make split second decisions on threats in order to survive. A pair of eyes hiding in the bushes could signify a friend, but the brain assumes that it is an enemy, in order to protect itself. As humans spread, animal threats became less of an issue, but other human tribes became a bigger threat. Thus, now, the human has to make assumptions about other humans based on experience or social experience (The spreading of news and facts throughout a tr

In [164]:
train_dicts[9529]

{'id': '4sqfyh',
 'text': 'If I believe that abortion, even in the first trimester is the same as killing another human being without their consent, it is still perfectly fine for me to be pro abortion.  I do not want this first part "If I believe that abortion, even in the first trimester is the same as killing another human being without their consent" to be the point of this CMV, which is why I phrased it in this way.  As for whether I should be for or against abortion, I do know that in some circumstances, such as the death penalty or ending the life of someone on life support (edit: where the individual is unconscious and there is no stipulation in his will about such a situation), it is considered lawful (in most countries for life support and some for death penalty) to end someones life without their consent. Since that president is set, it is possible that abortions can be justified if the benefit of an abortion is large enough (Also because a fetus can technically be considere

In [165]:
len(train_dicts)

12123

In [171]:
import pytextrank
import json
import sys

path_stage0 = 'stage0.json'
path_stage1 = 'stage1.json'
path_stage2 = 'stage2.json'
path_stage3 = 'stage3.json'

#0-500

failed_ids=[]
for i, doc_dict in enumerate(train_dicts):
    if i % 50 == 0:
        print(i)
    doc_dict['text'] = doc_dict['text'].split('\n_____\n\n')[0]
    
    try:
        with open(path_stage0, 'w') as f:
            json.dump(doc_dict, f)
        # Stage 1    
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
                # print(pytextrank.pretty_print(graf))
        # Stage 2
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
                # to view output in this notebook
                # print(pytextrank.pretty_print(rl))
        # Stage 3
        kernel = pytextrank.rank_kernel(path_stage2)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")
                # to view output in this notebook
                # print(pytextrank.pretty_print(s._asdict()))
        # Stage 4
        phrase_list = list(set([p for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=15)]))
        phrases = ", ".join(phrase_list)

        sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=150), key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)
        if doctype == 'comment':
            tl_comments_collection.update_one({f'{doctype}_id': {'$eq': doc_dict['id']}},{'$set': {'key_phrases': phrase_list}})
        else:
            print('entered else')
            print('doc_dicid is: ', doc_dict['id'])
            print('key_phrases are: ', phrase_list)
            posts_collection.update_one({f'{doctype}_id': {'$eq': doc_dict['id']}},{'$set': {'key_phrases': phrase_list}})
    except:
        failed_ids.append(doc_dict['id'])
        print('failed on ',doc_dict['id'])
        continue
    #print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases), '\n\n')

0
entered else
doc_dicid is:  6nvmki
key_phrases are:  ['change', 'anything', 'own mind', 'cmv', 'cmv moderators', 'moderators', 'solipsistic worldview', 'way', 'downvotes', 'thoughts', 'mind', 'www.reddit.comrchangemyviewwikiguidelines#wiki_upvoting.2fdownvoting']
entered else
doc_dicid is:  4bu9ip
key_phrases are:  ['product quality', 'strategies', 'certain products funds marketing strategies', 'additional cost', 'quality', 'product improvements', 'product', 'marketing', 'consumer', 'companies', 'advertising']
entered else
doc_dicid is:  7131t6
key_phrases are:  ['change', 'experience', 'most casses', 'new approaches', 'new things', 'cmv', 'cmv moderators', 'work', 'personal experiences', 'subject', 'evidence', 'downvotes', 'little knowledge', 'thing', 'little experience', 'www.reddit.comrchangemyviewwikiguidelines#wiki_upvoting.2fdownvoting']
entered else
doc_dicid is:  4qrfkn
key_phrases are:  ['free education', 'people', 'minimum d grade', 'education', 'current situation', 'politi

In [252]:
key_phrase_gen = tl_comments_collection.find({'key_phrases': {"$exists": True}})

In [253]:
key_phrases_list = [comment['key_phrases'] for comment in key_phrase_gen]

In [254]:
flat_key_phrases = [item for sublist in key_phrases_list for item in sublist]
key_phrases_list = list(set(flat_key_phrases))
print(len(key_phrases_list))

100999


In [255]:
flat_key_phrases = []

In [256]:
phrase_array = np.array(key_phrases_list)
key_phrases_list = []

In [257]:
from sklearn.feature_extraction.text import CountVectorizer

In [258]:
binary_vectorizer = CountVectorizer(max_df=0.9, min_df=5,
                                    max_features=5000,
                                    vocabulary = phrase_array,
                                    binary=False)

In [259]:
textranked_train_texts = binary_vectorizer.fit_transform(cleaned_train_texts)
textranked_test_texts = binary_vectorizer.transform(cleaned_test_texts)

#### Choosing the type of topic modeling:

In [260]:
reduced_train_texts = textranked_train_texts
reduced_test_texts = textranked_test_texts

## Supervised Classification

In [74]:
# undersampling undeltad observations:

#### SVM Classifier

In [53]:
from sklearn.metrics import confusion_matrix, roc_auc_score

In [54]:
from sklearn.svm import SVC

In [55]:
svc_model = SVC(class_weight='balanced', random_state=rand_state)
svc_model.fit(reduced_train_texts, train_labels)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
predicted_labels = svc_model.predict(reduced_test_texts)
conf_mat = confusion_matrix(test_labels, predicted_labels)
print(conf_mat)

[[2630   14]
 [ 362    8]]


In [58]:
print('mean accuracy is {}'.format(svc_model.score(reduced_test_texts, test_labels)))

mean accuracy is 0.8752488387524884


#### XGBoost Random Forest Classifier

In [261]:
from xgboost import XGBClassifier

In [262]:
xg_model = XGBClassifier(scale_pos_weight=7)
xg_model.fit(reduced_train_texts, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=7, seed=None,
       silent=True, subsample=1)

In [263]:
print('train set mean accuracy is {}'.format(xg_model.score(reduced_train_texts, train_labels)))


train set mean accuracy is 0.7154769472385297


  if diff:


In [264]:
print('mean accuracy is {}'.format(xg_model.score(reduced_test_texts, test_labels)))
predicted_labels = xg_model.predict(reduced_test_texts)
conf_mat = confusion_matrix(test_labels, predicted_labels)
print(conf_mat)

mean accuracy is 0.6896035901271503
[[4212 1703]
 [ 372  398]]


  if diff:
  if diff:


In [76]:
#from sklearn.cluster import DBSCAN

In [77]:
#dbscan_model = DBSCAN(eps=0.5, min_samples=5, metric=’euclidean’, metric_params=None, algorithm=’auto’, leaf_size=30, p=None, n_jobs=1)

In [84]:
from sklearn.linear_model import LogisticRegression

In [86]:
logreg = LogisticRegression(random_state=rand_state, class_weight='balanced')
logreg.fit(reduced_train_texts, train_labels)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [90]:
predicted_test_labels = logreg.predict(reduced_test_texts)

In [91]:
print('mean accuracy is {}'.format(logreg.score(reduced_test_texts, test_labels)))
conf_mat = confusion_matrix(test_labels, predicted_test_labels)
print(conf_mat)

mean accuracy is 0.6766313082610094
[[1925  795]
 [ 211  180]]
