In [1]:
import numpy as np
import json

In [2]:
from pymongo import MongoClient

In [3]:
client = MongoClient()
db = client.cmv
posts_collection = db.posts
tl_comments_collection = db.tl_comments
deltad_replies_collection = db.deltad_replies

In [4]:
# getting posts with and without deltas awarded
deltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": True}})
undeltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": False}})
deltad_post_list = list(deltad_post_gen)
undeltad_post_list = list(undeltad_post_gen)
print(len(deltad_post_list))
print(len(undeltad_post_list))

1146
5370


In [5]:
# get ids of Top Level Comments that resulted in deltas AND list of all TL Comment IDs for posts where some delta was awarded by OP
deltad_tl_comment_ids = [post['tl_comment_delta_parents'] for post in deltad_post_list]
all_tl_comment_ids = [post['comment_ids'] for post in deltad_post_list]

# flatten lists of lists
deltad_tl_comment_ids = [item for sublist in deltad_tl_comment_ids for item in sublist]
all_tl_comment_ids = [item for sublist in all_tl_comment_ids for item in sublist]

# get ids of TL Comments that did not result in deltas from posts where OP did award deltas
undeltad_tl_comment_ids = list(set(all_tl_comment_ids) - set(deltad_tl_comment_ids))


In [6]:
# retrieve TL comments resulting in deltas by id
deltad_tl_comment_gen = tl_comments_collection.find( {'comment_id': {"$in": deltad_tl_comment_ids}})
# retrieve TL comments NOT resulting in deltas
undeltad_tl_comment_gen = tl_comments_collection.find( {'comment_id': {"$in": undeltad_tl_comment_ids}})

In [7]:
deltad_tl_comments = list(deltad_tl_comment_gen)
undeltad_tl_comments = list(undeltad_tl_comment_gen)
print(len(deltad_tl_comments))
print(len(undeltad_tl_comments))

1955
13601


In [8]:
# establish the set of texts to use (ie posts vs comments)
doctype = 'comment'

if doctype == 'comment':
    deltad_docs = deltad_tl_comments
    undeltad_docs = undeltad_tl_comments
    
elif doctype == 'post':
    deltad_docs = deltad_post_list
    undeltad_docs = undeltad_post_list

In [9]:
#train test split
test_split_d = int(0.2*len(deltad_docs))
test_split_u = int(0.2*len(undeltad_docs))

np.random.shuffle(deltad_docs)
np.random.shuffle(undeltad_docs)

test_docs = deltad_docs[0:test_split_d]
test_docs.extend(undeltad_docs[0:test_split_u])

train_docs= deltad_docs[test_split_d::]
train_docs.extend(undeltad_docs[test_split_u::])

In [10]:
type(test_split_d-1)

int

In [11]:
test_doc_labels = [1]*(test_split_d)
test_doc_labels.extend(([0]*(test_split_u)))
test_doc_labels = np.array(test_doc_labels)

In [12]:
num_train_d = len(deltad_docs) - test_split_d
num_train_u = len(undeltad_docs) - test_split_u

train_doc_labels = [1]*num_train_d
train_doc_labels.extend([0]*(num_train_u))

train_doc_labels = np.array(train_doc_labels)

In [13]:
num_train_u/num_train_d

6.957161125319693

In [14]:
train_doc_labels.shape

(12445,)

In [15]:
len(train_docs)

12445

In [16]:
len(test_docs)

3111

In [17]:
train_texts = [doc[f'{doctype}_text'] for doc in train_docs]
test_texts = [doc[f'{doctype}_text'] for doc in test_docs]

In [28]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import string

In [29]:
alist = ['a', 'b']
blist = ['c', 'd']
alist.extend(blist)
print(alist)

['a', 'b', 'c', 'd']


In [30]:
def clean_text(posts, tokenizer, stemmer):
    stemmer_inst = stemmer()
    tokenizer_inst = tokenizer()
    cleaned_posts = []
    for post in posts:
        #cleaned_words = []
        
        #strip punctuation and digits from whole post
        to_replace = [punc for punc in string.punctuation+string.digits]
        replacement_table = str.maketrans({key: None for key in to_replace})
        stripped_post = post.translate(replacement_table)
        
        #lower case post
        lowered_post = stripped_post.lower()
        cleaned_posts.append(lowered_post)
        #tokenized_post = tokenizer_inst.tokenize(stripped_post) 
        #for word in tokenized_post:
            #low_word = stemmer_inst.stem(word.lower())
            #cleaned_words.append(low_word)
            #remove stopwords?
            #if low_word not in stopwords:
            #    cleaned_words.append(low_word)
        # cleaned_posts.append(' '.join(cleaned_words))
    return cleaned_posts

In [31]:
stemmer = PorterStemmer
tokenizer = WhitespaceTokenizer

In [32]:
cleaned_train_texts = clean_text(train_texts, tokenizer, stemmer)
cleaned_test_texts = clean_text(test_texts, tokenizer, stemmer)

In [37]:
print(cleaned_train_texts[1])

no matter how you look at it or what incentives or rules youre willing to put in place the reality is that its simply not a level playing field if a team is allowed to recruit the best players it can get with no gender quotas then those players  of the time in  of sports are going to be male its just biology

in a world of coed pro sports aspiring female athletes would have to choose between  options

 forget it youre not the right gender
 go for it heres the steroids and hormones and shit youll need to pump yourself full of in order to stand even the remotest chance of overcoming your biological disadvantage
 dont worry they have quotas andor other some other handicap in place so youll really only be competing against other women anyway of course the sport will suffer and your male teammates will resent you but at least you get to play with the boys right

having an allfemale league even if its considered less important than the male even with the reduced coverage and celebrity and re

In [33]:
my_vectorizer = CountVectorizer(max_df=0.85, min_df=25,
                                max_features=1000,
                                stop_words='english')

In [34]:
vectorized_train_texts = my_vectorizer.fit_transform(cleaned_train_texts)
vectorized_test_texts = my_vectorizer.fit_transform(cleaned_test_texts)

In [40]:
import pandas as pd
df_cv = pd.DataFrame(vectorized_train_texts.toarray(), columns=my_vectorizer.get_feature_names())

## Unsupervised Topic Extraction

In [20]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [202]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### LSA

In [241]:
lsa_model = TruncatedSVD(n_components=100, random_state=42)
lsa_model.fit(vectorized_train_texts)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=42, tol=0.0)

In [242]:
lsa_train_texts = lsa_model.transform(vectorized_train_texts)
lsa_test_texts = lsa_model.transform(vectorized_test_texts)

In [243]:
display_topics(lsa_model,my_vectorizer.get_feature_names(),20) # We have to look at the topics before hand and then add the labels afterwards


Topic  0
peoples, key, driving, life, thing, majority, ways, wanted, theyve, thousands, reality, imagine, saying, knowledge, gone, doing, th, youve, going, personal

Topic  1
peoples, books, win, general, trade, quite, gender, greater, questions, complex, ignore, policies, sound, race, hate, data, identity, popular, currently, safety

Topic  2
got, money, truly, voted, wages, mind, talking, paying, majority, creating, stop, increasing, taught, workers, effect, status, pass, commit, save, powerful

Topic  3
truly, voted, driving, commit, life, hes, hillary, pass, thing, president, car, politicians, save, entire, vs, republican, imagine, republicans, deserve, drive

Topic  4
life, games, key, gay, play, peoples, thousands, assuming, playing, plenty, congress, gone, view, players, taxes, new, future, money, theyll, workers

Topic  5
general, wont, key, meet, trade, wages, mind, money, life, got, talking, womens, share, ignore, room, making, paying, workers, field, increasing

Topic  6
bo

#### LDA

In [244]:
n_topics = 150
n_iter = 13
lda_model = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=n_iter,
                                random_state=42,
                               learning_method='online')
lda_train_texts = lda_model.fit_transform(vectorized_train_texts)
lda_test_texts = lda_model.transform(vectorized_test_texts)

In [246]:
display_topics(lda_model,my_vectorizer.get_feature_names(),20) # We have to look at the topics before hand and then add the labels afterwards


Topic  0
friend, value, critical, values, mean, meaning, prevent, culture, process, theyve, issues, exact, cultural, employees, problems, subject, governments, sure, youd, advantage

Topic  1
perfect, id, isnt, zero, increasing, decided, helps, simple, vast, type, actions, treatment, emotions, does, influence, talking, called, vs, united, highly

Topic  2
bunch, specifically, rich, wants, provided, big, strong, speaking, population, technology, business, allows, states, truth, heard, student, pass, animal, starting, changing

Topic  3
certainly, emotional, foreign, net, super, possible, said, continue, realize, change, effective, chance, built, size, gone, theyre, safety, peoples, basic, experiences

Topic  4
normal, ill, extra, ability, gets, powerful, mental, problem, schools, point, build, just, clear, good, taking, lots, running, popular, liberal, opinions

Topic  5
games, lets, play, future, congress, playing, taxes, standards, spectrum, women, players, plenty, conditions, key, l

#### PyTextRank for graph-based feature extraction

In [21]:
train_docs[0]

{'_id': ObjectId('5b792ef3f0d3c50b0c1bd673'),
 'comment_id': 'd5kluv7',
 'comment_text': 'Purchasing a chance payout is like a lottery. If you are okay with lotteries being legal, despite it being a terrible value for the player, then you are okay with this game model.',
 'author': 'philotrow',
 'deltad_reply_ids': ['d5kmz7c']}

In [22]:
replacement_table = str.maketrans({'\n': ' '})
train_dicts = [{'id': doc[f'{doctype}_id'], 'text': doc[f'{doctype}_text'].translate(replacement_table)} for doc in train_docs]
test_dicts = [{'id': doc[f'{doctype}_id'], 'text': doc[f'{doctype}_text'].translate(replacement_table)} for doc in test_docs]

###### should I replace \' with '?

In [23]:
train_jsons = [json.dumps(doc_dict) for doc_dict in train_dicts]
test_jsons = [json.dumps(doc_dict) for doc_dict in test_dicts]

In [26]:
train_jsons[1]

'{"id": "d2few12", "text": "No matter how you look at it or what incentives or rules you\'re willing to put in place, the reality is that it\'s simply not a level playing field. If a team is allowed to recruit the best players it can get, with no gender quotas, then those players, 99% of the time, in 99% of sports, are going to be male. It\'s just biology.  In a world of co-ed pro sports, aspiring female athletes would have to choose between 3 options:  1. Forget it. You\'re not the right gender. 2. Go for it, here\'s the steroids and hormones and shit you\'ll need to pump yourself full of in order to stand even the remotest chance of overcoming your biological disadvantage. 3. Don\'t worry, they have quotas and/or other some other handicap in place so you\'ll really only be competing against other women anyway. Of course, the sport will suffer and your male teammates will resent you, but at least you get to play with the boys, right?  Having an all-female league, even if it\'s conside

In [48]:
import pytextrank
import sys

path_stage0 = 'stage0.json'
path_stage1 = 'stage1.json'
path_stage2 = 'stage2.json'
path_stage3 = 'stage3.json'

key_phrases = []
key_sentences = []

for doc_json in train_dicts[1:50]:
    with open(path_stage0, 'w') as f:
        json.dump(doc_json, f)
    # Stage 1    
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # print(pytextrank.pretty_print(graf))
    # Stage 2
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            # print(pytextrank.pretty_print(rl))
    # Stage 3
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            # print(pytextrank.pretty_print(s._asdict()))
    # Stage 4
    phrases = ", ".join(set([p for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=15)]))
    phrase_list = phrases.split(', ')
    key_phrases.append(phrase_list)
    key_sentences.append(graf_text)
    
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=150), key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))

**excerpts:** If a team is allowed to recruit the best players it can get, with no gender quotas, then those players, 99 % of the time, in 99 % of sports, are going to be male. It's just biology. You're not the right gender. Go for it, here's the steroids and hormones and shit you'll need to pump yourself full of in order to stand even the remotest chance of overcoming your biological disadvantage. Do n't worry, they have quotas and/or other some other handicap in place so you'll really only be competing against other women anyway. Of course, the sport will suffer and your male teammates will resent you, but at least you get to play with the boys, right?

**keywords:** reality, best players, options, place, gender quotas, right gender, gender, remotest chance, disadvantage, boys, other women, quotas, male, other handicap, male teammates, biological disadvantage
**excerpts:** Facebook is n't Reddit. Most people view their pages as" their space" ( hey," myspace" would be a great name for

In [49]:
key_phrases

[['reality',
  'best players',
  'options',
  'place',
  'gender quotas',
  'right gender',
  'gender',
  'remotest chance',
  'disadvantage',
  'boys',
  'other women',
  'quotas',
  'male',
  'other handicap',
  'male teammates',
  'biological disadvantage'],
 ['page',
  'someone',
  'button',
  'most people',
  'dontchathink',
  'house',
  'opinion',
  'great name',
  'candidate button',
  'reddit',
  'people',
  'social media network'],
 ['everyday lives',
  'own faith',
  'absolute answer',
  'super secret government craft',
  'more information',
  'most trouble',
  'faith',
  'jesus',
  'facts',
  'countless historians',
  'life',
  'good enough answer',
  'information',
  'answer',
  'people',
  'academics'],
 ['life-or-death situation', 'weapons', 'mass shooting', 'fighting'],
 [''],
 ['thousands',
  'megastructures',
  'movies',
  'world',
  'new things',
  'conclusion',
  'access',
  'europe',
  'law](https://en.wikipedia.org / wiki / godwin%27s_law',
  'long amounts',
  'chi

In [50]:
key_sentences

['Well yeah. Anything that stops you from doing what you want is against freedom. But that does n\'t mean" they should be allowed to. " It\'s against my freedom for the government to stop me from punching you in the face, but I accept the government\'s authority as part of the social contract. I want to live in the USA, and the US Supreme Court has ruled that businesses ca n\'t discriminate based on race or religion, based on the laws passed by Congress. They did n\'t include homosexuality in the [ original unanimous decision from the 60\'s](https://en.wikipedia.org / wiki / Katzenbach_v._McClung ), but they did allow the ruling against the baker who refused to bake a cake for a gay couple to stand a few months ago.',
 "If a team is allowed to recruit the best players it can get, with no gender quotas, then those players, 99 % of the time, in 99 % of sports, are going to be male. It's just biology. You're not the right gender. Go for it, here's the steroids and hormones and shit you'll

In [247]:
reduced_train_texts = lda_train_texts
reduced_test_texts = lda_test_texts

## Supervised Classification

In [None]:
# undersampling undeltad observations:

#### SVM Classifier

In [214]:
from sklearn.metrics import confusion_matrix, roc_auc_score

In [215]:
from sklearn.svm import SVC

In [248]:
svc_model = SVC(class_weight='balanced', random_state=42)
svc_model.fit(reduced_train_texts, train_doc_labels)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [249]:
predicted_doc_labels = svc_model.predict(reduced_test_texts)
conf_mat = confusion_matrix(test_doc_labels, predicted_doc_labels)
print(conf_mat)

[[ 126 2594]
 [   2  389]]


In [250]:
print('mean accuracy is {}'.format(svc_model.score(reduced_test_texts, test_doc_labels)))

mean accuracy is 0.16554162648666024


#### XGBoost Random Forest Classifier

In [251]:
from xgboost import XGBClassifier

In [252]:
xg_model = XGBClassifier(scale_pos_weight=7)
xg_model.fit(reduced_train_texts, train_doc_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=7, seed=None,
       silent=True, subsample=1)

In [253]:
print('train set mean accuracy is {}'.format(xg_model.score(reduced_train_texts, train_doc_labels)))


train set mean accuracy is 0.6717557251908397


  if diff:


In [254]:
print('mean accuracy is {}'.format(xg_model.score(reduced_test_texts, test_doc_labels)))
predicted_doc_labels = xg_model.predict(reduced_test_texts)
conf_mat = confusion_matrix(test_doc_labels, predicted_doc_labels)
print(conf_mat)

mean accuracy is 0.656702025072324
[[1864  856]
 [ 212  179]]


  if diff:
  if diff:


In [76]:
#from sklearn.cluster import DBSCAN

In [77]:
#dbscan_model = DBSCAN(eps=0.5, min_samples=5, metric=’euclidean’, metric_params=None, algorithm=’auto’, leaf_size=30, p=None, n_jobs=1)

In [33]:
#from sklearn.linear_model import LogisticRegression

In [34]:
#logreg = LogisticRegression(random_state=42, class_weight='balanced')
#logreg.fit(reduced_train_posts, train_post_labels)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [35]:
#predicted_test_labels = logreg.predict(reduced_test_posts)

In [38]:
#print('mean accuracy is {}'.format(logreg.score(reduced_test_posts, test_post_labels)))
#conf_mat = confusion_matrix(test_post_labels, predicted_test_labels)
#print(conf_mat)

mean accuracy is 0.6362240982348427
[[787 287]
 [187  42]]
