In [117]:
import numpy as np

In [118]:
from pymongo import MongoClient

In [119]:
client = MongoClient()
db = client.cmv
posts_collection = db.posts
tl_comments_collection = db.tl_comments
deltad_replies_collection = db.deltad_replies

In [120]:
# getting posts with and without deltas awarded
deltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": True}})
undeltad_post_gen = posts_collection.find( {'tl_comment_delta_parents': {"$exists": False}})
deltad_post_list = list(deltad_post_gen)
undeltad_post_list = list(undeltad_post_gen)
print(len(deltad_post_list))
print(len(undeltad_post_list))

1146
5370


In [121]:
# get ids of Top Level Comments that resulted in deltas AND list of all TL Comment IDs for posts where some delta was awarded by OP
deltad_tl_comment_ids = [post['tl_comment_delta_parents'] for post in deltad_post_list]
all_tl_comment_ids = [post['comment_ids'] for post in deltad_post_list]

# flatten lists of lists
deltad_tl_comment_ids = [item for sublist in deltad_tl_comment_ids for item in sublist]
all_tl_comment_ids = [item for sublist in all_tl_comment_ids for item in sublist]

# get ids of TL Comments that did not result in deltas from posts where OP did award deltas
undeltad_tl_comment_ids = list(set(all_tl_comment_ids) - set(deltad_tl_comment_ids))


In [122]:
# retrieve TL comments resulting in deltas by id
deltad_tl_comment_gen = tl_comments_collection.find( {'comment_id': {"$in": deltad_tl_comment_ids}})
# retrieve TL comments NOT resulting in deltas
undeltad_tl_comment_gen = tl_comments_collection.find( {'comment_id': {"$in": undeltad_tl_comment_ids}})

In [123]:
deltad_tl_comments = list(deltad_tl_comment_gen)
undeltad_tl_comments = list(undeltad_tl_comment_gen)
print(len(deltad_tl_comments))
print(len(undeltad_tl_comments))

1955
13601


In [124]:
# establish the set of texts to use (ie posts vs comments)
doctype = 'comment'

if doctype == 'comment':
    deltad_docs = deltad_tl_comments
    undeltad_docs = undeltad_tl_comments
    
elif doctype == 'post':
    deltad_docs = deltad_post_list
    undeltad_docs = undeltad_post_list

In [131]:
#train test split
test_split_d = int(0.2*len(deltad_docs))
test_split_u = int(0.2*len(undeltad_docs))

np.random.shuffle(deltad_docs)
np.random.shuffle(undeltad_docs)

test_docs = deltad_docs[0:test_split_d]
test_docs.extend(undeltad_docs[0:test_split_u])

train_docs= deltad_docs[test_split_d::]
train_docs.extend(undeltad_docs[test_split_u::])

In [132]:
type(test_split_d-1)

int

In [133]:
test_doc_labels = [1]*(test_split_d)
test_doc_labels.extend(([0]*(test_split_u)))
test_doc_labels = np.array(test_doc_labels)

In [134]:
num_train_d = len(deltad_docs) - test_split_d
num_train_u = len(undeltad_docs) - test_split_u

train_doc_labels = [1]*num_train_d
train_doc_labels.extend([0]*(num_train_u))

train_doc_labels = np.array(train_doc_labels)

In [190]:
num_train_u/num_train_d

6.957161125319693

In [135]:
train_doc_labels.shape

(12445,)

In [136]:
len(train_docs)

12445

In [137]:
len(test_docs)

3111

In [138]:
train_texts = [doc[f'{doctype}_text'] for doc in train_docs]
test_texts = [doc[f'{doctype}_text'] for doc in test_docs]

In [139]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import string

In [140]:
alist = ['a', 'b']
blist = ['c', 'd']
alist.extend(blist)
print(alist)

['a', 'b', 'c', 'd']


In [141]:
def clean_text(posts, tokenizer, stemmer):
    stemmer_inst = stemmer()
    tokenizer_inst = tokenizer()
    cleaned_posts = []
    for post in posts:
        #cleaned_words = []
        
        #strip punctuation and digits from whole post
        to_replace = [punc for punc in string.punctuation+string.digits]
        replacement_table = str.maketrans({key: None for key in to_replace})
        stripped_post = post.translate(replacement_table)
        
        #lower case post
        lowered_post = stripped_post.lower()
        cleaned_posts.append(lowered_post)
        #tokenized_post = tokenizer_inst.tokenize(stripped_post) 
        #for word in tokenized_post:
            #low_word = stemmer_inst.stem(word.lower())
            #cleaned_words.append(low_word)
            #remove stopwords?
            #if low_word not in stopwords:
            #    cleaned_words.append(low_word)
        # cleaned_posts.append(' '.join(cleaned_words))
    return cleaned_posts

In [142]:
stemmer = PorterStemmer
tokenizer = WhitespaceTokenizer

In [145]:
cleaned_train_texts = clean_text(train_texts, tokenizer, stemmer)
cleaned_test_texts = clean_text(test_texts, tokenizer, stemmer)

In [194]:
my_vectorizer = CountVectorizer(max_df=0.8, min_df=5,
                                max_features=1000,
                                stop_words='english')

In [195]:
vectorized_train_texts = my_vectorizer.fit_transform(cleaned_train_texts)
vectorized_test_texts = my_vectorizer.fit_transform(cleaned_test_texts)

## Unsupervised Topic Extraction

In [200]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [202]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### LSA

In [197]:
lsa_model = TruncatedSVD(n_components=150, random_state=40)
lsa_model.fit(vectorized_train_texts)

In [199]:
lsa_train_texts = lsa_model.transform(vectorized_train_texts)
lsa_test_texts = lsa_model.transform(vectorized_test_texts)

In [201]:
display_topics(lsa_model,my_vectorizer.get_feature_names(),20) # We have to look at the topics before hand and then add the labels afterwards


Topic  0
paying, jobs, dont, level, theyve, main, way, want, theyre, thought, read, illegal, save, kind, goal, discussion, test, youve, global, period

Topic  1
paying, birth, willing, games, total, quality, game, got, putting, compared, identify, points, solve, question, happy, cut, ideas, politics, cultural, run

Topic  2
goes, minimum, true, vote, wage, military, talk, past, main, court, status, income, talking, work, economy, statement, parts, come, safety, poverty

Topic  3
true, vote, dont, come, level, hell, high, parts, theyve, pregnancy, came, policy, safety, end, voting, religious, illegal, remember, democrats, donald

Topic  4
level, jobs, future, gain, plan, paying, thought, arms, play, players, concept, goal, video, planet, tax, negative, fun, minimum, theres, work

Topic  5
games, womens, jobs, meant, total, wage, military, minimum, level, goes, talk, women, sex, identify, risk, majority, past, work, female, income

Topic  6
birth, willing, jobs, level, quality, child, t

#### LDA

In [223]:
n_topics = 150
n_iter = 13
lda_model = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=n_iter,
                                random_state=42,
                               learning_method='online')
lda_train_texts = lda_model.fit_transform(vectorized_train_texts)
lda_test_texts = lda_model.transform(vectorized_test_texts)

In [224]:
lda_test_texts = lda_model.transform(vectorized_test_texts)

In [226]:
display_topics(lda_model,my_vectorizer.get_feature_names(),20) # We have to look at the topics before hand and then add the labels afterwards


Topic  0
minimum, valid, bunch, crime, president, major, convince, probably, demand, crimes, problem, self, criminal, past, service, instance, problems, iq, private, arent

Topic  1
people, human, internet, zero, income, debt, held, significant, various, turn, actions, transgender, effort, directly, individuals, talk, bunch, voting, union, high

Topic  2
bring, species, rest, wanted, protect, benefits, straight, space, poor, team, build, allow, state, trust, healthcare, strong, parts, america, start, chance

Topic  3
candidates, running, career, drive, theyll, level, paying, dont, saw, cases, truth, theyve, main, lets, generally, jobs, use, comments, treatment, safe

Topic  4
news, identity, experiences, ability, generally, poverty, member, primary, says, playing, born, ive, city, goes, takes, loss, rule, politics, lets, open

Topic  5
left, fun, concept, fighting, tax, standard, vs, specific, computer, advantage, election, woman, play, legal, goal, understanding, set, expect, major, 

In [225]:
reduced_train_texts = lda_train_texts
reduced_test_texts = lda_test_texts

## Supervised Classification

In [None]:
# undersampling undeltad observations:

In [214]:
from sklearn.metrics import confusion_matrix, roc_auc_score

In [215]:
from sklearn.svm import SVC

In [230]:
svc_model = SVC(class_weight='balanced', random_state=42)
svc_model.fit(reduced_train_texts, train_doc_labels)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [231]:
predicted_doc_labels = svc_model.predict(reduced_test_texts)
conf_mat = confusion_matrix(test_doc_labels, predicted_doc_labels)
print(conf_mat)

[[   0 2720]
 [   0  391]]


In [232]:
print('mean accuracy is {}'.format(svc_model.score(reduced_test_texts, test_doc_labels)))

mean accuracy is 0.12568306010928962


In [219]:
from xgboost import XGBClassifier

In [227]:
xg_model = XGBClassifier(scale_pos_weight=7)
xg_model.fit(reduced_train_texts, train_doc_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=7, seed=None,
       silent=True, subsample=1)

In [228]:
print('train set mean accuracy is {}'.format(xg_model.score(reduced_train_texts, train_doc_labels)))


train set mean accuracy is 0.680032141422258


  if diff:


In [229]:
print('mean accuracy is {}'.format(xg_model.score(reduced_test_texts, test_doc_labels)))
predicted_doc_labels = xg_model.predict(reduced_test_texts)
conf_mat = confusion_matrix(test_doc_labels, predicted_doc_labels)
print(conf_mat)

mean accuracy is 0.6570234651237544
[[1857  863]
 [ 204  187]]


  if diff:
  if diff:


In [76]:
#from sklearn.cluster import DBSCAN

In [77]:
#dbscan_model = DBSCAN(eps=0.5, min_samples=5, metric=’euclidean’, metric_params=None, algorithm=’auto’, leaf_size=30, p=None, n_jobs=1)

In [33]:
#from sklearn.linear_model import LogisticRegression

In [34]:
#logreg = LogisticRegression(random_state=42, class_weight='balanced')
#logreg.fit(reduced_train_posts, train_post_labels)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [35]:
#predicted_test_labels = logreg.predict(reduced_test_posts)

In [38]:
#print('mean accuracy is {}'.format(logreg.score(reduced_test_posts, test_post_labels)))
#conf_mat = confusion_matrix(test_post_labels, predicted_test_labels)
#print(conf_mat)

mean accuracy is 0.6362240982348427
[[787 287]
 [187  42]]
