In [None]:
"""
preprocess post title text of all r/technews posts 2011-2020
"""

In [5]:
import pandas as pd
import numpy as np
import pickle
import datetime

import re
import string

In [6]:
import nltk
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [7]:
with open('filtered_technews_posts.pickle', 'rb') as read_file:
    posts = pickle.load(read_file)
    
posts.shape

(48082, 13)

In [8]:
posts.head()

Unnamed: 0,id,created_utc,date_time,author,author_fullname,title,url,domain,num_comments,score,permalink,full_link,refine_title
0,29j6ov,1404184792,2014-07-01 03:19:52,Password_is_123456,t2_6crtu,New leaks show off the iPhone 6's curved edged...,http://www.cnet.com/news/new-leaks-show-off-up...,cnet.com,0,0,/r/technews/comments/29j6ov/new_leaks_show_off...,https://www.reddit.com/r/technews/comments/29j...,New leaks show off the iPhone 6's curved edged...
1,29j6op,1404184791,2014-07-01 03:19:51,Password_is_123456,t2_6crtu,Millions of dynamic DNS users suffer after Mic...,http://arstechnica.com/security/2014/06/millio...,arstechnica.com,6,63,/r/technews/comments/29j6op/millions_of_dynami...,https://www.reddit.com/r/technews/comments/29j...,Millions of dynamic DNS users suffer after Mic...
2,29j2c1,1404181722,2014-07-01 02:28:42,[deleted],,Paypal Freezes ProtonMail Campaign Funds,https://protonmail.ch/blog/paypal-freezes-prot...,protonmail.ch,0,1,/r/technews/comments/29j2c1/paypal_freezes_pro...,https://www.reddit.com/r/technews/comments/29j...,Paypal Freezes ProtonMail Campaign Funds
3,29hy6f,1404156644,2014-06-30 19:30:44,ANIMAL_NewYork,t2_h1mjr,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,http://animalnewyork.com/2014/facebook-emotion...,animalnewyork.com,0,1,/r/technews/comments/29hy6f/unethical_facebook...,https://www.reddit.com/r/technews/comments/29h...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...
4,29hvyr,1404155395,2014-06-30 19:09:55,Password_is_123456,t2_6crtu,Microsoft May Prioritize The Desktop In Windows 9,http://techcrunch.com/2014/06/30/microsoft-may...,techcrunch.com,23,53,/r/technews/comments/29hvyr/microsoft_may_prio...,https://www.reddit.com/r/technews/comments/29h...,Microsoft May Prioritize The Desktop In Windows 9


In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Text preprocessing steps - remove numbers and punctuation

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_remove = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x)

posts['preproc_title'] = posts['refine_title'].map(alphanumeric).map(punc_remove)
posts.head()

Unnamed: 0,id,created_utc,date_time,author,author_fullname,title,url,domain,num_comments,score,permalink,full_link,refine_title,preproc_title
0,29j6ov,1404184792,2014-07-01 03:19:52,Password_is_123456,t2_6crtu,New leaks show off the iPhone 6's curved edged...,http://www.cnet.com/news/new-leaks-show-off-up...,cnet.com,0,0,/r/technews/comments/29j6ov/new_leaks_show_off...,https://www.reddit.com/r/technews/comments/29j...,New leaks show off the iPhone 6's curved edged...,New leaks show off the iPhone s curved edged...
1,29j6op,1404184791,2014-07-01 03:19:51,Password_is_123456,t2_6crtu,Millions of dynamic DNS users suffer after Mic...,http://arstechnica.com/security/2014/06/millio...,arstechnica.com,6,63,/r/technews/comments/29j6op/millions_of_dynami...,https://www.reddit.com/r/technews/comments/29j...,Millions of dynamic DNS users suffer after Mic...,Millions of dynamic DNS users suffer after Mic...
2,29j2c1,1404181722,2014-07-01 02:28:42,[deleted],,Paypal Freezes ProtonMail Campaign Funds,https://protonmail.ch/blog/paypal-freezes-prot...,protonmail.ch,0,1,/r/technews/comments/29j2c1/paypal_freezes_pro...,https://www.reddit.com/r/technews/comments/29j...,Paypal Freezes ProtonMail Campaign Funds,Paypal Freezes ProtonMail Campaign Funds
3,29hy6f,1404156644,2014-06-30 19:30:44,ANIMAL_NewYork,t2_h1mjr,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,http://animalnewyork.com/2014/facebook-emotion...,animalnewyork.com,0,1,/r/technews/comments/29hy6f/unethical_facebook...,https://www.reddit.com/r/technews/comments/29h...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...
4,29hvyr,1404155395,2014-06-30 19:09:55,Password_is_123456,t2_6crtu,Microsoft May Prioritize The Desktop In Windows 9,http://techcrunch.com/2014/06/30/microsoft-may...,techcrunch.com,23,53,/r/technews/comments/29hvyr/microsoft_may_prio...,https://www.reddit.com/r/technews/comments/29h...,Microsoft May Prioritize The Desktop In Windows 9,Microsoft May Prioritize The Desktop In Windows


In [11]:
## stem the title text

#lemma = nltk.stem.WordNetLemmatizer()  ## this only made some of the plural words singular

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

posts['unstemmed'] = posts.preproc_title.str.split()
#posts['stem_title'] = posts.unstemmed.apply(lambda x: [lemma.lemmatize(y) for y in x])

posts['stem_title'] = posts.unstemmed.apply(lambda x: [stemmer.stem(y) for y in x])

In [12]:
posts['processed_title'] = posts['stem_title'].apply(lambda x: ' '.join(x))

posts.head()

Unnamed: 0,id,created_utc,date_time,author,author_fullname,title,url,domain,num_comments,score,permalink,full_link,refine_title,preproc_title,unstemmed,stem_title,processed_title
0,29j6ov,1404184792,2014-07-01 03:19:52,Password_is_123456,t2_6crtu,New leaks show off the iPhone 6's curved edged...,http://www.cnet.com/news/new-leaks-show-off-up...,cnet.com,0,0,/r/technews/comments/29j6ov/new_leaks_show_off...,https://www.reddit.com/r/technews/comments/29j...,New leaks show off the iPhone 6's curved edged...,New leaks show off the iPhone s curved edged...,"[New, leaks, show, off, the, iPhone, s, curved...","[new, leak, show, off, the, iphon, s, curv, ed...",new leak show off the iphon s curv edg display
1,29j6op,1404184791,2014-07-01 03:19:51,Password_is_123456,t2_6crtu,Millions of dynamic DNS users suffer after Mic...,http://arstechnica.com/security/2014/06/millio...,arstechnica.com,6,63,/r/technews/comments/29j6op/millions_of_dynami...,https://www.reddit.com/r/technews/comments/29j...,Millions of dynamic DNS users suffer after Mic...,Millions of dynamic DNS users suffer after Mic...,"[Millions, of, dynamic, DNS, users, suffer, af...","[million, of, dynam, dns, user, suffer, after,...",million of dynam dns user suffer after microso...
2,29j2c1,1404181722,2014-07-01 02:28:42,[deleted],,Paypal Freezes ProtonMail Campaign Funds,https://protonmail.ch/blog/paypal-freezes-prot...,protonmail.ch,0,1,/r/technews/comments/29j2c1/paypal_freezes_pro...,https://www.reddit.com/r/technews/comments/29j...,Paypal Freezes ProtonMail Campaign Funds,Paypal Freezes ProtonMail Campaign Funds,"[Paypal, Freezes, ProtonMail, Campaign, Funds]","[paypal, freez, protonmail, campaign, fund]",paypal freez protonmail campaign fund
3,29hy6f,1404156644,2014-06-30 19:30:44,ANIMAL_NewYork,t2_h1mjr,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,http://animalnewyork.com/2014/facebook-emotion...,animalnewyork.com,0,1,/r/technews/comments/29hy6f/unethical_facebook...,https://www.reddit.com/r/technews/comments/29h...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,"[UNETHICAL, FACEBOOK, “EMOTIONAL, CONTAGION”, ...","[uneth, facebook, “emot, contagion”, studi, no...",uneth facebook “emot contagion” studi not fund...
4,29hvyr,1404155395,2014-06-30 19:09:55,Password_is_123456,t2_6crtu,Microsoft May Prioritize The Desktop In Windows 9,http://techcrunch.com/2014/06/30/microsoft-may...,techcrunch.com,23,53,/r/technews/comments/29hvyr/microsoft_may_prio...,https://www.reddit.com/r/technews/comments/29h...,Microsoft May Prioritize The Desktop In Windows 9,Microsoft May Prioritize The Desktop In Windows,"[Microsoft, May, Prioritize, The, Desktop, In,...","[microsoft, may, priorit, the, desktop, in, wi...",microsoft may priorit the desktop in window


In [13]:
# will try topic modeling with only nouns
# POS tag to keep nouns

def keep_nouns(title_text):
    tokens = word_tokenize(title_text) 
    tags = pos_tag(tokens) # this labels each word as a part of speech
    nouns = [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    return ' '.join(nouns)

In [14]:
posts['title_nouns'] = posts['preproc_title'].apply(keep_nouns)
posts.head()

Unnamed: 0,id,created_utc,date_time,author,author_fullname,title,url,domain,num_comments,score,permalink,full_link,refine_title,preproc_title,unstemmed,stem_title,processed_title,title_nouns
0,29j6ov,1404184792,2014-07-01 03:19:52,Password_is_123456,t2_6crtu,New leaks show off the iPhone 6's curved edged...,http://www.cnet.com/news/new-leaks-show-off-up...,cnet.com,0,0,/r/technews/comments/29j6ov/new_leaks_show_off...,https://www.reddit.com/r/technews/comments/29j...,New leaks show off the iPhone 6's curved edged...,New leaks show off the iPhone s curved edged...,"[New, leaks, show, off, the, iPhone, s, curved...","[new, leak, show, off, the, iphon, s, curv, ed...",new leak show off the iphon s curv edg display,New leaks iPhone s display
1,29j6op,1404184791,2014-07-01 03:19:51,Password_is_123456,t2_6crtu,Millions of dynamic DNS users suffer after Mic...,http://arstechnica.com/security/2014/06/millio...,arstechnica.com,6,63,/r/technews/comments/29j6op/millions_of_dynami...,https://www.reddit.com/r/technews/comments/29j...,Millions of dynamic DNS users suffer after Mic...,Millions of dynamic DNS users suffer after Mic...,"[Millions, of, dynamic, DNS, users, suffer, af...","[million, of, dynam, dns, user, suffer, after,...",million of dynam dns user suffer after microso...,Millions DNS users Microsoft IP domains
2,29j2c1,1404181722,2014-07-01 02:28:42,[deleted],,Paypal Freezes ProtonMail Campaign Funds,https://protonmail.ch/blog/paypal-freezes-prot...,protonmail.ch,0,1,/r/technews/comments/29j2c1/paypal_freezes_pro...,https://www.reddit.com/r/technews/comments/29j...,Paypal Freezes ProtonMail Campaign Funds,Paypal Freezes ProtonMail Campaign Funds,"[Paypal, Freezes, ProtonMail, Campaign, Funds]","[paypal, freez, protonmail, campaign, fund]",paypal freez protonmail campaign fund,Paypal Freezes ProtonMail Campaign Funds
3,29hy6f,1404156644,2014-06-30 19:30:44,ANIMAL_NewYork,t2_h1mjr,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,http://animalnewyork.com/2014/facebook-emotion...,animalnewyork.com,0,1,/r/technews/comments/29hy6f/unethical_facebook...,https://www.reddit.com/r/technews/comments/29h...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,UNETHICAL FACEBOOK “EMOTIONAL CONTAGION” STUDY...,"[UNETHICAL, FACEBOOK, “EMOTIONAL, CONTAGION”, ...","[uneth, facebook, “emot, contagion”, studi, no...",uneth facebook “emot contagion” studi not fund...,FACEBOOK “ EMOTIONAL CONTAGION ” STUDY NOT FUN...
4,29hvyr,1404155395,2014-06-30 19:09:55,Password_is_123456,t2_6crtu,Microsoft May Prioritize The Desktop In Windows 9,http://techcrunch.com/2014/06/30/microsoft-may...,techcrunch.com,23,53,/r/technews/comments/29hvyr/microsoft_may_prio...,https://www.reddit.com/r/technews/comments/29h...,Microsoft May Prioritize The Desktop In Windows 9,Microsoft May Prioritize The Desktop In Windows,"[Microsoft, May, Prioritize, The, Desktop, In,...","[microsoft, may, priorit, the, desktop, in, wi...",microsoft may priorit the desktop in window,Microsoft May Desktop Windows


In [15]:
with open('preprocessed_posts.pickle', 'wb') as to_write:
    pickle.dump(posts, to_write)