In [23]:
#nltk
#pip install any packages you don't have
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
import re, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import calinski_harabaz_score

from collections import Counter

In [63]:
df = pd.read_csv('df_with_gensim_summaries.csv')

In [64]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1.1'], axis = 1, inplace = True)

In [65]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",And never more so than in Showtime’s new serie...,[' And never more so than in Showtime’s n...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,AlphaGo’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,How a weapon against war became a weapon again...,[' How a weapon against war became a weap...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",Genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,Inside the test flight of Facebook’s first int...,[' Inside the test flight of Facebook’s f...


<h3>Tokenize

In [6]:
df.first_100 = df.first_100.str.lower()

In [7]:
df['tokenized_first_100'] = df.first_100.apply(lambda x: word_tokenize(x, language = 'en'))

In [8]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ..."
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,..."
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap..."
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its..."
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s..."


<h3>Remove Stop Words

In [9]:
stops = list(set(stopwords.words('english'))) + list(punctuation) + ['s', "'", 't', 'and', '"', 'a', 'or', '/', 'in',
                                                                    'for', '&', '-', "''"]

In [488]:
#df.head()

In [11]:
#function to remove stop words
def remove_stops(text):
    text_no_stops = []
    for i in text:
        if i not in stops:
            if len(i) == 1:
                pass
            else:
                text_no_stops.append(i)
        else:
            pass
    return text_no_stops

In [12]:
df['first_100_no_stops'] = df['tokenized_first_100'].apply(lambda x: remove_stops(x))

In [155]:
#verify that it worked
#df.head()

<h3>Lemmatization

In [13]:
#initialize WordNetLemmatizer class
lemmatizer = nltk.stem.WordNetLemmatizer()

In [14]:
#function to lemmatize text
def lemmatize_text(text):
    lemmatized = []
    for word in text:
        lemmatized.append(lemmatizer.lemmatize(word))
    return lemmatized
        

In [15]:
df['lemmatize_first_100'] = df['first_100_no_stops'].apply(lemmatize_text)

In [16]:
df['lemmatize_first_100'] = df['lemmatize_first_100'].apply(lambda x: ' '.join(x))

In [17]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ...","[never, showtime, new, series, revival, spoile...",never showtime new series revival spoiler ahea...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,...","[alphago, victory, defeat, humans, opportunity...",alphago victory defeat human opportunity loss ...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap...","[weapon, war, became, weapon, web, every, year...",weapon war became weapon web every year artist...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its...","[genius, quietly, laid, bunch, engineers, surv...",genius quietly laid bunch engineer survive med...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s...","[inside, test, flight, facebook, first, intern...",inside test flight facebook first internet dro...


In [18]:
#df.to_csv('df_with_lemmings.csv')

<h3>KMEANS CLUSTERING

In [89]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from langdetect import detect
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
import ast

In [19]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ...","[never, showtime, new, series, revival, spoile...",never showtime new series revival spoiler ahea...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,...","[alphago, victory, defeat, humans, opportunity...",alphago victory defeat human opportunity loss ...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap...","[weapon, war, became, weapon, web, every, year...",weapon war became weapon web every year artist...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its...","[genius, quietly, laid, bunch, engineers, surv...",genius quietly laid bunch engineer survive med...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s...","[inside, test, flight, facebook, first, intern...",inside test flight facebook first internet dro...


<h3> Detect languages of articles

In [24]:
df['language'] = df['lemmatize_first_100'].apply(detect)

In [26]:
df.groupby('language').count()

Unnamed: 0_level_0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
en,97038,97038,84941,97038,97038,97038,97038,97038,97038
es,4,4,2,4,4,4,4,4,4
fr,19,19,19,19,19,19,19,19,19
it,2,2,2,2,2,2,2,2,2
ko,1,1,1,1,1,1,1,1,1


In [29]:
#drop rows that are not english
df = df.loc[df['language'] == 'en']

In [34]:
#df.to_csv('df_english_articles.csv')

<h3>Modeling

In [345]:
df = pd.read_csv('df_english_articles.csv')

In [None]:
#df.head()

In [336]:
stemmer = PorterStemmer()

In [421]:
test = ['words', 'word', 'running', 'ran']

In [441]:
#create function to stem each word in a list and concat the list
def stem_list(lst):
    stemmed_list = []
    for i in lst:
        stemmed_list.append(stemmer.stem(i))
    stem_string = ' '.join(stemmed_list)
    return stem_string

In [436]:
#convert list contained in string to a regular list so it can be stemmed
df['stemmed'] = df["first_100_no_stops"].apply(lambda x: ast.literal_eval(x))

In [442]:
#stem words in list
df['stemmed'] = df["stemmed"].apply(lambda x: stem_list(x))


In [450]:
#verify that it worked
#df.head()

In [447]:
df = df[~df['stemmed'].str.contains("archiveteam.org contain", case=False)]

In [448]:
df.shape

(95790, 12)

In [462]:
#CHECKPOINT --- SAVE TO CSV
#df.to_csv('df_with_stems_final.csv')

In [68]:
#CHECKPOINT --- RUN TO OPEN CSV IF STARTING WORK HERE
#df = pd.read_csv('df_with_stems_final.csv')
#df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace = True)

In [69]:
#df.head()

In [70]:
documents = df['stemmed'].to_list()

In [71]:
documents[:3]

['never showtim new seri reviv spoiler ahead episod season twin peak may 21st showtim brought back david lynch groundbreak tv seri twin peak fulfil propheci process second season final back 1991 spirit series-defin murder victim laura palmer told fbi special agent seri protagonist dale cooper see 25 years. clip play first episod lynch twin peak reviv remind decad fact gone laura promis',
 'alphago victori defeat human opportun loss human man succumb machin heard alphago latest exploit last week crush world best go player confirm artifici intellig master ancient chines board game may heard news deliv doomsday terms.ther certain melancholi ke jie capitul sure 19-year-old chines prodigi declar would never lose ai follow alphago earthshak victori lee se-dol last year see onstag last week nearli bent doubl',
 'weapon war becam weapon web everi year artist technolog enthusiast meet linz austria ar electronica festiv meetup citi downtown locat danub river festiv eye toward futur someth burn m

In [72]:
#function to vectorize strings and perform tf-idf transformation
def vectorize_texts(list_of_strings):
    print('Performing vectorization and TF/IDF transformation on texts...')
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(list_of_strings)
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf = transformer.fit_transform(X)
    return tfidf

In [73]:
def cluster_texts(num_clusters, tfidf):
    #perform kmeans clustering for range of clusters
    print('Beginning KMeans Clustering, number of clusters = ', num_clusters, '\n') 
    km = KMeans(n_clusters=num_clusters, max_iter = 100, verbose = 2, n_init = 1).fit(tfidf)
    
    
    return km

<h3>Run Clustering for range of K's

In [74]:
documents_vectorized = vectorize_texts(documents)

Performing vectorization and TF/IDF transformation on texts...


In [75]:
#kmeans3 = cluster_texts(3, documents_vectorized)

In [76]:
#kmeans3.labels_

In [77]:
#test = documents_vectorized.toarray()

In [78]:
#score = calinski_harabaz_score(test, kmeans3.labels_)

In [79]:
#kmeans4= cluster_texts(4, documents_vectorized)

In [80]:
#kmeans5= cluster_texts(5, documents_vectorized)

In [81]:
#kmeans6= cluster_texts(6, documents_vectorized)

In [82]:
#kmeans7= cluster_texts(7, documents_vectorized)

In [83]:
#kmeans8= cluster_texts(8, documents_vectorized)

In [84]:
#kmeans9= cluster_texts(9, documents_vectorized)

In [85]:
#kmeans10= cluster_texts(10, documents_vectorized)

In [87]:
#kmeans11= cluster_texts(11, documents_vectorized)

In [90]:
kmeans12= cluster_texts(12, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  12 

Initialization complete
Iteration  0, inertia 180678.661
Iteration  1, inertia 93745.651
Iteration  2, inertia 93285.612
Iteration  3, inertia 93118.425
Iteration  4, inertia 93042.718
Iteration  5, inertia 93003.390
Iteration  6, inertia 92984.769
Iteration  7, inertia 92975.315
Iteration  8, inertia 92967.950
Iteration  9, inertia 92959.559
Iteration 10, inertia 92954.307
Iteration 11, inertia 92950.018
Iteration 12, inertia 92946.236
Iteration 13, inertia 92942.656
Iteration 14, inertia 92939.529
Iteration 15, inertia 92937.248
Iteration 16, inertia 92935.565
Iteration 17, inertia 92934.119
Iteration 18, inertia 92932.889
Iteration 19, inertia 92931.760
Iteration 20, inertia 92930.751
Iteration 21, inertia 92929.731
Iteration 22, inertia 92928.570
Iteration 23, inertia 92926.964
Iteration 24, inertia 92924.923
Iteration 25, inertia 92922.680
Iteration 26, inertia 92920.272
Iteration 27, inertia 92918.159
Iteration 28, inertia 9

In [91]:
import pickle

In [92]:
#save kmeans12 model for further use
pickle.dump(kmeans12, open("save.pkl", "wb"))

In [None]:
#load back in kmeans12 model
#kmeans = pickle.load(open("save.pkl", "rb"))

In [None]:
sample = """Drivers don’t always realize that they may be overpaying for car insurance. If you haven't compared quotes
recently, even if you have a low rate, you could still be paying too much. Fortunately, millions of smart drivers have
used EverQuote™'s free service to save hundreds on their insurance bills. It’s really no wonder that with so many 
drivers saving money, EverQuote™ is gaining momentum. EverQuote™ is an efficient source that tries to give consumers
the lowest rates with tools you can trust. Just imagine what you could do with the money you save!"""

In [34]:
k_list = [kmeans3, kmeans4, kmeans5, kmeans6, kmeans7, kmeans8, kmeans9, kmeans10, kmeans11, kmeans12]

In [32]:
kmeans3.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [30]:
kmeans_df = pd.DataFrame()

In [36]:
for i in k_list:
    labels = i.labels_
    kmeans_df[i] = labels

In [38]:
kmeans_df.columns = ['kmeans3', 'kmeans4', 'kmeans5', 'kmeans6', 'kmeans7', 'kmeans8', 'kmeans9', 'kmeans10', 'kmeans11', 'kmeans12']

In [98]:
kmeans_df['kmeans12'] = kmeans12.labels_

In [50]:
kmeans_df['stemmed'] = df['stemmed']

In [51]:
print(df.shape)
print(kmeans_df.shape)

(95790, 11)
(95790, 11)


In [99]:
kmeans_df.head()

Unnamed: 0.1,Unnamed: 0,kmeans3,kmeans4,kmeans5,kmeans6,kmeans7,kmeans8,kmeans9,kmeans10,kmeans11,kmeans12,stemmed
0,0,0,0,1,0,6,4,6,8,9,3,never showtim new seri reviv spoiler ahead epi...
1,1,0,0,1,0,6,4,6,8,9,3,alphago victori defeat human opportun loss hum...
2,2,0,0,1,0,6,4,2,3,9,3,weapon war becam weapon web everi year artist ...
3,3,0,1,2,0,5,1,4,9,5,10,geniu quietli laid bunch engin surviv media co...
4,4,0,0,1,0,6,4,2,3,9,3,insid test flight facebook first internet dron...


In [100]:
#CHECKPOINT --- SAVE TO CSV TO AVOID RUNNING KMEANS FUNCTIONS AGAIN
kmeans_df.to_csv('kmeans_df.csv')

In [3]:
kmeans_df = pd.read_csv('kmeans_df.csv')

In [None]:
plt.plot([3, 4, 5, 6, 7], CH_score)
plt.xticks([3,4,5,6,7])
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

<h3>Check Clusters for K's

In [4]:
kmeans_df['kmeans3'].value_counts()

0    55197
1    20409
2    20184
Name: kmeans3, dtype: int64

In [101]:
kmeans_df['kmeans12'].value_counts()

3     38169
0     18915
1      9671
10     6120
11     6033
2      4171
4      3634
6      2185
5      2171
8      1883
7      1869
9       969
Name: kmeans12, dtype: int64

In [107]:
def get_most_common_words(num_words):
    common_words = []
    for i in range(0,12):
        common = Counter(" ".join(kmeans_df.loc[kmeans_df['kmeans12'] == i]['stemmed']).split()).most_common(num_words)
        for j in common:
            dict_ = {}
            dict_['cluster'] = i
            dict_['word'] = j[0]
            common_words.append(dict_)
            
    return common_words    
            

In [116]:
get_most_common_words(20)

[{'cluster': 0, 'word': 'state'},
 {'cluster': 0, 'word': 'said'},
 {'cluster': 0, 'word': "'s"},
 {'cluster': 0, 'word': 'new'},
 {'cluster': 0, 'word': 'year'},
 {'cluster': 0, 'word': 'presid'},
 {'cluster': 0, 'word': 'peopl'},
 {'cluster': 0, 'word': 'nation'},
 {'cluster': 0, 'word': '``'},
 {'cluster': 0, 'word': 'one'},
 {'cluster': 0, 'word': 'unit'},
 {'cluster': 0, 'word': 'countri'},
 {'cluster': 0, 'word': 'govern'},
 {'cluster': 0, 'word': 'would'},
 {'cluster': 0, 'word': 'report'},
 {'cluster': 0, 'word': 'two'},
 {'cluster': 0, 'word': 'american'},
 {'cluster': 0, 'word': 'last'},
 {'cluster': 0, 'word': 'u.s.'},
 {'cluster': 0, 'word': 'offici'},
 {'cluster': 1, 'word': 'trump'},
 {'cluster': 1, 'word': 'presid'},
 {'cluster': 1, 'word': 'donald'},
 {'cluster': 1, 'word': 'said'},
 {'cluster': 1, 'word': "'s"},
 {'cluster': 1, 'word': 'mr.'},
 {'cluster': 1, 'word': '``'},
 {'cluster': 1, 'word': 'would'},
 {'cluster': 1, 'word': 'new'},
 {'cluster': 1, 'word': 'white

In [47]:
Counter(" ".join(kmeans_df.loc[kmeans_df['kmeans12'] == 9]['stemmed']).split()).most_common(10)

[('polic', 5375),
 ('offic', 2748),
 ('said', 2244),
 ('kill', 1398),
 ('man', 1101),
 ('shoot', 975),
 ('peopl', 967),
 ("'s", 935),
 ('shot', 927),
 ('attack', 915)]

In [111]:
kmeans_df.loc[kmeans_df['kmeans12'] == 9]['stemmed'].values

array(['san francisco digit ad pop onlin frequent ubiquit mani peopl use softwar block tri stop ad show facebook desktop websit luck social network found way block ad blocker tuesday facebook flip switch desktop websit essenti render ad blocker program prevent websit display ad page user visit site useless chang allow silicon valley compani serv ad',
       'facebook adblock plu enter digit war follow facebook announc tuesday ad-block would longer work social network tuesday facebook announc blog post would take anoth approach advert ad-block releas new way user filter ad along updat would make ad-block useless social network advertis relev well-mad ad use help us find new product servic introduc us new experi like ad show favorit band come',
       'articl part featur also send via email polit amp polici daili daili roundup event idea american polit written special newslett subscrib sign pleas enter email address field provid russian left two vacat compound outsid washington d.c. new 

<h3>Merge Kmeans12 into original df
    

In [113]:
df_clusters = pd.concat([df, kmeans_df['kmeans12']], axis = 1, sort = False)

In [115]:
df_clusters.head(100)

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100,language,stemmed,kmeans12
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"['and', 'never', 'more', 'so', 'than', 'in', '...","['never', 'showtime', 'new', 'series', 'reviva...",never showtime new series revival spoiler ahea...,en,never showtim new seri reviv spoiler ahead epi...,3
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"['alphago', '’', 's', 'victory', 'isn', '’', '...","['alphago', 'victory', 'defeat', 'humans', 'op...",alphago victory defeat human opportunity loss ...,en,alphago victori defeat human opportun loss hum...,3
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"['how', 'a', 'weapon', 'against', 'war', 'beca...","['weapon', 'war', 'became', 'weapon', 'web', '...",weapon war became weapon web every year artist...,en,weapon war becam weapon web everi year artist ...,3
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"['genius', 'quietly', 'laid', 'off', 'a', 'bun...","['genius', 'quietly', 'laid', 'bunch', 'engine...",genius quietly laid bunch engineer survive med...,en,geniu quietli laid bunch engin surviv media co...,10
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"['inside', 'the', 'test', 'flight', 'of', 'fac...","['inside', 'test', 'flight', 'facebook', 'firs...",inside test flight facebook first internet dro...,en,insid test flight facebook first internet dron...,3
5,E-Waste Empire,Gadget shopping? Chances are that as soo...,Longform,"In the US, we threw away 16 billion pounds of ...",gadget shopping? chances are that as soon as y...,"[' Gadget shopping?', 'Chances are that a...","['gadget', 'shopping', '?', 'chances', 'are', ...","['gadget', 'shopping', 'chances', 'soon', 'plu...",gadget shopping chance soon plunk cash new sma...,en,gadget shop chanc soon plunk cash new smartpho...,3
6,Xbox: Start to Continue,It was a rare sunny day in Seattle and P...,Longform,"Microsoft’s approach starts with hardware, whe...",it was a rare sunny day in seattle and phil sp...,[' It was a rare sunny day in Seattle and...,"['it', 'was', 'a', 'rare', 'sunny', 'day', 'in...","['rare', 'sunny', 'day', 'seattle', 'phil', 's...",rare sunny day seattle phil spencer seemed ple...,en,rare sunni day seattl phil spencer seem pleas ...,3
7,On set with Arnold: Can Schwarzenegger bring b...,Terminator ResurrectedOn set with A...,Longform,"It all looks strange, but then you begin to se...",terminator resurrectedon set with arnold for w...,[' Terminator ResurrectedOn set with...,"['terminator', 'resurrectedon', 'set', 'with',...","['terminator', 'resurrectedon', 'set', 'arnold...",terminator resurrectedon set arnold could best...,en,termin resurrectedon set arnold could best ter...,3
8,Can we cure the common hangover?,We start with shots of Jameson at 9:45PM...,Longform,Even though about three-quarters of people who...,"we start with shots of jameson at 9:45pm, beca...",[' We start with shots of Jameson at 9:45...,"['we', 'start', 'with', 'shots', 'of', 'jameso...","['start', 'shots', 'jameson', '9:45pm', 'las',...",start shot jameson 9:45pm la vega feel like pl...,en,start shot jameson 9:45pm la vega feel like pl...,3
9,Surrounded by sound: how 3D audio hacks your b...,"On a crisp afternoon late last year, I made ...",Longform,"You’ll hear a whisper in one ear.""""It puts you...","on a crisp afternoon late last year, i made my...","[' On a crisp afternoon late last year, I mad...","['on', 'a', 'crisp', 'afternoon', 'late', 'las...","['crisp', 'afternoon', 'late', 'last', 'year',...",crisp afternoon late last year made way manhat...,en,crisp afternoon late last year made way manhat...,3
