In [6]:
#nltk
#pip install any packages you don't have
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
import re, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import calinski_harabaz_score

AttributeError: module 'nltk' has no attribute 'data'

In [3]:
df = pd.read_csv('df_with_gensim_summaries.csv')

In [4]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1.1'], axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",And never more so than in Showtime’s new serie...,[' And never more so than in Showtime’s n...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,AlphaGo’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,How a weapon against war became a weapon again...,[' How a weapon against war became a weap...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",Genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,Inside the test flight of Facebook’s first int...,[' Inside the test flight of Facebook’s f...


<h3>Tokenize

In [6]:
df.first_100 = df.first_100.str.lower()

In [7]:
df['tokenized_first_100'] = df.first_100.apply(lambda x: word_tokenize(x, language = 'en'))

In [8]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ..."
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,..."
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap..."
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its..."
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s..."


<h3>Remove Stop Words

In [9]:
stops = list(set(stopwords.words('english'))) + list(punctuation) + ['s', "'", 't', 'and', '"', 'a', 'or', '/', 'in',
                                                                    'for', '&', '-', "''"]

In [488]:
#df.head()

In [11]:
#function to remove stop words
def remove_stops(text):
    text_no_stops = []
    for i in text:
        if i not in stops:
            if len(i) == 1:
                pass
            else:
                text_no_stops.append(i)
        else:
            pass
    return text_no_stops

In [12]:
df['first_100_no_stops'] = df['tokenized_first_100'].apply(lambda x: remove_stops(x))

In [155]:
#verify that it worked
#df.head()

<h3>Lemmatization

In [13]:
#initialize WordNetLemmatizer class
lemmatizer = nltk.stem.WordNetLemmatizer()

In [14]:
#function to lemmatize text
def lemmatize_text(text):
    lemmatized = []
    for word in text:
        lemmatized.append(lemmatizer.lemmatize(word))
    return lemmatized
        

In [15]:
df['lemmatize_first_100'] = df['first_100_no_stops'].apply(lemmatize_text)

In [16]:
df['lemmatize_first_100'] = df['lemmatize_first_100'].apply(lambda x: ' '.join(x))

In [17]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ...","[never, showtime, new, series, revival, spoile...",never showtime new series revival spoiler ahea...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,...","[alphago, victory, defeat, humans, opportunity...",alphago victory defeat human opportunity loss ...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap...","[weapon, war, became, weapon, web, every, year...",weapon war became weapon web every year artist...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its...","[genius, quietly, laid, bunch, engineers, surv...",genius quietly laid bunch engineer survive med...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s...","[inside, test, flight, facebook, first, intern...",inside test flight facebook first internet dro...


In [18]:
#df.to_csv('df_with_lemmings.csv')

<h3>KMEANS CLUSTERING

In [425]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from langdetect import detect
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
import ast

In [19]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ...","[never, showtime, new, series, revival, spoile...",never showtime new series revival spoiler ahea...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,...","[alphago, victory, defeat, humans, opportunity...",alphago victory defeat human opportunity loss ...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap...","[weapon, war, became, weapon, web, every, year...",weapon war became weapon web every year artist...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its...","[genius, quietly, laid, bunch, engineers, surv...",genius quietly laid bunch engineer survive med...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s...","[inside, test, flight, facebook, first, intern...",inside test flight facebook first internet dro...


<h3> Detect languages of articles

In [24]:
df['language'] = df['lemmatize_first_100'].apply(detect)

In [26]:
df.groupby('language').count()

Unnamed: 0_level_0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
en,97038,97038,84941,97038,97038,97038,97038,97038,97038
es,4,4,2,4,4,4,4,4,4
fr,19,19,19,19,19,19,19,19,19
it,2,2,2,2,2,2,2,2,2
ko,1,1,1,1,1,1,1,1,1


In [29]:
#drop rows that are not english
df = df.loc[df['language'] == 'en']

In [34]:
#df.to_csv('df_english_articles.csv')

<h3>Modeling

In [345]:
df = pd.read_csv('df_english_articles.csv')

In [None]:
#df.head()

In [336]:
stemmer = PorterStemmer()

In [421]:
test = ['words', 'word', 'running', 'ran']

In [441]:
#create function to stem each word in a list and concat the list
def stem_list(lst):
    stemmed_list = []
    for i in lst:
        stemmed_list.append(stemmer.stem(i))
    stem_string = ' '.join(stemmed_list)
    return stem_string

In [436]:
#convert list contained in string to a regular list so it can be stemmed
df['stemmed'] = df["first_100_no_stops"].apply(lambda x: ast.literal_eval(x))

In [442]:
#stem words in list
df['stemmed'] = df["stemmed"].apply(lambda x: stem_list(x))


In [450]:
#verify that it worked
#df.head()

In [447]:
df = df[~df['stemmed'].str.contains("archiveteam.org contain", case=False)]

In [448]:
df.shape

(95790, 12)

In [462]:
#CHECKPOINT --- SAVE TO CSV
#df.to_csv('df_with_stems_final.csv')

In [2]:
#CHECKPOINT --- RUN TO OPEN CSV IF STARTING WORK HERE
df = pd.read_csv('df_with_stems_final.csv')

NameError: name 'pd' is not defined

In [453]:
documents = df['stemmed'].to_list()

In [454]:
documents[:3]

['never showtim new seri reviv spoiler ahead episod season twin peak may 21st showtim brought back david lynch groundbreak tv seri twin peak fulfil propheci process second season final back 1991 spirit series-defin murder victim laura palmer told fbi special agent seri protagonist dale cooper see 25 years. clip play first episod lynch twin peak reviv remind decad fact gone laura promis',
 'alphago victori defeat human opportun loss human man succumb machin heard alphago latest exploit last week crush world best go player confirm artifici intellig master ancient chines board game may heard news deliv doomsday terms.ther certain melancholi ke jie capitul sure 19-year-old chines prodigi declar would never lose ai follow alphago earthshak victori lee se-dol last year see onstag last week nearli bent doubl',
 'weapon war becam weapon web everi year artist technolog enthusiast meet linz austria ar electronica festiv meetup citi downtown locat danub river festiv eye toward futur someth burn m

In [504]:
#function to vectorize strings and perform tf-idf transformation
def vectorize_texts(list_of_strings):
    print('Performing vectorization and TF/IDF transformation on texts...')
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(list_of_strings)
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf = transformer.fit_transform(X)
    return tfidf

In [505]:
def cluster_texts(num_clusters, tfidf):
    #perform kmeans clustering for range of clusters
    print('Beginning KMeans Clustering, number of clusters = ', num_clusters, '\n') 
    km = KMeans(n_clusters=num_clusters, max_iter = 100, verbose = 2, n_init = 1).fit(tfidf)
    
    
    return km

<h3>Run Clustering for range of K's

In [506]:
documents_vectorized = vectorize_texts(documents)

Performing vectorization and TF/IDF transformation on texts...


In [507]:
kmeans3 = cluster_texts(3, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  3 

Initialization complete
Iteration  0, inertia 184870.417
Iteration  1, inertia 94221.144
Iteration  2, inertia 93995.573
Iteration  3, inertia 93906.754
Iteration  4, inertia 93877.128
Iteration  5, inertia 93865.697
Iteration  6, inertia 93860.637
Iteration  7, inertia 93857.378
Iteration  8, inertia 93854.632
Iteration  9, inertia 93852.623
Iteration 10, inertia 93851.569
Iteration 11, inertia 93851.179
Iteration 12, inertia 93850.997
Iteration 13, inertia 93850.884
Iteration 14, inertia 93850.831
Iteration 15, inertia 93850.796
Iteration 16, inertia 93850.770
Iteration 17, inertia 93850.755
Iteration 18, inertia 93850.748
Iteration 19, inertia 93850.744
Iteration 20, inertia 93850.743
Iteration 21, inertia 93850.742
Iteration 22, inertia 93850.741
Iteration 23, inertia 93850.741
Iteration 24, inertia 93850.741
Iteration 25, inertia 93850.741
Iteration 26, inertia 93850.740
Iteration 27, inertia 93850.740
Iteration 28, inertia 93

In [508]:
kmeans3.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [513]:
test = documents_vectorized.toarray()

In [None]:
score = calinski_harabaz_score(test, kmeans3.labels_)

In [1]:
kmeans4= cluster_texts(documents, 4)

NameError: name 'cluster_texts' is not defined

In [None]:
kmeans5= cluster_texts(documents, 5)

In [None]:
kmeans6= cluster_texts(documents, 6)

In [None]:
kmeans7= cluster_texts(documents, 7)

In [None]:
kmeans8= cluster_texts(documents, 8)

In [None]:
kmeans9= cluster_texts(documents, 9)

In [None]:
kmeans10= cluster_texts(documents, 10)

In [None]:
kmeans11= cluster_texts(documents, 11)

In [None]:
kmeans12= cluster_texts(documents, 12)

In [None]:
k_list = [kmeans3, kmeans4, kmeans5, kmeans6, kmeans7, kmeans8, kmeans9, kmeans10, kmeans11, kmeans12]

In [None]:
labels_list = km.labels_.tolist()
    pred = km.labels_
    score = calinski_harabaz_score(km, pred)

In [None]:
CH_score = []

for i in k_list:
    pred = i.labels_
    score = calinski_harabaz_score(X_2, pred)
    CH_score.append(score)

In [None]:
plt.plot([3, 4, 5, 6, 7], CH_score)
plt.xticks([3,4,5,6,7])
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

<h3>Check Clusters for K's

In [None]:
dict_={'text':documents, 'cluster':clusters} #Creating dict having doc with the corresponding cluster number.
frame=pd.DataFrame(dict_,index=[clusters], columns=['text','cluster']) # Converting it into a dataframe.


In [494]:
def cluster_multiple(list_of_strings, max_cluster):
    _dict = []
    
    for i in range(3, max_cluster):
        clusters = cluster_texts(list_of_strings, i)
        
        _dict.append(clusters)
        
        print('finished with clustering iteration...' + '\n\n')
        if i % 3 == 0:
            print(_dict)
    return _dict

In [500]:
#clusters_dict = cluster_multiple(documents, 10)

In [459]:
dict_={'text':documents, 'cluster':three_clusters} #Creating dict having doc with the corresponding cluster number.
frame=pd.DataFrame(dict_,index=[three_clusters], columns=['text','cluster']) # Converting it into a dataframe.


In [460]:
frame['cluster'].value_counts()

1    62948
2    21151
0    11691
Name: cluster, dtype: int64

In [461]:
frame.loc[frame['cluster'] == 0].values

array([['geniu quietli laid bunch engin surviv media compani geniu rais 56.9 million promis would one day annot entir internet lose mind januari compani quietli laid quarter staff bulk cut come engin depart post geniu blog time co-found tom lehman told employe geniu plan shift emphasi away annot platform attract top-tier investor favor becom',
        0],
       ['march 2015 philipp rein former aid hillari clinton us state depart reach old colleagu consult firm client rein contact capricia marshal consult us chief protocol top state depart offic act liaison foreign diplomat rein want marshal arrang meet foreign embassi dataminr compani come scrutini privaci expert servic analyz twitter data.thi piec publish partnership maplight nonprofit organ reveal influenc money politics..partnership-blurb',
        0],
       ['facebook face wither critic us allow fake news spread 2016 presidenti elect social network come similar scrutini germani amid concern widespread disinform campaign could imp

In [248]:
num_clusters = 10 #Change it according to your data.
km = KMeans(n_clusters=num_clusters, max_iter = 100, verbose = 2, n_init = 1)
km.fit(tfidf)
clusters = km.labels_.tolist()

Initialization complete
Iteration  0, inertia 182138.018
Iteration  1, inertia 94497.045
Iteration  2, inertia 93417.592
Iteration  3, inertia 93304.224
Iteration  4, inertia 93262.398
Iteration  5, inertia 93238.397
Iteration  6, inertia 93218.999
Iteration  7, inertia 93197.988
Iteration  8, inertia 93177.286
Iteration  9, inertia 93164.417
Iteration 10, inertia 93159.001
Iteration 11, inertia 93156.461
Iteration 12, inertia 93154.782
Iteration 13, inertia 93153.714
Iteration 14, inertia 93152.909
Iteration 15, inertia 93152.201
Iteration 16, inertia 93151.592
Iteration 17, inertia 93151.122
Iteration 18, inertia 93150.726
Iteration 19, inertia 93150.361
Iteration 20, inertia 93150.076
Iteration 21, inertia 93149.830
Iteration 22, inertia 93149.546
Iteration 23, inertia 93149.164
Iteration 24, inertia 93148.527
Iteration 25, inertia 93147.714
Iteration 26, inertia 93147.339
Iteration 27, inertia 93147.020
Iteration 28, inertia 93146.748
Iteration 29, inertia 93146.494
Iteration 30, i

In [276]:
clusters[:10]

[0, 0, 3, 8, 3, 3, 3, 3, 3, 0]

In [251]:
dict_={'text':documents, 'cluster':clusters} #Creating dict having doc with the corresponding cluster number.
frame=pd.DataFrame(dict_,index=[clusters], columns=['text','cluster']) # Converting it into a dataframe.


In [259]:
frame['cluster'].value_counts()

3    31417
0    21876
9    12774
1    10159
8     6713
6     4363
4     3571
7     2857
2     2060
5     1248
Name: cluster, dtype: int64

In [285]:
frame.loc[frame['cluster'] == 5].values

array([['main site archiv team archiveteam.org contain date inform variou project manifesto plan walkthroughs. collect contain output mani archiv team project ongo complet thank gener provid disk space internet archiv multi-terabyt dataset made avail well use wayback machin provid path back lost websit work collect grown point sub-collect type data acquir seek brows content',
        5],
       ['main site archiv team archiveteam.org contain date inform variou project manifesto plan walkthroughs. collect contain output mani archiv team project ongo complet thank gener provid disk space internet archiv multi-terabyt dataset made avail well use wayback machin provid path back lost websit work collect grown point sub-collect type data acquir seek brows content',
        5],
       ['main site archiv team archiveteam.org contain date inform variou project manifesto plan walkthroughs. collect contain output mani archiv team project ongo complet thank gener provid disk space internet archiv 