In [1]:
#nltk
#pip install any packages you don't have
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
import re, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# scikit-learn renamed the misspelled `calinski_harabaz_score` to
# `calinski_harabasz_score` and later dropped the old spelling. Import the
# current name when it exists, but keep it bound to the old alias so the cells
# below that call `calinski_harabaz_score(...)` keep working unchanged.
try:
    from sklearn.metrics import calinski_harabasz_score as calinski_harabaz_score
except ImportError:
    # Older scikit-learn releases only ship the original spelling.
    from sklearn.metrics import calinski_harabaz_score

In [3]:
df = pd.read_csv('df_with_gensim_summaries.csv')

In [4]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1.1'], axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",And never more so than in Showtime’s new serie...,[' And never more so than in Showtime’s n...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,AlphaGo’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,How a weapon against war became a weapon again...,[' How a weapon against war became a weap...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",Genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,Inside the test flight of Facebook’s first int...,[' Inside the test flight of Facebook’s f...


<h3>Tokenize</h3>

In [6]:
df.first_100 = df.first_100.str.lower()

In [7]:
# Split each lower-cased excerpt into word tokens.
# NLTK selects its Punkt sentence model by full language name, so the value
# must be 'english' rather than the ISO code 'en'.
df['tokenized_first_100'] = df.first_100.apply(lambda x: word_tokenize(x, language='english'))

In [8]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ..."
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,..."
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap..."
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its..."
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s..."


<h3>Remove Stop Words</h3>

In [9]:
# Stop list: NLTK's English stop words (de-duplicated through a set), every
# character of string.punctuation, and a handful of extra tokens added by hand.
stops = [
    *set(stopwords.words('english')),
    *punctuation,
    's', "'", 't', 'and', '"', 'a', 'or', '/', 'in', 'for', '&', '-', "''",
]

In [488]:
#df.head()

In [11]:
#function to remove stop words
def remove_stops(text):
    """Return the tokens of `text` that survive stop-word filtering.

    A token is kept only when it is not a member of the module-level `stops`
    collection and its length is not exactly one character. The relative
    order of the surviving tokens is unchanged.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The filtered tokens.
    """
    return [token for token in text if token not in stops and len(token) != 1]

In [12]:
# Run every row's token list through remove_stops.
df['first_100_no_stops'] = df['tokenized_first_100'].apply(remove_stops)

In [155]:
#verify that it worked
#df.head()

<h3>Lemmatization</h3>

In [13]:
#initialize WordNetLemmatizer class
lemmatizer = nltk.stem.WordNetLemmatizer()

In [14]:
#function to lemmatize text
def lemmatize_text(text):
    """Lemmatize every token of one document.

    Each token is passed, in order, to the `lemmatize` method of the
    module-level `lemmatizer` object.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    return [lemmatizer.lemmatize(token) for token in text]
        

In [15]:
df['lemmatize_first_100'] = df['first_100_no_stops'].apply(lemmatize_text)

In [16]:
df['lemmatize_first_100'] = df['lemmatize_first_100'].apply(lambda x: ' '.join(x))

In [17]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ...","[never, showtime, new, series, revival, spoile...",never showtime new series revival spoiler ahea...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,...","[alphago, victory, defeat, humans, opportunity...",alphago victory defeat human opportunity loss ...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap...","[weapon, war, became, weapon, web, every, year...",weapon war became weapon web every year artist...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its...","[genius, quietly, laid, bunch, engineers, surv...",genius quietly laid bunch engineer survive med...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s...","[inside, test, flight, facebook, first, intern...",inside test flight facebook first internet dro...


In [18]:
#df.to_csv('df_with_lemmings.csv')

<h3>KMeans Clustering</h3>

In [13]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from langdetect import detect
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
import ast

In [19]:
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",and never more so than in showtime’s new serie...,[' And never more so than in Showtime’s n...,"[and, never, more, so, than, in, showtime, ’, ...","[never, showtime, new, series, revival, spoile...",never showtime new series revival spoiler ahea...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,alphago’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...,"[alphago, ’, s, victory, isn, ’, t, a, defeat,...","[alphago, victory, defeat, humans, opportunity...",alphago victory defeat human opportunity loss ...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,how a weapon against war became a weapon again...,[' How a weapon against war became a weap...,"[how, a, weapon, against, war, became, a, weap...","[weapon, war, became, weapon, web, every, year...",weapon war became weapon web every year artist...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...,"[genius, quietly, laid, off, a, bunch, of, its...","[genius, quietly, laid, bunch, engineers, surv...",genius quietly laid bunch engineer survive med...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,inside the test flight of facebook’s first int...,[' Inside the test flight of Facebook’s f...,"[inside, the, test, flight, of, facebook, ’, s...","[inside, test, flight, facebook, first, intern...",inside test flight facebook first internet dro...


<h3>Detect languages of articles</h3>

In [24]:
# Tag every article with the language code that langdetect's `detect` returns
# for its lemmatized excerpt.
# NOTE(review): langdetect is reportedly non-deterministic unless a seed is set
# on its DetectorFactory beforehand - confirm, and pin a seed if the per-language
# counts need to be reproducible.
df['language'] = df['lemmatize_first_100'].apply(detect)

In [26]:
df.groupby('language').count()

Unnamed: 0_level_0,title,content,category,gensim_summary,first_100,sent_tokenized,tokenized_first_100,first_100_no_stops,lemmatize_first_100
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
en,97038,97038,84941,97038,97038,97038,97038,97038,97038
es,4,4,2,4,4,4,4,4,4
fr,19,19,19,19,19,19,19,19,19
it,2,2,2,2,2,2,2,2,2
ko,1,1,1,1,1,1,1,1,1


In [29]:
#drop rows that are not english
df = df.loc[df['language'] == 'en']

In [34]:
#df.to_csv('df_english_articles.csv')

<h3>Modeling</h3>

In [345]:
df = pd.read_csv('df_english_articles.csv')

In [None]:
#df.head()

In [336]:
stemmer = PorterStemmer()

In [421]:
test = ['words', 'word', 'running', 'ran']

In [441]:
#create function to stem each word in a list and concat the list
def stem_list(lst):
    """Stem every word of `lst` and glue the stems into one string.

    Each word is passed, in order, to the `stem` method of the module-level
    `stemmer` object; the resulting stems are joined with single spaces.

    Parameters
    ----------
    lst : iterable of str
        Tokens of a single document.

    Returns
    -------
    str
        The space-separated stems (an empty string for empty input).
    """
    return ' '.join(stemmer.stem(word) for word in lst)

In [436]:
#convert list contained in string to a regular list so it can be stemmed
df['stemmed'] = df["first_100_no_stops"].apply(lambda x: ast.literal_eval(x))

In [442]:
#stem words in list
df['stemmed'] = df["stemmed"].apply(lambda x: stem_list(x))


In [450]:
#verify that it worked
#df.head()

In [447]:
# Drop rows whose stemmed text contains the phrase "archiveteam.org contain".
# regex=False makes the match literal: under the default regex mode the "."
# is a wildcard and would also match e.g. "archiveteam-org contain".
df = df[~df['stemmed'].str.contains("archiveteam.org contain", case=False, regex=False)]

In [448]:
df.shape

(95790, 12)

In [462]:
#CHECKPOINT --- SAVE TO CSV
#df.to_csv('df_with_stems_final.csv')

In [4]:
#CHECKPOINT --- RUN TO OPEN CSV IF STARTING WORK HERE
#df = pd.read_csv('df_with_stems_final.csv')
# The unnamed index columns are only present when df has just been re-read from
# the CSV checkpoint above. errors='ignore' lets this cell also run when the
# reload is skipped, and when the cell is executed a second time.
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace = True, errors = 'ignore')

In [6]:
#df.head()

In [7]:
documents = df['stemmed'].to_list()

In [8]:
documents[:3]

['never showtim new seri reviv spoiler ahead episod season twin peak may 21st showtim brought back david lynch groundbreak tv seri twin peak fulfil propheci process second season final back 1991 spirit series-defin murder victim laura palmer told fbi special agent seri protagonist dale cooper see 25 years. clip play first episod lynch twin peak reviv remind decad fact gone laura promis',
 'alphago victori defeat human opportun loss human man succumb machin heard alphago latest exploit last week crush world best go player confirm artifici intellig master ancient chines board game may heard news deliv doomsday terms.ther certain melancholi ke jie capitul sure 19-year-old chines prodigi declar would never lose ai follow alphago earthshak victori lee se-dol last year see onstag last week nearli bent doubl',
 'weapon war becam weapon web everi year artist technolog enthusiast meet linz austria ar electronica festiv meetup citi downtown locat danub river festiv eye toward futur someth burn m

In [9]:
#function to vectorize strings and perform tf-idf transformation
def vectorize_texts(list_of_strings):
    """Convert raw documents into a TF-IDF weighted document-term matrix.

    The documents are first counted with a default-configured
    `CountVectorizer`; the counts are then re-weighted by a
    `TfidfTransformer` created with `smooth_idf=False`. Both objects are
    fitted on `list_of_strings` and discarded afterwards. A progress message
    is printed before the work starts.

    Parameters
    ----------
    list_of_strings : iterable of str
        One string per document.

    Returns
    -------
    The result of `TfidfTransformer.fit_transform` on the count matrix.
    """
    print('Performing vectorization and TF/IDF transformation on texts...')
    term_counts = CountVectorizer().fit_transform(list_of_strings)
    return TfidfTransformer(smooth_idf=False).fit_transform(term_counts)

In [10]:
def cluster_texts(num_clusters, tfidf, random_state=None):
    """Fit one KMeans model with `num_clusters` clusters on `tfidf`.

    The model is run with a single initialisation (`n_init=1`), at most 100
    iterations, and verbose logging. Because only one initialisation is used,
    the result depends entirely on the random start; pass `random_state` to
    make a run repeatable.

    Parameters
    ----------
    num_clusters : int
        Number of clusters (k) to fit.
    tfidf : array-like or sparse matrix
        Feature matrix handed to `KMeans.fit`.
    random_state : int or None, optional
        Seed forwarded to `KMeans`. The default `None` keeps the original
        unseeded behaviour.

    Returns
    -------
    The fitted KMeans estimator.
    """
    print('Beginning KMeans Clustering, number of clusters = ', num_clusters, '\n')
    km = KMeans(
        n_clusters=num_clusters,
        max_iter=100,
        verbose=2,
        n_init=1,
        random_state=random_state,
    )
    return km.fit(tfidf)

<h3>Run clustering for a range of K values</h3>

In [11]:
documents_vectorized = vectorize_texts(documents)

Performing vectorization and TF/IDF transformation on texts...


In [14]:
kmeans3 = cluster_texts(3, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  3 

Initialization complete
Iteration  0, inertia 185417.149
Iteration  1, inertia 94192.242
Iteration  2, inertia 93981.211
Iteration  3, inertia 93903.577
Iteration  4, inertia 93877.059
Iteration  5, inertia 93863.868
Iteration  6, inertia 93857.832
Iteration  7, inertia 93855.082
Iteration  8, inertia 93853.527
Iteration  9, inertia 93852.659
Iteration 10, inertia 93852.199
Iteration 11, inertia 93851.906
Iteration 12, inertia 93851.712
Iteration 13, inertia 93851.602
Iteration 14, inertia 93851.532
Iteration 15, inertia 93851.489
Iteration 16, inertia 93851.446
Iteration 17, inertia 93851.428
Iteration 18, inertia 93851.416
Iteration 19, inertia 93851.408
Iteration 20, inertia 93851.403
Iteration 21, inertia 93851.399
Iteration 22, inertia 93851.397
Iteration 23, inertia 93851.396
Iteration 24, inertia 93851.395
Iteration 25, inertia 93851.394
Iteration 26, inertia 93851.394
Iteration 27, inertia 93851.394
Converged at iteration 2

In [15]:
kmeans3.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [513]:
# Dense copy of the TF-IDF matrix; it is passed to calinski_harabaz_score in a later cell.
# NOTE(review): toarray() materialises every (document, term) cell, which may
# not fit in memory for a corpus of this size - confirm before running.
# NOTE(review): this rebinds `test`, which an earlier cell used for a short word list.
test = documents_vectorized.toarray()

In [None]:
score = calinski_harabaz_score(test, kmeans3.labels_)

In [18]:
kmeans4= cluster_texts(4, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  4 

Initialization complete
Iteration  0, inertia 185231.174
Iteration  1, inertia 94194.112
Iteration  2, inertia 93873.094
Iteration  3, inertia 93802.144
Iteration  4, inertia 93783.657
Iteration  5, inertia 93764.486
Iteration  6, inertia 93745.738
Iteration  7, inertia 93737.436
Iteration  8, inertia 93732.944
Iteration  9, inertia 93730.129
Iteration 10, inertia 93728.577
Iteration 11, inertia 93727.789
Iteration 12, inertia 93727.387
Iteration 13, inertia 93727.173
Iteration 14, inertia 93727.032
Iteration 15, inertia 93726.940
Iteration 16, inertia 93726.884
Iteration 17, inertia 93726.849
Iteration 18, inertia 93726.826
Iteration 19, inertia 93726.802
Iteration 20, inertia 93726.767
Iteration 21, inertia 93726.751
Iteration 22, inertia 93726.741
Iteration 23, inertia 93726.726
Iteration 24, inertia 93726.711
Iteration 25, inertia 93726.700
Iteration 26, inertia 93726.687
Iteration 27, inertia 93726.668
Iteration 28, inertia 93

In [19]:
kmeans5= cluster_texts(5, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  5 

Initialization complete
Iteration  0, inertia 183811.131
Iteration  1, inertia 94008.502
Iteration  2, inertia 93776.893
Iteration  3, inertia 93704.016
Iteration  4, inertia 93676.194
Iteration  5, inertia 93665.691
Iteration  6, inertia 93660.893
Iteration  7, inertia 93658.211
Iteration  8, inertia 93656.282
Iteration  9, inertia 93654.502
Iteration 10, inertia 93652.973
Iteration 11, inertia 93651.605
Iteration 12, inertia 93649.618
Iteration 13, inertia 93647.007
Iteration 14, inertia 93642.592
Iteration 15, inertia 93633.020
Iteration 16, inertia 93618.593
Iteration 17, inertia 93613.156
Iteration 18, inertia 93611.727
Iteration 19, inertia 93610.996
Iteration 20, inertia 93610.551
Iteration 21, inertia 93610.287
Iteration 22, inertia 93610.109
Iteration 23, inertia 93610.002
Iteration 24, inertia 93609.933
Iteration 25, inertia 93609.859
Iteration 26, inertia 93609.822
Iteration 27, inertia 93609.801
Iteration 28, inertia 93

In [20]:
kmeans6= cluster_texts(6, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  6 

Initialization complete
Iteration  0, inertia 183325.682
Iteration  1, inertia 94032.954
Iteration  2, inertia 93752.987
Iteration  3, inertia 93623.834
Iteration  4, inertia 93566.280
Iteration  5, inertia 93534.085
Iteration  6, inertia 93511.956
Iteration  7, inertia 93494.591
Iteration  8, inertia 93480.131
Iteration  9, inertia 93471.371
Iteration 10, inertia 93466.003
Iteration 11, inertia 93461.113
Iteration 12, inertia 93455.414
Iteration 13, inertia 93449.723
Iteration 14, inertia 93443.690
Iteration 15, inertia 93429.701
Iteration 16, inertia 93413.804
Iteration 17, inertia 93410.184
Iteration 18, inertia 93409.451
Iteration 19, inertia 93409.208
Iteration 20, inertia 93409.077
Iteration 21, inertia 93409.017
Iteration 22, inertia 93408.988
Iteration 23, inertia 93408.975
Iteration 24, inertia 93408.967
Iteration 25, inertia 93408.962
Iteration 26, inertia 93408.961
Iteration 27, inertia 93408.960
Iteration 28, inertia 93

In [21]:
kmeans7= cluster_texts(7, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  7 

Initialization complete
Iteration  0, inertia 183795.868
Iteration  1, inertia 93909.600
Iteration  2, inertia 93580.205
Iteration  3, inertia 93499.381
Iteration  4, inertia 93442.008
Iteration  5, inertia 93390.921
Iteration  6, inertia 93379.439
Iteration  7, inertia 93371.263
Iteration  8, inertia 93365.784
Iteration  9, inertia 93362.429
Iteration 10, inertia 93360.238
Iteration 11, inertia 93358.306
Iteration 12, inertia 93356.849
Iteration 13, inertia 93354.689
Iteration 14, inertia 93349.996
Iteration 15, inertia 93342.676
Iteration 16, inertia 93340.173
Iteration 17, inertia 93339.125
Iteration 18, inertia 93338.183
Iteration 19, inertia 93337.216
Iteration 20, inertia 93336.229
Iteration 21, inertia 93335.405
Iteration 22, inertia 93334.719
Iteration 23, inertia 93334.134
Iteration 24, inertia 93333.613
Iteration 25, inertia 93333.116
Iteration 26, inertia 93332.695
Iteration 27, inertia 93332.330
Iteration 28, inertia 93

In [22]:
kmeans8= cluster_texts(8, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  8 

Initialization complete
Iteration  0, inertia 182019.619
Iteration  1, inertia 93871.466
Iteration  2, inertia 93542.235
Iteration  3, inertia 93423.602
Iteration  4, inertia 93353.860
Iteration  5, inertia 93313.134
Iteration  6, inertia 93291.406
Iteration  7, inertia 93279.023
Iteration  8, inertia 93270.781
Iteration  9, inertia 93264.181
Iteration 10, inertia 93258.200
Iteration 11, inertia 93251.715
Iteration 12, inertia 93246.635
Iteration 13, inertia 93242.689
Iteration 14, inertia 93239.442
Iteration 15, inertia 93236.716
Iteration 16, inertia 93234.270
Iteration 17, inertia 93230.595
Iteration 18, inertia 93224.534
Iteration 19, inertia 93221.466
Iteration 20, inertia 93220.512
Iteration 21, inertia 93220.041
Iteration 22, inertia 93219.685
Iteration 23, inertia 93219.352
Iteration 24, inertia 93219.043
Iteration 25, inertia 93218.761
Iteration 26, inertia 93218.416
Iteration 27, inertia 93218.004
Iteration 28, inertia 93

In [23]:
kmeans9= cluster_texts(9, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  9 

Initialization complete
Iteration  0, inertia 181983.493
Iteration  1, inertia 93771.841
Iteration  2, inertia 93355.062
Iteration  3, inertia 93267.022
Iteration  4, inertia 93241.009
Iteration  5, inertia 93228.513
Iteration  6, inertia 93218.177
Iteration  7, inertia 93206.260
Iteration  8, inertia 93198.028
Iteration  9, inertia 93193.134
Iteration 10, inertia 93190.658
Iteration 11, inertia 93189.185
Iteration 12, inertia 93188.179
Iteration 13, inertia 93187.276
Iteration 14, inertia 93186.491
Iteration 15, inertia 93185.701
Iteration 16, inertia 93184.458
Iteration 17, inertia 93182.964
Iteration 18, inertia 93180.671
Iteration 19, inertia 93177.406
Iteration 20, inertia 93173.770
Iteration 21, inertia 93168.660
Iteration 22, inertia 93162.855
Iteration 23, inertia 93156.551
Iteration 24, inertia 93151.150
Iteration 25, inertia 93146.688
Iteration 26, inertia 93142.516
Iteration 27, inertia 93138.930
Iteration 28, inertia 93

In [24]:
kmeans10= cluster_texts(10, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  10 

Initialization complete
Iteration  0, inertia 180788.896
Iteration  1, inertia 93700.153
Iteration  2, inertia 93398.157
Iteration  3, inertia 93273.996
Iteration  4, inertia 93197.660
Iteration  5, inertia 93146.223
Iteration  6, inertia 93088.761
Iteration  7, inertia 93043.641
Iteration  8, inertia 93030.912
Iteration  9, inertia 93021.742
Iteration 10, inertia 93007.681
Iteration 11, inertia 92991.070
Iteration 12, inertia 92979.084
Iteration 13, inertia 92970.768
Iteration 14, inertia 92964.771
Iteration 15, inertia 92960.661
Iteration 16, inertia 92956.838
Iteration 17, inertia 92952.473
Iteration 18, inertia 92947.638
Iteration 19, inertia 92942.819
Iteration 20, inertia 92938.672
Iteration 21, inertia 92936.422
Iteration 22, inertia 92935.895
Iteration 23, inertia 92935.813
Iteration 24, inertia 92935.776
Iteration 25, inertia 92935.759
Iteration 26, inertia 92935.748
Iteration 27, inertia 92935.743
Iteration 28, inertia 9

In [25]:
kmeans11= cluster_texts(11, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  11 

Initialization complete
Iteration  0, inertia 181002.707
Iteration  1, inertia 93664.278
Iteration  2, inertia 93336.676
Iteration  3, inertia 93228.509
Iteration  4, inertia 93167.443
Iteration  5, inertia 93118.252
Iteration  6, inertia 93076.728
Iteration  7, inertia 93028.586
Iteration  8, inertia 92999.662
Iteration  9, inertia 92983.819
Iteration 10, inertia 92973.506
Iteration 11, inertia 92966.601
Iteration 12, inertia 92961.224
Iteration 13, inertia 92957.842
Iteration 14, inertia 92955.677
Iteration 15, inertia 92954.239
Iteration 16, inertia 92952.922
Iteration 17, inertia 92951.884
Iteration 18, inertia 92951.260
Iteration 19, inertia 92950.577
Iteration 20, inertia 92949.906
Iteration 21, inertia 92949.148
Iteration 22, inertia 92948.153
Iteration 23, inertia 92946.812
Iteration 24, inertia 92944.338
Iteration 25, inertia 92938.884
Iteration 26, inertia 92929.136
Iteration 27, inertia 92917.510
Iteration 28, inertia 9

In [26]:
kmeans12= cluster_texts(12, documents_vectorized)

Beginning KMeans Clustering, number of clusters =  12 

Initialization complete
Iteration  0, inertia 180397.402
Iteration  1, inertia 93703.946
Iteration  2, inertia 93268.502
Iteration  3, inertia 93103.217
Iteration  4, inertia 92984.758
Iteration  5, inertia 92884.436
Iteration  6, inertia 92848.951
Iteration  7, inertia 92841.731
Iteration  8, inertia 92837.937
Iteration  9, inertia 92834.947
Iteration 10, inertia 92832.361
Iteration 11, inertia 92830.079
Iteration 12, inertia 92827.424
Iteration 13, inertia 92823.431
Iteration 14, inertia 92817.678
Iteration 15, inertia 92812.365
Iteration 16, inertia 92808.479
Iteration 17, inertia 92805.286
Iteration 18, inertia 92803.095
Iteration 19, inertia 92800.855
Iteration 20, inertia 92798.761
Iteration 21, inertia 92796.542
Iteration 22, inertia 92794.564
Iteration 23, inertia 92792.805
Iteration 24, inertia 92791.114
Iteration 25, inertia 92789.411
Iteration 26, inertia 92787.509
Iteration 27, inertia 92786.011
Iteration 28, inertia 9

In [34]:
k_list = [kmeans3, kmeans4, kmeans5, kmeans6, kmeans7, kmeans8, kmeans9, kmeans10, kmeans11, kmeans12]

In [32]:
kmeans3.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [30]:
kmeans_df = pd.DataFrame()

In [36]:
# One column of cluster labels per fitted model. Columns are named after the
# model's k ("kmeans3", "kmeans4", ...) rather than keyed by the estimator
# object itself, so the frame is readable immediately and re-running this cell
# overwrites the same columns instead of depending on a later rename.
for model in k_list:
    kmeans_df['kmeans{}'.format(model.n_clusters)] = model.labels_

In [38]:
kmeans_df.columns = ['kmeans3', 'kmeans4', 'kmeans5', 'kmeans6', 'kmeans7', 'kmeans8', 'kmeans9', 'kmeans10', 'kmeans11', 'kmeans12']

In [50]:
# Attach each document's text to its cluster labels by position.
# kmeans_df carries a fresh 0..n-1 index, whereas df keeps its original row
# labels after the earlier row filters; assigning the Series directly would
# align on those labels and pair texts with the wrong rows (or NaN).
kmeans_df['stemmed'] = df['stemmed'].to_numpy()

In [51]:
print(df.shape)
print(kmeans_df.shape)

(95790, 11)
(95790, 11)


In [52]:
kmeans_df.head()

Unnamed: 0,kmeans3,kmeans4,kmeans5,kmeans6,kmeans7,kmeans8,kmeans9,kmeans10,kmeans11,kmeans12,stemmed
0,0,0,1,0,6,4,6,8,9,8,never showtim new seri reviv spoiler ahead epi...
1,0,0,1,0,6,4,6,8,9,8,alphago victori defeat human opportun loss hum...
2,0,0,1,0,6,4,2,3,9,10,weapon war becam weapon web everi year artist ...
3,0,1,2,0,5,1,4,9,5,2,geniu quietli laid bunch engin surviv media co...
4,0,0,1,0,6,4,2,3,9,10,insid test flight facebook first internet dron...


In [70]:
#CHECKPOINT --- SAVE TO CSV TO AVOID RUNNING KMEANS FUNCTIONS AGAIN
kmeans_df.to_csv('kmeans_df.csv')

In [None]:
# Calinski-Harabasz score for one fitted model (k = 3, as in the earlier
# single-model scoring cell). The original cell was a fragment lifted out of a
# function body: it was indented, referred to an undefined `km`, and passed the
# estimator itself where the feature matrix is expected.
km = kmeans3
pred = km.labels_
labels_list = pred.tolist()
# NOTE(review): toarray() builds the full dense matrix and may be very large.
score = calinski_harabaz_score(documents_vectorized.toarray(), pred)

In [None]:
# Collect one Calinski-Harabaz score per fitted model, in k_list order.
CH_score = []

for i in k_list:
    # Cluster assignments produced by this model.
    pred = i.labels_
    # NOTE(review): X_2 is not defined anywhere in the visible notebook -
    # presumably the feature matrix the models were fitted on; confirm and
    # define it before running this cell.
    score = calinski_harabaz_score(X_2, pred)
    CH_score.append(score)

In [None]:
# Take the x-axis from the fitted models so there is exactly one k per score.
# k_list holds ten models (k = 3..12); the previous hard-coded [3, 4, 5, 6, 7]
# has only five values and makes plt.plot fail on the length mismatch.
k_values = [model.n_clusters for model in k_list]
plt.plot(k_values, CH_score)
plt.xticks(k_values)
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

<h3>Check clusters for each K</h3>

In [46]:
kmeans_df['kmeans3'].value_counts()

0    55197
1    20409
2    20184
Name: kmeans3, dtype: int64

In [68]:
kmeans_df['kmeans6'].value_counts()

0    47367
4    25039
5    13712
1     4468
3     4243
2      961
Name: kmeans6, dtype: int64

In [69]:
kmeans_df.loc[kmeans_df['kmeans6'] == 2].values

array([[1, 2, 3, ..., 1, 11,
        'nuclear-tip missil hurtl toward unit state would abl stop mayb lucki expert warn unit state missil defens system reliabl peopl might think right constel sensor 36 interceptor missil make ground-bas midcours defens system gmd intend act insur small-scal nuclear attack north korea possibl iran accord depart defens neither countri missil capabl reach us although us offici say north korea get'],
       [1, 2, 3, ..., 1, 11,
        'donald trump offici presid unit state complet control america nuclear arsen decid start nuclear war legal safeguard stop instead much less tangibl web norm taboo fear rein us presid sinc world war ii north korea escal nuclear weapon test russia promis strengthen nuclear forc new presid unit state openli tweet us must strengthen expand nuclear capabl expert worri fragil web'],
       [1, 2, 3, ..., 1, 11,
        "despit effort last three american presid north korea continu advanc nuclear state donald trump rein rogu state b