In [1]:
import pandas as pd

In [2]:
import nltk
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

import spacy
from gensim import corpora
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def remove_stopwords(text : str):
    textArr = tokenizer.tokenize(text)
    rem_text = " ".join([word for word in textArr if word.lower() not in stop_words ])
    return rem_text

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']):
       output = []
       for sent in texts:
             doc = nlp(sent)
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

LOAD DATA SET

In [4]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [5]:
data=pd.read_csv("/content/gdrive/MyDrive/nllp data set/articles1.csv")

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [7]:
data.count()

Unnamed: 0     50000
id             50000
title          50000
publication    50000
author         43694
date           50000
year           50000
month          50000
url                0
content        50000
dtype: int64

In [8]:
data.isnull().sum()

Unnamed: 0         0
id                 0
title              0
publication        0
author          6306
date               0
year               0
month              0
url            50000
content            0
dtype: int64

DATA Preprocessing

In [9]:
data_=data["content"].drop_duplicates().dropna()[:36000]

In [10]:
data_=data["content"].drop_duplicates()

In [11]:
data_=data_.apply(remove_stopwords)

print(data_.head())

0    WASHINGTON Congressional Republicans new fear ...
1    bullet shells get counted blood dries votive c...
2    Walt Disney Bambi opened 1942 critics praised ...
3    Death may great equalizer necessarily evenhand...
4    SEOUL South Korea North Korea leader Kim said ...
Name: content, dtype: object


In [12]:
data_lemma = lemmatization(data_.tolist())

In [13]:
print(sum(len(x) for x in data_lemma))

8148124


In [14]:
# print number of tokenization
from importlib.util import find_spec as isModule
if(isModule('humanize') != None):
      from humanize import intword
      print(intword(sum(len(x) for x in data_lemma)), " Tokenizations")
else:
      print(sum(len(x) for x in data_lemma), " Tokenizations")

print(data_lemma[:2])

8.1 million  Tokenizations
[['new', 'fear', 'health', 'care', 'lawsuit', 'administration', 'incoming', 'administration', 'executive', 'branch', 'suit', 'administration', 'authority', 'billion', 'dollar', 'health', 'insurance', 'subsidy', 'big', 'victory', 'issue', 'sudden', 'loss', 'subsidy', 'health', 'care', 'program', 'implode', 'million', 'people', 'access', 'health', 'insurance', 'replacement', 'chaos', 'insurance', 'market', 'political', 'backlash', 'full', 'control', 'government', 'stave', 'outcome', 'awkward', 'position', 'huge', 'sum', 'health', 'care', 'law', 'conservative', 'voter', 'end', 'law', 'year', 'twist', 'administration', 'executive', 'branch', 'prerogative', 'republican', 'ally', 'central', 'question', 'ugly', 'political', 'pileup', 'transition', 'team', 'gaming', 'handle', 'lawsuit', 'election', 'limbo', 'late', 'ready', 'divulge', 'strategy', 'litigation', 'administration', 'inappropriate', 'comment', 'spokesman', 'transition', 'effort', 'office', 'administration

In [15]:
# Create a dictionary from the preprocessed data
dictionary = corpora.Dictionary(data_lemma)
# bag of words
corpus = [dictionary.doc2bow(doc) for doc in data_lemma]

In [16]:
# Create a dictionary from the preprocessed data
dictionary = corpora.Dictionary(data_lemma)

# bag of words
doc_term_matrix = [dictionary.doc2bow(doc) for doc in data_lemma]

print(doc_term_matrix[:2])


[[(0, 1), (1, 13), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 5), (15, 1), (16, 7), (17, 4), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 5), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 11), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 5), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 4), (82, 2), (83, 3), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 2), (93, 1), (94, 2), (95, 2), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
train_data,test_data=train_test_split(corpus, test_size=0.3, random_state=42)

In [19]:
import gensim
Lda = gensim.models.LdaMulticore
ldamodel = Lda(corpus=train_data, id2word=dictionary, num_topics=25, passes=30)

In [20]:
print('\nPerplexity: ', ldamodel.log_perplexity(test_data)  )# a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=data_lemma, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.188782166071922

Coherence Score:  0.5330073028833112


In [22]:
!pip install pyLDAvis==3.2.1

  and should_run_async(code)




In [33]:
!pip install pyLDAvis

  and should_run_async(code)




In [23]:
!pip install pandas==1.5.3

  and should_run_async(code)




In [27]:
import pyLDAvis.gensim as gensimvis
import pickle
import pyLDAvis
import os
import pandas as pd

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_data_filepath = os.path.join('/content/gdrive'+str(30))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, '/content/gdrive'+ str(30) +'.html')

LDAvis_prepared

  and should_run_async(code)
  by='saliency', ascending=False).head(R).drop('saliency', 1)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


# New Section