In [12]:
# Generally useful libs
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
import math
import os
import re

# For visualisation
%matplotlib inline     
# Assumes Jupyter notebook
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

# For word embeddings
#from gensim.models.word2vec import Word2Vec

# For parquet files
import pyarrow

# For dimensionality reduction
import umap

# For automating the 'knee method'
from kneed import KneeLocator

# For hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, centroid
from tabulate import tabulate

# For validation and visualisation
from sklearn.metrics import confusion_matrix, classification_report, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#from ctfidf import CTFIDFVectorizer

from wordcloud import WordCloud

In [13]:
# Directory holding the shared workshop text datasets ("~" expanded to the user's home)
textdir = Path("~/shared/RR-workshop-data/text_datasets").expanduser()

# Read the NYT article table; the file name is relative, so it is resolved
# against the current working directory.
# NOTE(review): `textdir` is not used for this read - confirm where the CSV lives.
textdf = pd.read_csv(
    "nyt_df_2023-10.csv",
    encoding='utf-8',
)
textdf.head()

Unnamed: 0.1,Unnamed: 0,abstract,web_url,snippet,lead_paragraph,source,multimedia,keywords,pub_date,document_type,...,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,print_section,print_page
0,0,The new chairman of the Senate Foreign Relatio...,https://www.nytimes.com/2023/09/30/us/politics...,The new chairman of the Senate Foreign Relatio...,"Senator Ben Cardin of Maryland, the new chairm...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'glocations', 'value': 'Egypt', 'ran...",2023-10-01T00:06:00+0000,article,...,,,,,,By Karoun Demirjian,"[{'firstname': 'Karoun', 'middlename': None, '...",,,
1,1,The Senate overwhelmingly passed a bill to fun...,https://www.nytimes.com/interactive/2023/09/30...,The Senate overwhelmingly passed a bill to fun...,The Senate overwhelmingly passed a bill to fun...,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Shutdowns (Inst...",2023-10-01T00:06:25+0000,multimedia,...,,,,,,"By Lazaro Gamio, Alicia Parlapiano, Jasmine C....","[{'firstname': 'Lazaro', 'middlename': None, '...",,,
2,2,"In much of Europe, the election in Slovakia wa...",https://www.nytimes.com/2023/09/30/world/europ...,"In much of Europe, the election in Slovakia wa...",A Russia-friendly populist party finished firs...,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'glocations', 'value': 'Slovakia', '...",2023-10-01T00:18:47+0000,article,...,,,,,,By Andrew Higgins,"[{'firstname': 'Andrew', 'middlename': None, '...",,,
3,3,"Gideon Cody, who orchestrated a widely critici...",https://www.nytimes.com/2023/09/30/us/marion-k...,"Gideon Cody, who orchestrated a widely critici...","The police chief of Marion, Kansas, was suspen...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Newspapers', 'r...",2023-10-01T00:47:45+0000,article,...,,Police Chief Who Ordered Newspaper Raid Is Sus...,,,,By Orlando Mayorquin,"[{'firstname': 'Orlando', 'middlename': None, ...",,A,21.0
4,4,The assistant general manager of the Calgary F...,https://www.nytimes.com/2023/09/30/sports/chri...,The assistant general manager of the Calgary F...,"Chris Snow, the assistant general manager of t...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'persons', 'value': 'Snow, Chris', '...",2023-10-01T01:38:51+0000,article,...,,"Chris Snow, 42, Hockey Executive Who ‘Never St...",,,,By Sopan Deb,"[{'firstname': 'Sopan', 'middlename': None, 'l...",,B,10.0


In [14]:
# List every column available in the raw article table before sub-setting it
textdf.columns

Index(['Unnamed: 0', 'abstract', 'web_url', 'snippet', 'lead_paragraph',
       'source', 'multimedia', 'keywords', 'pub_date', 'document_type',
       'news_desk', 'section_name', 'subsection_name', 'type_of_material',
       '_id', 'word_count', 'uri', 'headline.main', 'headline.kicker',
       'headline.content_kicker', 'headline.print_headline', 'headline.name',
       'headline.seo', 'headline.sub', 'byline.original', 'byline.person',
       'byline.organization', 'print_section', 'print_page'],
      dtype='object')

In [15]:
# Keep only the metadata and text fields used in the rest of the notebook.
# The explicit .copy() makes the subset an independent frame, so adding the
# 'key_text' column below can never affect (or warn about) `textdf`.
textdf_sub = textdf.loc[:, [
    'headline.main', 'pub_date', 'byline.person', 'word_count', 'keywords',
    'document_type', 'news_desk', 'section_name', 'subsection_name',
    'abstract', 'lead_paragraph',
]].copy()

# Build one text per article: abstract, lead paragraph and headline joined by
# newlines. Missing fields are dropped before joining; casting the row with
# astype(str) would otherwise inject the literal word "nan" into the text.
textdf_sub['key_text'] = textdf[['abstract', 'lead_paragraph', 'headline.main']].apply(
    lambda row: '\n'.join(row.dropna().astype(str)), axis = 1)
textdf_sub.head()

Unnamed: 0,headline.main,pub_date,byline.person,word_count,keywords,document_type,news_desk,section_name,subsection_name,abstract,lead_paragraph,key_text
0,Senate Democrat Threatens to Block More of Mil...,2023-10-01T00:06:00+0000,"[{'firstname': 'Karoun', 'middlename': None, '...",681,"[{'name': 'glocations', 'value': 'Egypt', 'ran...",article,Washington,U.S.,Politics,The new chairman of the Senate Foreign Relatio...,"Senator Ben Cardin of Maryland, the new chairm...",The new chairman of the Senate Foreign Relatio...
1,How Each Member Voted on the Senate Stopgap Sp...,2023-10-01T00:06:25+0000,"[{'firstname': 'Lazaro', 'middlename': None, '...",0,"[{'name': 'subject', 'value': 'Shutdowns (Inst...",multimedia,Graphics,U.S.,Politics,The Senate overwhelmingly passed a bill to fun...,The Senate overwhelmingly passed a bill to fun...,The Senate overwhelmingly passed a bill to fun...
2,What Does a Russia-Leaning Party Win in an E.U...,2023-10-01T00:18:47+0000,"[{'firstname': 'Andrew', 'middlename': None, '...",1269,"[{'name': 'glocations', 'value': 'Slovakia', '...",article,Foreign,World,Europe,"In much of Europe, the election in Slovakia wa...",A Russia-friendly populist party finished firs...,"In much of Europe, the election in Slovakia wa..."
3,Police Chief Who Ordered Raid on Kansas Newspa...,2023-10-01T00:47:45+0000,"[{'firstname': 'Orlando', 'middlename': None, ...",623,"[{'name': 'subject', 'value': 'Newspapers', 'r...",article,Express,U.S.,,"Gideon Cody, who orchestrated a widely critici...","The police chief of Marion, Kansas, was suspen...","Gideon Cody, who orchestrated a widely critici..."
4,"Chris Snow, Hockey Executive Who Publicly Face...",2023-10-01T01:38:51+0000,"[{'firstname': 'Sopan', 'middlename': None, 'l...",729,"[{'name': 'persons', 'value': 'Snow, Chris', '...",article,Sports,Sports,,The assistant general manager of the Calgary F...,"Chris Snow, the assistant general manager of t...",The assistant general manager of the Calgary F...


# Lemmatization (be consistent: use the same process as in the previous notebooks!)

In [16]:
# Lemmatization (optional step) - currently DISABLED.
# The entire cell body is wrapped in a triple-quoted string, so none of the
# code below is executed; the cell only evaluates to (and displays) that string.
# If the quotes were removed, the code would load spaCy's "en_core_web_sm"
# model, lemmatize `key_text`, strip non-word characters, collapse repeated
# whitespace, and store the result in a new `keytext_lemma` column.
""" import spacy
import re
nlp = spacy.load("en_core_web_sm")

def lemmatizeAbstracts(x):
    doc = nlp(x)
    new_text = []
    for token in doc:
        new_text.append(token.lemma_)
    text_string = " ".join(new_text)
    # getting rid of non-word characters
    text_string = re.sub(r"[^\w\s]+", "", text_string)
    text_string = re.sub(r"\s{2,}", " ", text_string)
    return text_string

textdf_sub["keytext_lemma"] = textdf_sub["key_text"].apply(lemmatizeAbstracts)
textdf_sub.loc[:2, ['key_text', 'keytext_lemma']] """

' import spacy\nimport re\nnlp = spacy.load("en_core_web_sm")\n\ndef lemmatizeAbstracts(x):\n    doc = nlp(x)\n    new_text = []\n    for token in doc:\n        new_text.append(token.lemma_)\n    text_string = " ".join(new_text)\n    # getting rid of non-word characters\n    text_string = re.sub(r"[^\\w\\s]+", "", text_string)\n    text_string = re.sub(r"\\s{2,}", " ", text_string)\n    return text_string\n\ntextdf_sub["keytext_lemma"] = textdf_sub["key_text"].apply(lemmatizeAbstracts)\ntextdf_sub.loc[:2, [\'key_text\', \'keytext_lemma\']] '

In [17]:
# Seed handed to UMAP as `random_state` so that reruns give the same layout
rs = 43

# Name of the data-frame column that holds the embeddings to be reduced
src_embeddings = 'doc_vec'

In [18]:
# Settings for the dimensionality reduction
dmeasure = 'euclidean'  # name of the distance metric
rdims = 4               # number of dimensions kept after reduction ("reduced dims")

# Echo the chosen settings so they are recorded in the notebook output
print(f"UMAP dimensionality reduction to {rdims} dimensions with '{dmeasure}' distance measure.")

UMAP dimensionality reduction to 4 dimensions with 'euclidean' distance measure.


In [19]:
# Extract the embedding from a list-type column
# in the source data frame using this function
def x_from_df(df:pd.DataFrame, col:str='Embedding') -> pd.DataFrame:
    """Expand a column of list-like embeddings into one column per dimension.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; each cell of ``df[col]`` is expected to hold one
        embedding as a list-like of equal length.
    col : str
        Name of the column containing the embeddings.

    Returns
    -------
    pd.DataFrame
        Frame indexed like ``df`` with columns ``E0 .. E{n-1}``, where ``n``
        is the length of the first row's embedding. If ``df`` has no rows,
        an empty frame carrying ``df``'s index is returned.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        # Same exception type as a plain df[col] lookup, but the message says
        # what is actually available, which makes a missing embedding column
        # much quicker to diagnose.
        raise KeyError(
            f"Embedding column '{col}' not found; available columns: {list(df.columns)}")

    if len(df) == 0:
        # No first row to measure the embedding width from.
        return pd.DataFrame(index=df.index)

    width = len(df[col].iloc[0])
    cols = [f"E{i}" for i in range(width)]
    return pd.DataFrame(df[col].tolist(), columns=cols, index=df.index)

# Pull the raw embedding vectors (one column per dimension) out of the column
# named by `src_embeddings`.
# NOTE: `textdf_sub` must already contain that column; the cells shown above do
# not create it, and without it this line raises a KeyError.
X = x_from_df(textdf_sub, col=src_embeddings)

# Create and apply a UMAP 'reducer'.
# `metric` is set from `dmeasure` so the distance measure announced in the
# settings cell is the one actually used (previously it was never passed on,
# so UMAP silently used its default).
reducer = umap.UMAP(
    n_neighbors=25,
    min_dist=0.01,
    n_components=rdims,
    metric=dmeasure,
    random_state=rs)

X_embedded = reducer.fit_transform(X)

# Wrap the reduced coordinates in a data frame aligned with `textdf_sub`,
# one column per reduced dimension (Dim 1 ... Dim n).
dfe = pd.DataFrame(
    X_embedded,
    columns=[f"Dim {i+1}" for i in range(X_embedded.shape[1])],
    index=textdf_sub.index)

KeyError: 'doc_vec'

In [None]:
# Display which embedding column name is configured
# (presumably to investigate the KeyError raised by the previous cell - confirm or remove)
src_embeddings

'doc_vec'