In [1]:
import pandas as pd
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import spacy
from nltk.corpus import stopwords     # list of common English filler words 

# Fetch the NLTK 'stopwords' and 'punkt' resources used by the nltk imports in
# this notebook (the log below reports both as already up to date).
nltk.download('stopwords')
nltk.download('punkt')
import spacy.cli 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/echoes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/echoes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the article table, discard rows with no value in the 'text' column,
# and renumber the index from 0 so it stays contiguous after the drop.
df = (
    pd.read_csv("artist_data.csv")
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

In [None]:
def remove_markdown(text):
    """
    Strip common Markdown syntax from ``text`` and return the plain text.

    Handled, in this order: images (replaced by their alt text), links
    (replaced by their link text), bold, italic, leading heading markers,
    and the literal "```markdown" fence opener.
    """
    # Images must go first: the link pattern also matches the "[alt](url)"
    # tail of "![alt](url)" and would otherwise leave a stray "!" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)    # images -> alt text
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)     # links -> link text
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)           # bold
    text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)              # italic
    # Only spaces/tabs around the '#' run are consumed. Using \s here would
    # also swallow the newline of a preceding blank line and glue the heading
    # onto the previous paragraph.
    text = re.sub(r'^[ \t]*#+[ \t]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'```markdown', '', text)                   # ```markdown fence opener
    return text

# New column: the raw 'text' with Markdown syntax stripped by remove_markdown.
df['no_md_text'] = df['text'].apply(remove_markdown)

In [4]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [5]:
from nltk.tokenize import word_tokenize
import spacy

# Load the en_core_web_md spaCy pipeline with the "parser" and "ner"
# components disabled; preprocess_text below only reads lemma_, is_stop and
# is_punct from each token.
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])


def preprocess_text(text):
    """
    Normalise one document into a space-separated string of lemmas.

    Steps applied to ``text``:
    - lowercase all of the text
    - remove every character that is not a word character or whitespace
    - tokenize and lemmatize with the module-level spaCy pipeline ``nlp``
    - drop stopwords, punctuation tokens and whitespace-only tokens

    Returns the remaining lemmas joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    # tokenize + lemmatize in a single spaCy call
    doc = nlp(text)

    # Runs of whitespace (e.g. the "\n\n" paragraph separators) are emitted by
    # spaCy as tokens of their own; they are neither stopwords nor punctuation,
    # so they must be filtered explicitly or they end up in the output string.
    lemmas_tokens = [
        word.lemma_
        for word in doc
        if not (word.is_stop or word.is_punct or word.is_space)
    ]

    return ' '.join(lemmas_tokens)    # filtered lemmas as a single string

# extract meaningful text: only use paragraphs that contain at least a minimum number of words 
def remove_short_paragraphs(text, min_word_count = 10):
    """
    Drop every paragraph of ``text`` that has fewer than ``min_word_count``
    whitespace-separated words.

    Paragraphs are delimited by two consecutive newline characters. The
    surviving paragraphs are returned in their original order, joined by the
    same delimiter.
    """
    separator = "\n\n"
    kept = [
        paragraph
        for paragraph in text.split(separator)
        if not len(paragraph.split()) < min_word_count
    ]
    return separator.join(kept)

#-----------------------------------------------------------------
# 'clean_text': Markdown-free text with short paragraphs removed
# (default min_word_count of remove_short_paragraphs).
df['clean_text'] = df['no_md_text'].apply(remove_short_paragraphs)
# 'cleaner_text': 'clean_text' run through preprocess_text (lowercased,
# punctuation stripped, lemmatized, stopwords removed).
df['cleaner_text'] = df['clean_text'].apply(preprocess_text)

#df

## Get word count and lexical diversity of each article 

In [6]:
from nltk.tokenize import word_tokenize 
from textblob import TextBlob 

def get_word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    words = text.split()
    return len(words)

def get_lexical_diversity(text):
    """
    Lexical diversity measures the relative number of unique words in text:
    (number of distinct whitespace-separated tokens) / (total number of tokens).

    Returns a float in (0, 1] for non-empty text, and 0.0 when ``text``
    contains no tokens (empty or whitespace-only), instead of raising
    ZeroDivisionError.
    """
    tokens = text.split()

    # A document whose paragraphs were all filtered out arrives here as an
    # empty string; there is nothing to divide by, so report 0.0.
    if not tokens:
        return 0.0

    unique_tokens = set(tokens)   # a set keeps only the distinct values
    return len(unique_tokens) / len(tokens)


# Per-article metrics, all computed on the preprocessed 'cleaner_text':
# word count, lexical diversity (unique tokens / total tokens), and
# character count (length of the preprocessed string).
df['word_count'] = df['cleaner_text'].apply(get_word_count)
df['lexical_diversity'] = df['cleaner_text'].apply(get_lexical_diversity)

df['char_count'] = df['cleaner_text'].apply(len)

### Split articles into 500-word sections 

In [None]:
import math 

# Maximum number of words per section; the final section of an article holds
# whatever is left over and may be shorter.
SECTION_SIZE = 500

titles = []   # one record (dict) per section, across all articles

for _, row in df.iterrows():
    words = row['cleaner_text'].split()

    # Walk the word list in SECTION_SIZE strides: start = 0, SECTION_SIZE,
    # 2 * SECTION_SIZE, ... An article with no words produces no sections.
    for section_id, start in enumerate(range(0, len(words), SECTION_SIZE), start=1):
        chunk_words = words[start:start + SECTION_SIZE]

        titles.append({
            # article-level metadata, repeated on every section of the article
            'line_number': row['line_number'],
            'year' : row['year'],
            'article_name': row['article_name'],
            'media_type' : row['media_type'],
            'specific_type': row['specific_type'],

            # section-level fields; section_id is 1-based within the article
            'section_id': section_id,
            'section_text' : " ".join(chunk_words),
            'section_word_count' : len(chunk_words)
        })

# turn the list of records into a dataframe in a single step
df_sections = pd.DataFrame(titles)

In [17]:
# Display the per-section table built in the previous cell.
df_sections

Unnamed: 0,Line Number,Year,Article Name,media type,section_id,section_text,section_word_count
0,1,2025,I make millions from AI art — but the law has ...,article,1,million ai art law fair refik anadol support f...,305
1,2,2024,AI’s assault on our intellectual property must...,article,1,use sharing tool find share button article cop...,487
2,3,2025,Photographer slams AI bots that are copying hi...,article,1,publish 730 1 mar 2025 update 1647 3 mar 2025 ...,435
3,4,2023,The problem with AI-generated art ｜ Steven Zap...,audio,1,tanya cushman reviewer reviewer imagine year f...,500
4,4,2023,The problem with AI-generated art ｜ Steven Zap...,audio,2,anybody think possible peer dismaying experien...,446
...,...,...,...,...,...,...,...
433,136,2020,THE VOICE OF THE ARTIST IN THE AGE OF THE ALGO...,paper,3,digital daily life shaw kite explore contempor...,500
434,136,2020,THE VOICE OF THE ARTIST IN THE AGE OF THE ALGO...,paper,4,states license 8 thirteen question mention ear...,500
435,136,2020,THE VOICE OF THE ARTIST IN THE AGE OF THE ALGO...,paper,5,voice feeling idea inclusive collaborative dis...,500
436,136,2020,THE VOICE OF THE ARTIST IN THE AGE OF THE ALGO...,paper,6,performance online time gamer potential custom...,500


In [18]:
# Write the per-section table to CSV; index = False leaves the row index out of the file.
df_sections.to_csv("Sections_of_Articles.csv", index = False)