In [1]:
import os
# The notebook expects to run from the directory that contains the
# 'juxtorpus' package. If it is not visible from here, step up one level,
# then fail loudly if it still cannot be found.
if 'juxtorpus' not in os.listdir():
    os.chdir('../')
assert 'juxtorpus' in os.listdir(), f"Working directory should be at juxtorpus. But at {os.getcwd()}"
f"Working directory: {os.getcwd()}"

'Working directory: /Users/hcha9747/workspace/juxtorpus'

# Processing

This notebook demonstrates how to process a Corpus using Processors.

In [2]:
from juxtorpus.corpus import CorpusBuilder
from pathlib import Path

# Point a CorpusBuilder at the sample tweets CSV and preview its first 5 rows.
sample_csv = Path("./assets/samples/elonmusk_tweets.csv")
builder = CorpusBuilder(sample_csv)
builder.head(5)

Unnamed: 0.1,Unnamed: 0,id,created_at,doc
0,0,849636868052275200,2017-04-05 14:56:29,And so the robots spared humanity ... https://...
1,1,848988730585096192,2017-04-03 20:01:01,@ForIn2020 @waltmossberg @mims @defcon_5 Exact...
2,2,848943072423497728,2017-04-03 16:59:35,"@waltmossberg @mims @defcon_5 Et tu, Walt?"
3,3,848935705057280001,2017-04-03 16:30:19,Stormy weather in Shortville ...
4,4,848416049573658624,2017-04-02 06:05:23,@DaveLeeBBC @verge Coal is dying due to nat ga...


In [3]:
# List the columns found in the CSV and which of them have been added to the
# builder so far (none yet, per the empty 'Added' row in the output).
builder.show_columns()

Unnamed: 0,0,1,2,3
All Columns,Unnamed: 0,id,created_at,doc
Added,,,,


In [4]:
# Configure the builder, then build the corpus.
# 'doc' is the CSV column that holds the tweet text.
builder.set_text_column('doc')
# Only use 100 rows to keep the demo fast (the summary reports 100 documents).
builder.set_nrows(100)
# Carry the 'created_at' column along as metadata, requested with dtype 'datetime'.
builder.add_meta('created_at', dtype='datetime')
corpus = builder.build()

# Word and document counts for the freshly built corpus.
corpus.summary()

Number of words           1386
Number of unique words     752
Number of documents        100
Name: frequency, dtype: uint64

### Process corpus with spaCy

In [5]:
import spacy, pathlib

# Load the small English spaCy pipeline without its 'ner' component.
# The first argument can also be a data directory you specify, but make sure
# the components it requires are available in the global registry.
nlp = spacy.load('en_core_web_sm', exclude=['ner'])

In [6]:
from juxtorpus.corpus.processors import SpacyProcessor

# Add the custom hashtag extractor to the pipeline by name
# (see juxtorpus.corpus.processors). The guard keeps this cell re-runnable:
# adding a component that is already in the pipeline raises an error.
if 'extract_hashtags' not in nlp.pipe_names:
    nlp.add_pipe('extract_hashtags')

# Run the spaCy pipeline over the corpus.
# NOTE(review): `corpus` is rebound to the processed result, so re-running this
# cell feeds the already processed corpus back in - confirm that is supported,
# otherwise rebuild the corpus first.
spacy_processor = SpacyProcessor(nlp)
corpus = spacy_processor.run(corpus)

f"Corpus type is now {type(corpus)}."

"Corpus type is now <class 'juxtorpus.corpus.SpacyCorpus'>."

In [7]:
# Show the metadata attached to the corpus: 'created_at' from the builder plus
# 'extract_hashtags', which appears after the spaCy processing step.
corpus.metas()

frozendict.frozendict({'created_at': <SeriesMeta [Id: created_at]>, 'extract_hashtags': <DocMeta [Id: extract_hashtags, Attribute: hashtags]})

In [8]:
# Preview the first 5 entries of the 'extract_hashtags' meta alongside the text.
corpus.metas().get('extract_hashtags').head(5)

Unnamed: 0,text,extract_hashtags
0,And so the robots spared humanity ... https://...,[]
1,@ForIn2020 @waltmossberg @mims @defcon_5 Exact...,[]
2,"@waltmossberg @mims @defcon_5 Et tu, Walt?",[]
3,Stormy weather in Shortville ...,[]
4,@DaveLeeBBC @verge Coal is dying due to nat ga...,[]


In [9]:
from juxtorpus.corpus import CorpusSlicer

# Keep only the documents whose 'extract_hashtags' meta is non-empty.
slicer = CorpusSlicer(corpus)
sliced = slicer.filter_by_condition('extract_hashtags', lambda hashtags: len(hashtags) > 0)

# A backslash cannot appear inside an f-string expression before Python 3.12,
# so the newline separator is bound to a name first.
_newline = '\n'
num_with_hashtags = len(sliced)
hashtag_texts = _newline.join(d.text for d in sliced)
print(f"There are {num_with_hashtags} documents with hashtags...\n{hashtag_texts}")

There are 3 documents with hashtags...
@jDaz Because I was a dumb idiot and didn't realize at the time that it would cause confusion #elonmusk
@varunfatehpuria Oh Model Y is coming too in a few years. Kinda have to. #elonmusk
Where are the #%*&gt; aliens? https://t.co/FDuJIdwgrN


In [10]:
# Narrow the hashtag slice further: keep only documents tagged #elonmusk.
sliced = CorpusSlicer(sliced).filter_by_item('extract_hashtags', '#elonmusk')

# Print each remaining document on its own line.
for doc in sliced:
    print(doc)

@jDaz Because I was a dumb idiot and didn't realize at the time that it would cause confusion #elonmusk
@varunfatehpuria Oh Model Y is coming too in a few years. Kinda have to. #elonmusk


In [11]:
# Processing history: the sliced corpus still reports the spaCy processing
# episode (timestamp and pipeline components) applied before slicing.
sliced.history()

[<ProcessEpisode> Spacy Processor processed on 2022-09-30 11:10:30.473728 with pipeline components tok2vec, tagger, parser, attribute_ruler, lemmatizer, extract_hashtags.]

In [12]:
# After spaCy processing, .texts() returns documents rather than raw strings;
# the output below shows tokenised sequences (presumably spaCy Doc objects - confirm).
sliced.texts()

76    (@jDaz, Because, I, was, a, dumb, idiot, and, ...
83    (@varunfatehpuria, Oh, Model, Y, is, coming, t...
Name: text, dtype: object