In [1]:
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

category_map = {
    'Birding': 'Birding',
    'Nature': 'Nature',
    'Arts & Crafts': 'Arts',
    'Concerts': 'Concerts',
    'Dance': 'Dance',
    'Best for Kids': 'Kids',
    'Free Summer Concerts': 'Concerts',
    'Games': 'Games',
    'Volunteer': 'Volunteer',
    'Waterfront': 'Waterfront',
    'Fitness': 'Fitness',
    'Food': 'Food',
    'Outdoor Fitness': 'Fitness',
    'Education': 'Education',
    'Festivals': 'Festivals',
    'Fall Festivals': 'Festivals',
    'GreenThumb Events': 'Community Events',
    'GreenThumb Workshops': 'Workshops',
    'GreenThumb Partner Events': 'Community Events',
    'History': 'History',
    'Tours': 'Tours',
    'Northern Manhattan Parks': 'Nature',
    'Fort Tryon Park Trust': 'Nature',
    'Talks': 'Talks',
    'Art': 'Arts',
    'Dogs': 'Dogs',
    'MillionTreesNYC: Volunteer: Tree Stewardship and Care': 'Volunteer',
    'Historic House Trust Sites': 'History',
    'Seniors': 'Seniors',
    'Halloween': 'Holidays',
    'Freshkills Park': 'Nature',
    'Freshkills Featured Events': 'Community Events',
    'Freshkills Tours': 'Tours',
    'Theater': 'Theater',
    'Film': 'Film',
    'MillionTreesNYC: Volunteer: Tree Planting': 'Volunteer',
    'Historic House Trust Festival': 'Festivals',
    'Sports': 'Sports',
    'Astronomy': 'Astronomy',
    'Arts, Culture & Fun Series': 'Arts',
    'Pumpkin Fest': 'Festivals',
    'Winter Holidays': 'Holidays',
    'Fall Foliage': 'Nature',
    'Accessible': 'Accessible',
    'Reforestation Stewardship': 'Nature',
    "It's My Park": 'Nature',
    'Urban Park Rangers': 'Community Events',
    'Thanksgiving': 'Holidays',
    "New Year's Eve": 'Holidays',
    'Poe Park Visitor Center': 'Nature',
    'City Parks Foundation': 'Community Events',
    'Markets': 'Markets',
    'Black History Month': 'Cultural Celebrations',
    "Saint Patrick's Day": 'Cultural Celebrations',
    "Valentine's Day": 'Cultural Celebrations',
    'Bike Month NYC': 'Sports',
    'Earth Day & Arbor Day': 'Cultural Celebrations',
    'Girls and Women in Sports': 'Sports',
    "Women's History Month": 'Cultural Celebrations',
    'Easter': 'Cultural Celebrations',
    'Fishing': 'Fishing',
    "Mother's Day": 'Cultural Celebrations',
    'Summer on the Hudson': 'Water Sports',
    'Free Summer Movies': 'Film',
    'Free Summer Theater': 'Theater',
    'Shape Up New York': 'Fitness',
    'Learn To Ride': 'Sports',
    'Holiday Lightings': 'Holidays',
    'School Break': 'Kids',
    'Kids Week': 'Kids',
    'Kayaking and Canoeing': 'Water Sports',
    "Father's Day": 'Cultural Celebrations',
    'SummerStage': 'Concerts',
    'CityParks Kids Arts': 'Kids',
    'Mobile Recreation Van Event': 'Community Events',
    'Wildflower Week': 'Nature',
    'Fourth of July': 'Cultural Celebrations',
    'LGBTQ Pride Month': 'Cultural Celebrations',
    'Partnerships for Parks Tree Workshops': 'Workshops',
    'Martin Luther King Jr. Day of Service': 'Cultural Celebrations',
    'Open House New York': 'Community Events',
    'Shakespeare in the Parks': 'Theater',
    'Running': 'Sports',
    'Forest Park Trust': 'Nature',
    'Summer Sports Experience': 'Sports',
    'Bocce Tournament': 'Games',
    "Santa's Coming to Town": 'Holidays',
    'City Parks Foundation Adults': 'Community Events',
    'Partnerships for Parks Training and Grant Deadlines': 'Community Events',
    'Community Input Meetings': 'Community Events',
    'D/M/WBE': 'Community Events',
    'Lunar New Year': 'Cultural Celebrations',
    'Hiking': 'Hiking',
    'Family Camping': 'Camping',
    'Fireworks': 'Holidays',
    'Ocean Breeze Summer Fitness': 'Fitness',
    'Wildlife': 'Wildlife',
    'Movies Under the Stars': 'Outdoor Movies',
    'National Night Out': 'Community Events',
    'Ocean Breeze Track & Field Athletic Complex': 'Sports',
    'Living With Deer in New York City': 'Wildlife',
    'Bronx River Greenway': 'Nature',
    'Cherry Blossom Festivals': 'Festivals',
    'CityParks PuppetMobile': 'Kids',
    'Art in the Parks: UNIQLO Park Expressions Grant': 'Arts',
    'Parks Without Borders': 'Nature',
    'Community Parks Initiative': 'Community Events',
    'Anchor Parks': 'Nature',
    "She's On Point": 'Sports',
    'National Trails Day': 'Nature',
    'NYC Parks Senior Games': 'Sports',
    'Hispanic Heritage Month': 'Cultural Celebrations',
    'City of Water Day': 'Cultural Celebrations',
    'Art in the Parks: Celebrating 50 Years': 'Arts',
    'Brooklyn Beach Sports Festival': 'Sports',
    'Youth Tech Workshops': 'Workshops',
    'Native American Heritage Month': 'Cultural Celebrations',
    'GreenThumb 40th Anniversary': 'Community Events',
    'My Summer House NYC': 'Community Events',
    'Rockaway Beach': 'Waterfront',
    'Cool Pools NYC': 'Community Events',
    'Recreation Center Open House': 'Community Events',
    'Workshops': 'Workshops',
    'Dogs in Parks: Town Hall': 'Community Events',
    'Summer Solstice Celebrations': 'Cultural Celebrations',
    'Veterans Day': 'Cultural Celebrations',
}

In [3]:
def get_new_category_name(old_category_name):
    return category_map.get(old_category_name, "Other")

if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe('textcat', last=True)
else:
    textcat = nlp.get_pipe('textcat')

for label in category_map.values():
    textcat.add_label(label)
textcat.add_label("Other")

1

In [4]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    
    doc = nlp.make_doc(text)
    tokens = [token.text for token in doc if not token.is_stop]

    return ' '.join(tokens)

In [5]:
event_categories = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\Notebooks\NYC_Parks_Events_Listing___Event_Categories.csv')
event_categories_unique = event_categories.drop_duplicates(subset='event_id')
event_listing = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\Notebooks\NYC_Parks_Events_Listing___Event_Listing.csv')
merged_data = pd.merge(event_categories_unique, event_listing, on="event_id", how="left")
merged_data_cleaned = merged_data.dropna()

merged_data_cleaned = merged_data_cleaned.copy()
merged_data_cleaned.loc[:, 'new_category_name'] = merged_data_cleaned['name'].apply(get_new_category_name)
merged_data_cleaned.loc[:, 'concat_event_name'] = merged_data_cleaned['title'] + " " + merged_data_cleaned['snippet']

event_names = merged_data_cleaned['concat_event_name'].apply(preprocess_text)
labels = merged_data_cleaned['new_category_name']
events = list(zip(event_names, labels))

In [6]:
print(event_names)
print(labels)

0        Bird Walks New York Botanical Garden diverse h...
1        Bird Walks New York Botanical Garden diverse h...
2        Bird Walks New York Botanical Garden diverse h...
3        Bird Walks New York Botanical Garden diverse h...
4        Bird Walks New York Botanical Garden diverse h...
                               ...                        
91597    Central Park Tour : Iconic Views Central Park ...
91615    Battle Brooklyn Neighborhood Walk Walk Battle ...
91618    Central Park Tour : Iconic Views Central Park ...
91644    Central Park Tour : Iconic Views Central Park ...
91679    Central Park Tour : Iconic Views Central Park ...
Name: concat_event_name, Length: 1461, dtype: object
0        Birding
1        Birding
2        Birding
3        Birding
4        Birding
          ...   
91597      Tours
91615    History
91618      Tours
91644      Tours
91679      Tours
Name: new_category_name, Length: 1461, dtype: object


In [11]:
def prepare_spacy_data(data, textcat_labels):
    processing = []
    for text, cat in data:
        categories = {label: 1.0 if label == cat else 0.0 for label in textcat_labels}
        processing.append((text, {"cats": categories}))
    return processing

grouped_data = merged_data_cleaned.groupby('new_category_name')
train_data = []
test_data = []

In [12]:
for name, group in grouped_data:
    event_names_in_group = group['concat_event_name'].apply(preprocess_text)
    labels_in_group = group['new_category_name']

    data_group = list(zip(event_names_in_group, labels_in_group))
    
    data_group_spacy = prepare_spacy_data(data_group, textcat.labels)

    if len(data_group_spacy) > 1:
        train_data_group, test_data_group = train_test_split(data_group_spacy, test_size=0.2, random_state=42)
        train_data.extend(train_data_group)
        test_data.extend(test_data_group)
    else:
        train_data.extend(data_group_spacy)

In [13]:
from spacy.training import Example
from spacy.util import minibatch
import random

random.seed(42)
spacy.util.fix_random_seed(42)

# Create a function to generate training data examples
def get_examples():
    for text, annotations in train_data:
        yield Example.from_dict(nlp.make_doc(text), annotations)

# Initialize the model with the training data
nlp.initialize(get_examples)

optimizer = nlp.create_optimizer()

n_iter = 10
for epoch in range(n_iter):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=8)
    for batch in batches:
        texts, annotations = zip(*batch)
        examples = [Example.from_dict(nlp.make_doc(text), annot) for text, annot in zip(texts, annotations)]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")


Epoch 1, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 2.9385533751919866}
Epoch 2, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 2.3609398249536753}
Epoch 3, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 2.164577564690262}
Epoch 4, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 2.027113255811855}
Epoch 5, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1.9883351824246347}
Epoch 6, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1.9208818479346519}
Epoch 7, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1.8621944973128848}
Epoch 8, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1.8305297811166383}
Epoch 9, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1.8210100555988902}
Epoch 10, Losses: {'tok2vec': 0.0, 'tag

In [17]:
def evaluate_model(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            print(f"Label: {label}, Score: {score}, Gold: {gold[label]}")  # Debug information
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [18]:
test_texts, test_cats = zip(*test_data)
scores = evaluate_model(nlp.tokenizer, textcat, test_texts, test_cats)
print(scores)

{'textcat_p': 0.0, 'textcat_r': 0.0, 'textcat_f': 0.0}


In [30]:
from sklearn.metrics import classification_report

# Get the true labels and the predicted labels from the model
true_labels = [max(cat['cats'], key=cat['cats'].get) for cat in test_cats]
predicted_labels = [max(doc.cats, key=doc.cats.get) for doc in textcat.pipe(nlp.tokenizer.pipe(test_texts))]

# Get the labels from the textcat
labels = list(textcat.labels)

# Print the classification report
print(classification_report(true_labels, predicted_labels, labels=labels, zero_division=1))


                       precision    recall  f1-score   support

              Birding       1.00      1.00      1.00         7
               Nature       0.72      0.62      0.67        55
                 Arts       0.85      1.00      0.92        39
             Concerts       0.60      1.00      0.75         3
                Dance       1.00      0.67      0.80         3
                 Kids       0.29      0.40      0.33         5
                Games       1.00      0.67      0.80         3
            Volunteer       0.67      1.00      0.80         2
           Waterfront       1.00      1.00      1.00         0
              Fitness       0.57      0.67      0.62         6
                 Food       1.00      0.00      0.00         2
            Education       0.47      0.84      0.60        77
            Festivals       0.33      1.00      0.50         1
     Community Events       0.33      0.40      0.36         5
            Workshops       1.00      1.00      1.00  

In [28]:
print(textcat.labels)

('Birding', 'Nature', 'Arts', 'Concerts', 'Dance', 'Kids', 'Games', 'Volunteer', 'Waterfront', 'Fitness', 'Food', 'Education', 'Festivals', 'Community Events', 'Workshops', 'History', 'Tours', 'Talks', 'Dogs', 'Seniors', 'Holidays', 'Theater', 'Film', 'Sports', 'Astronomy', 'Accessible', 'Markets', 'Cultural Celebrations', 'Fishing', 'Water Sports', 'Hiking', 'Camping', 'Wildlife', 'Outdoor Movies', 'Other')
