In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

category_map = {
    'Birding': 'Birding',
    'Nature': 'Nature',
    'Arts & Crafts': 'Arts',
    'Concerts': 'Concerts',
    'Dance': 'Dance',
    'Best for Kids': 'Kids',
    'Free Summer Concerts': 'Concerts',
    'Games': 'Games',
    'Volunteer': 'Volunteer',
    'Waterfront': 'Waterfront',
    'Fitness': 'Fitness',
    'Food': 'Food',
    'Outdoor Fitness': 'Fitness',
    'Education': 'Education',
    'Festivals': 'Festivals',
    'Fall Festivals': 'Festivals',
    'GreenThumb Events': 'Community Events',
    'GreenThumb Workshops': 'Workshops',
    'GreenThumb Partner Events': 'Community Events',
    'History': 'History',
    'Tours': 'Tours',
    'Northern Manhattan Parks': 'Nature',
    'Fort Tryon Park Trust': 'Nature',
    'Talks': 'Talks',
    'Art': 'Arts',
    'Dogs': 'Dogs',
    'MillionTreesNYC: Volunteer: Tree Stewardship and Care': 'Volunteer',
    'Historic House Trust Sites': 'History',
    'Seniors': 'Seniors',
    'Halloween': 'Holidays',
    'Freshkills Park': 'Nature',
    'Freshkills Featured Events': 'Community Events',
    'Freshkills Tours': 'Tours',
    'Theater': 'Theater',
    'Film': 'Film',
    'MillionTreesNYC: Volunteer: Tree Planting': 'Volunteer',
    'Historic House Trust Festival': 'Festivals',
    'Sports': 'Sports',
    'Astronomy': 'Astronomy',
    'Arts, Culture & Fun Series': 'Arts',
    'Pumpkin Fest': 'Festivals',
    'Winter Holidays': 'Holidays',
    'Fall Foliage': 'Nature',
    'Accessible': 'Accessible',
    'Reforestation Stewardship': 'Nature',
    "It's My Park": 'Nature',
    'Urban Park Rangers': 'Community Events',
    'Thanksgiving': 'Holidays',
    "New Year's Eve": 'Holidays',
    'Poe Park Visitor Center': 'Nature',
    'City Parks Foundation': 'Community Events',
    'Markets': 'Markets',
    'Black History Month': 'Cultural Celebrations',
    "Saint Patrick's Day": 'Cultural Celebrations',
    "Valentine's Day": 'Cultural Celebrations',
    'Bike Month NYC': 'Sports',
    'Earth Day & Arbor Day': 'Cultural Celebrations',
    'Girls and Women in Sports': 'Sports',
    "Women's History Month": 'Cultural Celebrations',
    'Easter': 'Cultural Celebrations',
    'Fishing': 'Fishing',
    "Mother's Day": 'Cultural Celebrations',
    'Summer on the Hudson': 'Water Sports',
    'Free Summer Movies': 'Film',
    'Free Summer Theater': 'Theater',
    'Shape Up New York': 'Fitness',
    'Learn To Ride': 'Sports',
    'Holiday Lightings': 'Holidays',
    'School Break': 'Kids',
    'Kids Week': 'Kids',
    'Kayaking and Canoeing': 'Water Sports',
    "Father's Day": 'Cultural Celebrations',
    'SummerStage': 'Concerts',
    'CityParks Kids Arts': 'Kids',
    'Mobile Recreation Van Event': 'Community Events',
    'Wildflower Week': 'Nature',
    'Fourth of July': 'Cultural Celebrations',
    'LGBTQ Pride Month': 'Cultural Celebrations',
    'Partnerships for Parks Tree Workshops': 'Workshops',
    'Martin Luther King Jr. Day of Service': 'Cultural Celebrations',
    'Open House New York': 'Community Events',
    'Shakespeare in the Parks': 'Theater',
    'Running': 'Sports',
    'Forest Park Trust': 'Nature',
    'Summer Sports Experience': 'Sports',
    'Bocce Tournament': 'Games',
    "Santa's Coming to Town": 'Holidays',
    'City Parks Foundation Adults': 'Community Events',
    'Partnerships for Parks Training and Grant Deadlines': 'Community Events',
    'Community Input Meetings': 'Community Events',
    'D/M/WBE': 'Community Events',
    'Lunar New Year': 'Cultural Celebrations',
    'Hiking': 'Hiking',
    'Family Camping': 'Camping',
    'Fireworks': 'Holidays',
    'Ocean Breeze Summer Fitness': 'Fitness',
    'Wildlife': 'Wildlife',
    'Movies Under the Stars': 'Outdoor Movies',
    'National Night Out': 'Community Events',
    'Ocean Breeze Track & Field Athletic Complex': 'Sports',
    'Living With Deer in New York City': 'Wildlife',
    'Bronx River Greenway': 'Nature',
    'Cherry Blossom Festivals': 'Festivals',
    'CityParks PuppetMobile': 'Kids',
    'Art in the Parks: UNIQLO Park Expressions Grant': 'Arts',
    'Parks Without Borders': 'Nature',
    'Community Parks Initiative': 'Community Events',
    'Anchor Parks': 'Nature',
    "She's On Point": 'Sports',
    'National Trails Day': 'Nature',
    'NYC Parks Senior Games': 'Sports',
    'Hispanic Heritage Month': 'Cultural Celebrations',
    'City of Water Day': 'Cultural Celebrations',
    'Art in the Parks: Celebrating 50 Years': 'Arts',
    'Brooklyn Beach Sports Festival': 'Sports',
    'Youth Tech Workshops': 'Workshops',
    'Native American Heritage Month': 'Cultural Celebrations',
    'GreenThumb 40th Anniversary': 'Community Events',
    'My Summer House NYC': 'Community Events',
    'Rockaway Beach': 'Waterfront',
    'Cool Pools NYC': 'Community Events',
    'Recreation Center Open House': 'Community Events',
    'Workshops': 'Workshops',
    'Dogs in Parks: Town Hall': 'Community Events',
    'Summer Solstice Celebrations': 'Cultural Celebrations',
    'Veterans Day': 'Cultural Celebrations',
}

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanzy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanzy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
def get_new_category_name(old_category_name):
    return category_map.get(old_category_name, "Other")
    
event_categories = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\Notebooks\NYC_Parks_Events_Listing___Event_Categories.csv')
event_categories_unique = event_categories.drop_duplicates(subset='event_id')
event_listing = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\Notebooks\NYC_Parks_Events_Listing___Event_Listing.csv')
merged_data = pd.merge(event_categories_unique, event_listing, on="event_id", how="left")

merged_data['new_category_name'] = merged_data['name'].apply(get_new_category_name)
merged_data['concat_event_name'] = merged_data['title'] + " " + merged_data['snippet']

smaller_df = merged_data[['new_category_name', 'concat_event_name']]
smaller_df_cleaned = smaller_df.dropna(subset=['new_category_name', 'concat_event_name'])

events = list(zip(smaller_df_cleaned['concat_event_name'], smaller_df_cleaned['new_category_name']))
event_names, labels = zip(*events)

In [27]:
smaller = pd.read_csv(r'smaller_dataframe.csv')
print(smaller_df.head())


print("Event Names:", event_names[:5])  # Display first 5 event names
print("Labels:", labels[:5])  # Display first 5 labels

ParserError: Error tokenizing data. C error: Expected 2 fields in line 2176, saw 6


In [15]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)

    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    
    return ' '.join(tokens)

event_names_preprocessed = [preprocess_text(name) for name in event_names]

In [16]:
print(event_names_preprocessed[:5])
labels[:5]

['bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .']


('Birding', 'Birding', 'Birding', 'Birding', 'Birding')

In [17]:
vectorizer = TfidfVectorizer(max_features=80)
X = vectorizer.fit_transform(event_names_preprocessed)

In [21]:
grouped_data = merged_data.groupby('new_category_name')
train_data = []
test_data = []

for name, group in grouped_data:
    selected_samples = group
    
    event_names_in_group = selected_samples['concat_event_name'].apply(preprocess_text)
    labels_in_group = selected_samples['new_category_name']

    X_group = vectorizer.transform(event_names_in_group)

    X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(X_group, labels_in_group, test_size=0.2, random_state=42)

    train_data.extend(list(zip(X_train_group.toarray(), y_train_group)))
    test_data.extend(list(zip(X_test_group.toarray(), y_test_group)))

train_data_nltk = [(dict(enumerate(x)), y) for x, y in train_data]
test_data_nltk = [(dict(enumerate(x)), y) for x, y in test_data]

In [22]:
classifier = NaiveBayesClassifier.train(train_data_nltk)
print("Model accuracy:", accuracy(classifier, test_data_nltk))

Model accuracy: 0.5664324235986272


In [23]:
y_pred = [classifier.classify(x[0]) for x in test_data_nltk]
y_true = [x[1] for x in test_data_nltk]

# Encoding labels to use with classification_report
label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_true)
y_pred_encoded = label_encoder.transform(y_pred)

print(classification_report(y_true_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=1))

                       precision    recall  f1-score   support

           Accessible       0.29      0.55      0.38      1704
                 Arts       0.40      0.88      0.55      2978
            Astronomy       1.00      0.00      0.00        15
              Birding       0.83      0.41      0.55       497
              Camping       1.00      0.00      0.00         2
     Community Events       1.00      0.00      0.00       210
             Concerts       0.84      0.31      0.46       540
Cultural Celebrations       1.00      0.00      0.00        15
                Dance       0.96      0.35      0.51       327
                 Dogs       1.00      0.00      0.00        34
            Education       0.74      0.56      0.64      2108
            Festivals       1.00      0.03      0.06       181
                 Film       0.82      0.21      0.34       461
              Fishing       1.00      0.00      0.00         4
              Fitness       0.91      0.76      0.83  