In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertModel
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

category_map = {
    'Birding': 'Birding',
    'Nature': 'Nature',
    'Arts & Crafts': 'Arts',
    'Concerts': 'Concerts',
    'Dance': 'Dance',
    'Best for Kids': 'Kids',
    'Free Summer Concerts': 'Concerts',
    'Games': 'Games',
    'Volunteer': 'Volunteer',
    'Waterfront': 'Waterfront',
    'Fitness': 'Fitness',
    'Food': 'Food',
    'Outdoor Fitness': 'Fitness',
    'Education': 'Education',
    'Festivals': 'Festivals',
    'Fall Festivals': 'Festivals',
    'GreenThumb Events': 'Community Events',
    'GreenThumb Workshops': 'Workshops',
    'GreenThumb Partner Events': 'Community Events',
    'History': 'History',
    'Tours': 'Tours',
    'Northern Manhattan Parks': 'Nature',
    'Fort Tryon Park Trust': 'Nature',
    'Talks': 'Talks',
    'Art': 'Arts',
    'Dogs': 'Dogs',
    'MillionTreesNYC: Volunteer: Tree Stewardship and Care': 'Volunteer',
    'Historic House Trust Sites': 'History',
    'Seniors': 'Seniors',
    'Halloween': 'Holidays',
    'Freshkills Park': 'Nature',
    'Freshkills Featured Events': 'Community Events',
    'Freshkills Tours': 'Tours',
    'Theater': 'Theater',
    'Film': 'Film',
    'MillionTreesNYC: Volunteer: Tree Planting': 'Volunteer',
    'Historic House Trust Festival': 'Festivals',
    'Sports': 'Sports',
    'Astronomy': 'Astronomy',
    'Arts, Culture & Fun Series': 'Arts',
    'Pumpkin Fest': 'Festivals',
    'Winter Holidays': 'Holidays',
    'Fall Foliage': 'Nature',
    'Accessible': 'Accessible',
    'Reforestation Stewardship': 'Nature',
    "It's My Park": 'Nature',
    'Urban Park Rangers': 'Community Events',
    'Thanksgiving': 'Holidays',
    "New Year's Eve": 'Holidays',
    'Poe Park Visitor Center': 'Nature',
    'City Parks Foundation': 'Community Events',
    'Markets': 'Markets',
    'Black History Month': 'Cultural Celebrations',
    "Saint Patrick's Day": 'Cultural Celebrations',
    "Valentine's Day": 'Cultural Celebrations',
    'Bike Month NYC': 'Sports',
    'Earth Day & Arbor Day': 'Cultural Celebrations',
    'Girls and Women in Sports': 'Sports',
    "Women's History Month": 'Cultural Celebrations',
    'Easter': 'Cultural Celebrations',
    'Fishing': 'Fishing',
    "Mother's Day": 'Cultural Celebrations',
    'Summer on the Hudson': 'Water Sports',
    'Free Summer Movies': 'Film',
    'Free Summer Theater': 'Theater',
    'Shape Up New York': 'Fitness',
    'Learn To Ride': 'Sports',
    'Holiday Lightings': 'Holidays',
    'School Break': 'Kids',
    'Kids Week': 'Kids',
    'Kayaking and Canoeing': 'Water Sports',
    "Father's Day": 'Cultural Celebrations',
    'SummerStage': 'Concerts',
    'CityParks Kids Arts': 'Kids',
    'Mobile Recreation Van Event': 'Community Events',
    'Wildflower Week': 'Nature',
    'Fourth of July': 'Cultural Celebrations',
    'LGBTQ Pride Month': 'Cultural Celebrations',
    'Partnerships for Parks Tree Workshops': 'Workshops',
    'Martin Luther King Jr. Day of Service': 'Cultural Celebrations',
    'Open House New York': 'Community Events',
    'Shakespeare in the Parks': 'Theater',
    'Running': 'Sports',
    'Forest Park Trust': 'Nature',
    'Summer Sports Experience': 'Sports',
    'Bocce Tournament': 'Games',
    "Santa's Coming to Town": 'Holidays',
    'City Parks Foundation Adults': 'Community Events',
    'Partnerships for Parks Training and Grant Deadlines': 'Community Events',
    'Community Input Meetings': 'Community Events',
    'D/M/WBE': 'Community Events',
    'Lunar New Year': 'Cultural Celebrations',
    'Hiking': 'Hiking',
    'Family Camping': 'Camping',
    'Fireworks': 'Holidays',
    'Ocean Breeze Summer Fitness': 'Fitness',
    'Wildlife': 'Wildlife',
    'Movies Under the Stars': 'Outdoor Movies',
    'National Night Out': 'Community Events',
    'Ocean Breeze Track & Field Athletic Complex': 'Sports',
    'Living With Deer in New York City': 'Wildlife',
    'Bronx River Greenway': 'Nature',
    'Cherry Blossom Festivals': 'Festivals',
    'CityParks PuppetMobile': 'Kids',
    'Art in the Parks: UNIQLO Park Expressions Grant': 'Arts',
    'Parks Without Borders': 'Nature',
    'Community Parks Initiative': 'Community Events',
    'Anchor Parks': 'Nature',
    "She's On Point": 'Sports',
    'National Trails Day': 'Nature',
    'NYC Parks Senior Games': 'Sports',
    'Hispanic Heritage Month': 'Cultural Celebrations',
    'City of Water Day': 'Cultural Celebrations',
    'Art in the Parks: Celebrating 50 Years': 'Arts',
    'Brooklyn Beach Sports Festival': 'Sports',
    'Youth Tech Workshops': 'Workshops',
    'Native American Heritage Month': 'Cultural Celebrations',
    'GreenThumb 40th Anniversary': 'Community Events',
    'My Summer House NYC': 'Community Events',
    'Rockaway Beach': 'Waterfront',
    'Cool Pools NYC': 'Community Events',
    'Recreation Center Open House': 'Community Events',
    'Workshops': 'Workshops',
    'Dogs in Parks: Town Hall': 'Community Events',
    'Summer Solstice Celebrations': 'Cultural Celebrations',
    'Veterans Day': 'Cultural Celebrations',
}

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanzy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanzy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

def bert_feature_extraction(text):
    with tf.device('/GPU:0'):
        inputs = tokenizer.encode_plus(text, return_tensors='tf', add_special_tokens=True, max_length=250, truncation=True, padding='max_length')
        outputs = model(inputs)
        return outputs.last_hidden_state[:,0,:].numpy()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [26]:
def get_new_category_name(old_category_name):
    return category_map.get(old_category_name, "Other")
    
event_categories = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\Notebooks\NYC_Parks_Events_Listing___Event_Categories.csv')
event_categories_unique = event_categories.drop_duplicates(subset='event_id')
event_listing = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\Notebooks\NYC_Parks_Events_Listing___Event_Listing.csv')
merged_data = pd.merge(event_categories_unique, event_listing, on="event_id", how="left")
merged_data_cleaned = merged_data.dropna()

merged_data_cleaned['new_category_name'] = merged_data_cleaned['name'].apply(get_new_category_name)
merged_data_cleaned['concat_event_name'] = merged_data_cleaned['title'] + " " + merged_data_cleaned['snippet']

events = list(zip(merged_data_cleaned['concat_event_name'], merged_data_cleaned['new_category_name']))
event_names, labels = zip(*events)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_cleaned['new_category_name'] = merged_data_cleaned['name'].apply(get_new_category_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_cleaned['concat_event_name'] = merged_data_cleaned['title'] + " " + merged_data_cleaned['snippet']


In [27]:
print("Event Names:", event_names[:5])  # Display first 5 event names
print("Labels:", labels[:5])  # Display first 5 labels

Event Names: ('Bird Walks at The New York Botanical Garden The diverse habitats of the Botanical Garden offer visitors a chance to see dozens of species of birds throughout the year. Bring your binoculars and walk the Garden grounds with an expert.', 'Bird Walks at The New York Botanical Garden The diverse habitats of the Botanical Garden offer visitors a chance to see dozens of species of birds throughout the year. Bring your binoculars and walk the Garden grounds with an expert.', 'Bird Walks at The New York Botanical Garden The diverse habitats of the Botanical Garden offer visitors a chance to see dozens of species of birds throughout the year. Bring your binoculars and walk the Garden grounds with an expert.', 'Bird Walks at The New York Botanical Garden The diverse habitats of the Botanical Garden offer visitors a chance to see dozens of species of birds throughout the year. Bring your binoculars and walk the Garden grounds with an expert.', 'Bird Walks at The New York Botanical 

In [28]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)

    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    
    return ' '.join(tokens)

event_names_preprocessed = [preprocess_text(name) for name in event_names]

In [29]:
print(event_names_preprocessed[:5])

['bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .', 'bird walk new york botan garden divers habitat botan garden offer visitor chanc see dozen speci bird throughout year . bring binocular walk garden ground expert .']


In [30]:
grouped_data = merged_data_cleaned.groupby('new_category_name')
train_data = []
test_data = []

In [32]:
for name, group in grouped_data:
    selected_samples = group

    event_names_in_group = selected_samples['concat_event_name'].apply(preprocess_text)
    labels_in_group = selected_samples['new_category_name']

    X_group = [bert_feature_extraction(text) for text in event_names_in_group]
    X_group = [item[0] for item in X_group]

    if len(X_group) > 1:
        X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(X_group, labels_in_group, test_size=0.2, random_state=42)
        train_data.extend(list(zip(X_train_group, y_train_group)))
        test_data.extend(list(zip(X_test_group, y_test_group)))
    else:
        # If there is only one sample in the group, add it to the training data
        train_data.extend(list(zip(X_group, labels_in_group)))

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

In [33]:
classifier = LogisticRegression(max_iter=2000)
classifier.fit(X_train, y_train)

In [34]:
y_pred = classifier.predict(X_test)

label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)

print(classification_report(y_true_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=1))

                  precision    recall  f1-score   support

      Accessible       0.60      0.75      0.67        16
            Arts       0.89      1.00      0.94        78
         Birding       1.00      1.00      1.00        14
Community Events       0.40      0.40      0.40        10
        Concerts       0.67      0.67      0.67         6
           Dance       1.00      0.67      0.80         6
            Dogs       1.00      0.00      0.00         2
       Education       0.52      0.87      0.65       154
       Festivals       1.00      1.00      1.00         2
         Fitness       0.67      0.67      0.67        12
            Food       1.00      0.00      0.00         4
           Games       1.00      0.67      0.80         6
         History       0.32      0.14      0.19        58
        Holidays       1.00      0.00      0.00         6
            Kids       0.67      0.80      0.73        10
         Markets       1.00      0.00      0.00         2
          Nat

In [35]:
from joblib import dump

# Save the model to a file
dump(classifier, 'KOJA-CR.joblib')

['KOJA-CR.joblib']