In [39]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertModel
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

category_map = {
    'Conference': 'Educational Events',
    'Seminar': 'Educational Events',
    'Training': 'Educational Events',
    'Webinar': 'Online Events',
    'Panel': 'Discussions & Panels',
    'Keynote': 'Performances & Shows',
    'Symposium': 'Discussions & Panels',
    'Exhibition': 'Exhibitions & Fairs',
    'Launch': 'Launch & Networking Events',
    'Networking': 'Launch & Networking Events',
    'Meeting': 'Discussions & Panels',
    'Retreat': 'Retreats',
    'Hackathon': 'Hackathons',
    'Dinner': 'Food & Culinary Events',
    'Charity': 'Charity & Fundraising Events',
    'Fundraising': 'Charity & Fundraising Events',
    'Awards': 'Awards & Recognition Events',
    'Concert': 'Performances & Shows',
    'Festival': 'Festivals',
    'Performance': 'Performances & Shows',
    'Theater': 'Performances & Shows',
    'Screening': 'Other',
    'Dance': 'Performances & Shows',
    'Comedy': 'Performances & Shows',
    'Sports': 'Sports & Fitness Events',
    'Marathon': 'Sports & Fitness Events',
    'Tournament': 'Sports & Fitness Events',
    'Class': 'Special Occasions',
    'Lecture': 'Educational Events',
    'Reading': 'Educational Events',
    'Poetry': 'Poetry & Literary Events',
    'Fashion': 'Exhibitions & Fairs',
    'Food': 'Food & Culinary Events',
    'Tasting': 'Food & Culinary Events',
    'Cultural': 'Religious & Cultural Events',
    'Fair': 'Exhibitions & Fairs',
    'Parade': 'Religious & Cultural Events',
    'Wedding': 'Celebrations & Parties',
    'Party': 'Celebrations & Parties',
    'Anniversary': 'Celebrations & Parties',
    'Birthday': 'Celebrations & Parties',
    'Shower': 'Celebrations & Parties',
    'Graduation': 'Celebrations & Parties',
    'Reunion': 'Celebrations & Parties',
    'Retirement': 'Celebrations & Parties',
    'Holiday': 'Celebrations & Parties',
    'Religious': 'Religious & Cultural Events',
    'Run/Walk': 'Sports & Fitness Events',
    'Volunteer': 'Community & Volunteer Events',
    'Community': 'Community & Volunteer Events',
    'Cleanup': 'Community & Volunteer Events',
    'Rally': 'Community & Volunteer Events',
    'Protest': 'Protests & Movements',
    'Workshop': 'Educational Events',
    'Recruitment': 'Recruitment & Mentorship',
    'Mentorship': 'Recruitment & Mentorship',
    'Auction': 'Charity & Fundraising Events',
    'Cooking': 'Food & Culinary Events'
}

In [40]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanzy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanzy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

def bert_feature_extraction(text):
    with tf.device('/GPU:0'):
        inputs = tokenizer.encode_plus(text, return_tensors='tf', add_special_tokens=True, max_length=250, truncation=True, padding='max_length')
        outputs = model(inputs)
        return outputs.last_hidden_state[:,0,:].numpy()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [42]:
event_categories = pd.read_csv(r'C:\Users\vanzy\OneDrive\Documents\GitHub\Koja\AI\Notebooks\Custom_Training_Data.csv')
event_categories['category'] = event_categories['category'].str.strip()
event_categories['category'] = event_categories['category'].map(category_map).fillna('Other')

events = list(zip(event_categories['event_title'], event_categories['category']))
event_names, labels = zip(*events)

In [43]:
print("Event Names:", event_names[:5])  # Display first 5 event names
print("Labels:", labels[:5])  # Display first 5 labels

Event Names: ('Global Data Science', 'The Future of Luxury', 'Smart Technology and Business Transformation', 'The Future of Inclusive Cloud Computing', 'Data Governance & Security')
Labels: ('Educational Events', 'Educational Events', 'Educational Events', 'Educational Events', 'Educational Events')


In [44]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)

    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    
    return ' '.join(tokens)

event_names_preprocessed = [preprocess_text(name) for name in event_names]

In [45]:
print(event_names_preprocessed[:5])

['global data scienc', 'futur luxuri', 'smart technolog busi transform', 'futur inclus cloud comput', 'data govern & secur']


In [46]:
grouped_data = event_categories.groupby('category')
train_data = []
test_data = []

In [47]:
for name, group in grouped_data:
    selected_samples = group

    event_names_in_group = selected_samples['event_title'].apply(preprocess_text)
    labels_in_group = selected_samples['category']

    X_group = [bert_feature_extraction(text) for text in event_names_in_group]
    X_group = [item[0] for item in X_group]

    if len(X_group) > 1:
        X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(X_group, labels_in_group, test_size=0.2, random_state=42)
        train_data.extend(list(zip(X_train_group, y_train_group)))
        test_data.extend(list(zip(X_test_group, y_test_group)))
    else:
        # If there is only one sample in the group, add it to the training data
        train_data.extend(list(zip(X_group, labels_in_group)))

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

In [48]:
classifier = LogisticRegression(max_iter=2000)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
y_pred = classifier.predict(X_test)

label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)

print(classification_report(y_true_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=1))

                              precision    recall  f1-score   support

 Awards & Recognition Events       0.94      0.76      0.84        21
      Celebrations & Parties       0.58      0.66      0.62       196
Charity & Fundraising Events       0.88      0.86      0.87        49
Community & Volunteer Events       0.56      0.64      0.60        99
        Discussions & Panels       0.77      0.74      0.75        91
          Educational Events       0.64      0.62      0.63        94
         Exhibitions & Fairs       0.57      0.53      0.55        73
                   Festivals       0.77      0.77      0.77        31
      Food & Culinary Events       0.71      0.75      0.73       106
                  Hackathons       0.38      0.29      0.32        21
  Launch & Networking Events       0.48      0.51      0.50        49
               Online Events       0.73      0.69      0.71        35
                       Other       0.83      0.83      0.83       115
        Performance

In [50]:
from joblib import dump

# Save the model to a file
dump(classifier, 'KOJA-CR.joblib')

['KOJA-CR.joblib']