In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import json


In [None]:
# Load data from JSON file.
# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# non-ASCII event titles decode the same way on every platform instead of
# depending on the machine's default locale encoding.
with open('./EventData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Create pandas dataframe from the parsed JSON
df = pd.DataFrame(data)




In [None]:
# Feature engineering
# Convert the raw 'date' values into pandas datetimes.
df['date'] = df['date'].pipe(pd.to_datetime)

def _parse_clock(clock_text):
    """Parse one ``HH:MM`` or ``HH.MM`` fragment into an ``(hour, minute)`` pair.

    Surrounding whitespace is ignored. Any part that is missing or not made
    of decimal digits falls back to 0.
    """
    clock_text = clock_text.strip()
    # A colon takes precedence; a dot is only used when no colon is present.
    separator = ':' if ':' in clock_text else '.'
    pieces = [piece.strip() for piece in clock_text.split(separator)]
    # isdecimal() accepts exactly the characters int() can convert; isdigit()
    # also accepts e.g. superscript digits, on which int() raises ValueError.
    hour = int(pieces[0]) if pieces[0].isdecimal() else 0
    minute = int(pieces[1]) if len(pieces) > 1 and pieces[1].isdecimal() else 0
    return hour, minute


def extract_start_end_hour(time_str):
    """Split a ``start-end`` time range into its four clock components.

    Parameters
    ----------
    time_str : str or any
        Text such as ``"18:00-02:00"``, ``"18.30 - 21.45"`` or ``"18-02"``.
        A hyphen, en dash or em dash may separate the two times.

    Returns
    -------
    tuple of int
        ``(start_hour, start_minute, end_hour, end_minute)``. The result is
        ``(0, 0, 0, 0)`` when the value is not a string (``None``, NaN, ...),
        is empty, or does not contain a range separator.
    """
    # NaN is truthy, so a plain truthiness check would let it reach str.split.
    if not isinstance(time_str, str) or not time_str:
        return 0, 0, 0, 0

    # Treat en/em dashes like the plain hyphen used as the range separator.
    normalized = time_str.replace('\u2013', '-').replace('\u2014', '-')
    time_components = normalized.split('-')
    if len(time_components) < 2:
        return 0, 0, 0, 0

    # The first piece is the start and the last piece is the end.
    start_hour, start_minute = _parse_clock(time_components[0])
    end_hour, end_minute = _parse_clock(time_components[-1])
    return start_hour, start_minute, end_hour, end_minute

# Parse every 'time' value into its four clock components. Missing values are
# replaced with '' first so the parser always receives a string.
parsed_times = df['time'].fillna('').apply(extract_start_end_hour).tolist()
# Assign column by column rather than unpacking zip(*...), which raises
# ValueError when the frame has no rows.
for position, column in enumerate(['start_hour', 'start_minute', 'end_hour', 'end_minute']):
    df[column] = [parts[position] for parts in parsed_times]
# Minutes from start to end, computed on the same clock day: an event whose end
# time is earlier than its start (e.g. one running past midnight) therefore
# gets a negative value.
df['duration'] = (df['end_hour'] * 60 + df['end_minute']) - (df['start_hour'] * 60 + df['start_minute'])
# The text columns are filled with '' so they contain no missing values.
df['title'] = df['title'].fillna('')
df['description'] = df['description'].fillna('')


In [None]:
# Input columns for the classifier and the label column to predict
features = ['title', 'start_hour', 'start_minute', 'end_hour', 'end_minute', 'duration', 'nation', 'description']
target = 'category'

# Hold out 20% of the rows for evaluation, with a fixed seed for the split
feature_frame = df[features]
label_series = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    label_series,
    test_size=0.2,
    random_state=42,
)

# Per-column preprocessing: TF-IDF for the two text columns, one-hot encoding
# for the nation, mean imputation for the numeric time columns
numeric_columns = ['start_hour', 'start_minute', 'end_hour', 'end_minute', 'duration']
column_transformers = [
    ('text', TfidfVectorizer(), 'title'),
    ('text_desc', TfidfVectorizer(), 'description'),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['nation']),
    ('numeric', SimpleImputer(strategy='mean'), numeric_columns),
]
preprocessor = ColumnTransformer(transformers=column_transformers, remainder='passthrough')

In [None]:
# Define the model pipeline: column preprocessing followed by a random forest.
# The forest is seeded with the same value as the train/test split so that
# re-running the notebook reproduces the same fitted model and metrics.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [None]:

# Fit the preprocessing steps and the classifier on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Score the held-out split: print its size, then the per-class report
y_pred = model.predict(X_test)
n_test_rows = len(X_test)
print(n_test_rows)
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Classify one hand-written event with the trained pipeline.
# NOTE(review): "duration" here is entered by hand, whereas the training column
# is computed from the start/end fields — confirm this value is intended.
new_data = {
    "title": [
        "Live wmusic - Tomas Rimeika & Anton \u00c5nell - at Gotlands' Pub!",
    ],
    "start_hour": [18],
    "start_minute": [0],
    "end_hour": [2],
    "end_minute": [0],
    "duration": [1000],
    "nation": ["Gotlands nation"],
    "description": [""],
}
new_events_frame = pd.DataFrame(new_data)
predicted_categories = model.predict(new_events_frame)
print("Predicted categories for new data:", predicted_categories)

In [None]:
# Load the live events to classify. The encoding is passed explicitly (JSON
# text is UTF-8) so decoding does not depend on the platform's default locale.
with open('./events.json', 'r', encoding='utf-8') as f:
    live_events_data = json.load(f)

In [None]:

# Create a DataFrame from the loaded data
live_events_df = pd.DataFrame(live_events_data)

# Feature engineering
live_events_df['date'] = pd.to_datetime(live_events_df['date'])
# The 'start_hour' column is added in a later cell, once extract_start_hour has
# been defined; calling the helper at this point raised NameError on a fresh
# kernel because its definition comes further down.


def extract_start_hour(time_str):
    """Return the starting hour of a time or time-range string as an int.

    Parameters
    ----------
    time_str : str or any
        Text such as ``"18:00"``, ``"18:00-02:00"`` or ``"18.30-21.00"``.

    Returns
    -------
    int
        The leading hour, or 0 when the value is not a string (``None``,
        NaN, ...), is empty, or does not start with a decimal number.
    """
    # NaN is truthy, so check the type instead of relying on truthiness.
    if not isinstance(time_str, str):
        return 0
    # Keep only the part before the range separator (hyphen, en or em dash).
    start_text = time_str.replace('\u2013', '-').replace('\u2014', '-').split('-')[0].strip()
    # Hours and minutes may be separated by ':' or, when no colon exists, '.'.
    separator = ':' if ':' in start_text else '.'
    hour_text = start_text.split(separator)[0].strip()
    # isdecimal() accepts exactly the characters int() can convert.
    return int(hour_text) if hour_text.isdecimal() else 0



In [None]:
# Derive the start hour of every live event from its 'time' value
live_start_hours = live_events_df['time'].apply(extract_start_hour)
live_events_df['start_hour'] = live_start_hours

# Columns handed to the live-event transformer
live_features = ['title', 'start_hour']

# TF-IDF for the title text and mean imputation for the start hour
live_transformers = [
    ('text', TfidfVectorizer(), 'title'),
    ('numeric', SimpleImputer(strategy='mean'), ['start_hour']),
]
live_preprocessor = ColumnTransformer(transformers=live_transformers, remainder='passthrough')

In [None]:
 # Prepare the live events for the trained pipeline.
# The fitted `model` already contains its own preprocessing step, so it must be
# given a DataFrame with the raw training feature columns, not a matrix built
# by a different, separately fitted transformer.
# NOTE(review): assumes events.json records carry the same 'title',
# 'description', 'nation' and 'time' fields as the training file — confirm.
live_time_parts = live_events_df['time'].fillna('').apply(extract_start_end_hour).tolist()

preprocessed_live_events = pd.DataFrame(index=live_events_df.index)
# The text columns are filled with '' exactly as the training frame was.
preprocessed_live_events['title'] = live_events_df['title'].fillna('')
preprocessed_live_events['description'] = live_events_df['description'].fillna('')
preprocessed_live_events['nation'] = live_events_df['nation']
for position, column in enumerate(['start_hour', 'start_minute', 'end_hour', 'end_minute']):
    preprocessed_live_events[column] = [parts[position] for parts in live_time_parts]
# Same duration formula as the training frame, so both sides agree.
preprocessed_live_events['duration'] = (
    (preprocessed_live_events['end_hour'] * 60 + preprocessed_live_events['end_minute'])
    - (preprocessed_live_events['start_hour'] * 60 + preprocessed_live_events['start_minute'])
)
# Order the columns like the training features.
preprocessed_live_events = preprocessed_live_events[features]
print(preprocessed_live_events.head())

# `model` is deliberately NOT rebuilt here: constructing a new Pipeline would
# replace the fitted one with an untrained estimator and make predict() fail.

In [None]:
# Run the classifier on the prepared live events
predicted_categories = model.predict(preprocessed_live_events)

# Store the predictions next to the events they belong to
live_events_df['predicted_category'] = predicted_categories

# Show each title with its predicted category
summary_columns = ['title', 'predicted_category']
print(live_events_df[summary_columns])