In [4]:
import pandas as pd
from timezonefinder import TimezoneFinder
from datetime import datetime

# Load the event log; the file is newline-delimited JSON (one record per line).
file_path = '/Users/uyakut/Desktop/group_project/eventsim/output/listen_events.json'
df = pd.read_json(file_path, lines=True)

# Integer-divide 'ts' by 1000 to get whole seconds
# (presumably 'ts' is epoch milliseconds — confirm against the data producer).
df['ts_seconds'] = df['ts'] // 1000

# Order events chronologically.
df = df.sort_values('ts_seconds')

# Bucket events into 60-second windows counted from the earliest event.
df['group'] = (df['ts_seconds'] - df['ts_seconds'].min()) // 60

# Number of distinct (non-empty) one-minute windows.
num_groups = df['group'].nunique()

# Upper bound on the number of output files.
max_files = 500
# Ceiling division: the smallest number of groups per file that keeps the
# file count within max_files. The previous `num_groups // max_files + 1`
# overshot by one whenever num_groups was an exact multiple of max_files.
# max(1, ...) keeps the batch size valid when there are no groups at all.
groups_per_file = max(1, -(-num_groups // max_files))

# Group the dataframe by minute window.
groups = df.groupby('group')

# Full US state name -> timezone name (simplified: exactly one zone per state).
# Built once at import time rather than on every call, since the lookup is
# applied row by row.
_STATE_TIMEZONES = {
    'Alabama': 'US/Central', 'Alaska': 'US/Alaska', 'Arizona': 'US/Arizona',
    'Arkansas': 'US/Central', 'California': 'US/Pacific', 'Colorado': 'US/Mountain',
    'Connecticut': 'US/Eastern', 'Delaware': 'US/Eastern', 'Florida': 'US/Eastern',
    'Georgia': 'US/Eastern', 'Hawaii': 'US/Hawaii', 'Idaho': 'US/Mountain',
    'Illinois': 'US/Central', 'Indiana': 'US/Eastern', 'Iowa': 'US/Central',
    'Kansas': 'US/Central', 'Kentucky': 'US/Eastern', 'Louisiana': 'US/Central',
    'Maine': 'US/Eastern', 'Maryland': 'US/Eastern', 'Massachusetts': 'US/Eastern',
    'Michigan': 'US/Eastern', 'Minnesota': 'US/Central', 'Mississippi': 'US/Central',
    'Missouri': 'US/Central', 'Montana': 'US/Mountain', 'Nebraska': 'US/Central',
    'Nevada': 'US/Pacific', 'New Hampshire': 'US/Eastern', 'New Jersey': 'US/Eastern',
    'New Mexico': 'US/Mountain', 'New York': 'US/Eastern', 'North Carolina': 'US/Eastern',
    'North Dakota': 'US/Central', 'Ohio': 'US/Eastern', 'Oklahoma': 'US/Central',
    'Oregon': 'US/Pacific', 'Pennsylvania': 'US/Eastern', 'Rhode Island': 'US/Eastern',
    'South Carolina': 'US/Eastern', 'South Dakota': 'US/Central', 'Tennessee': 'US/Central',
    'Texas': 'US/Central', 'Utah': 'US/Mountain', 'Vermont': 'US/Eastern',
    'Virginia': 'US/Eastern', 'Washington': 'US/Pacific', 'West Virginia': 'US/Eastern',
    'Wisconsin': 'US/Central', 'Wyoming': 'US/Mountain',
}

# USPS two-letter abbreviation -> full state name, so data that stores 'CA'
# instead of 'California' resolves to the same zone.
_STATE_ABBREVIATIONS = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming',
}


def get_timezone_from_state(state):
    """Return the timezone name for a US state, or 'Unknown'.

    Args:
        state: Full state name (e.g. 'California') or two-letter USPS
            abbreviation in any case (e.g. 'CA', 'ca'). Surrounding
            whitespace is ignored.

    Returns:
        A timezone name string such as 'US/Pacific', or 'Unknown' when the
        value is not a string (None, NaN, ...) or is not a recognised state.
    """
    # Missing values arrive as None/NaN when applied over a DataFrame column.
    if not isinstance(state, str):
        return 'Unknown'

    name = state.strip()
    # Exact full-name match first, so every previously working input
    # resolves exactly as before.
    if name in _STATE_TIMEZONES:
        return _STATE_TIMEZONES[name]

    # Fall back to the abbreviation table; an unmatched code yields None,
    # which is not a key of the timezone table and so maps to 'Unknown'.
    full_name = _STATE_ABBREVIATIONS.get(name.upper())
    return _STATE_TIMEZONES.get(full_name, 'Unknown')

# Function to classify the user agent as mobile or desktop
def classify_device(user_agent):
    """Classify a user-agent string as 'Mobile', 'Desktop' or 'Unknown'.

    Args:
        user_agent: The raw user-agent string. Non-string values (e.g. the
            None/NaN that stand in for missing data) are accepted.

    Returns:
        'Mobile' if any mobile keyword occurs in the string, otherwise
        'Desktop' if any desktop keyword occurs, otherwise 'Unknown'.
        Non-string input always yields 'Unknown'.
    """
    # A substring test on None/NaN raises TypeError; treat missing or
    # malformed values as unclassifiable instead of crashing the whole apply.
    if not isinstance(user_agent, str):
        return 'Unknown'

    mobile_keywords = ('Mobile', 'Android', 'iPhone', 'iPad', 'Windows Phone', 'BlackBerry')
    desktop_keywords = ('Windows NT', 'Macintosh', 'Linux', 'Chrome OS')

    # Mobile is checked first: a string matching both lists (for example one
    # containing both 'Android' and 'Linux') is classified as 'Mobile'.
    if any(keyword in user_agent for keyword in mobile_keywords):
        return 'Mobile'
    if any(keyword in user_agent for keyword in desktop_keywords):
        return 'Desktop'
    return 'Unknown'

# Enrich every event with a timezone (looked up from its 'state' value) and a
# device class (derived from its 'userAgent' string).
df['timezone'] = df['state'].apply(get_timezone_from_state)
df['device_type'] = df['userAgent'].apply(classify_device)


def _save_batch(frames, file_number):
    """Write the concatenated frames to group_<file_number>.json as JSON lines."""
    output_file = f'group_{file_number}.json'
    # 'group' is only a bucketing helper column, so it is dropped from the output.
    pd.concat(frames).drop(columns=['group']).to_json(output_file, orient='records', lines=True)
    print(f"Saved: {output_file}")


# Write the minute groups out in batches of `groups_per_file` groups per file.
current_file = 1
group_list = []

# Group here, after the enrichment columns exist, so the written records do
# not depend on a GroupBy object that was built before they were added.
for _, group_data in df.groupby('group'):
    group_list.append(group_data)

    if len(group_list) >= groups_per_file:
        _save_batch(group_list, current_file)
        current_file += 1
        group_list = []

    # Never produce more than max_files files.
    if current_file > max_files:
        break

# Flush the final partial batch. Without this, any groups left over after
# the last full batch were silently dropped. The batch list is empty after
# a `break` (it is reset right before the cap check), so the file cap holds.
if group_list:
    _save_batch(group_list, current_file)


ModuleNotFoundError: No module named 'timezonefinder'