In [3]:
import os
import sys
import re
import json
import glob
import datetime
from collections import Counter

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import psycopg2

from nltk.corpus import stopwords
from wordcloud import WordCloud
from datetime import datetime, timedelta

In [4]:
# Move from the notebook's folder up two levels so that the `src` package imported
# in later cells can be found (presumably the project root — confirm).
# Guarded on `src` being visible so that re-running this cell does not climb a
# further two directories on every execution.
if not os.path.isdir('src'):
    os.chdir('../..')

In [5]:
# SQLAlchemy connection URL. DATABASE_URL from the environment takes precedence so
# credentials do not have to be edited into the notebook.
# NOTE(review): the fallback below still embeds a password in source; drop it once
# DATABASE_URL is set wherever this notebook runs.
db_url = os.environ.get(
    "DATABASE_URL",
    "postgresql+psycopg2://postgres:1001@localhost/Week-0_Features",
)

In [6]:
# Shared SQLAlchemy engine used by the database helper below.
# echo=True makes the engine log the statements it issues into the cell output.
engine = create_engine(db_url, echo=True)

In [7]:
def load_dataframe_into_database(df, table_name):
    """Write ``df`` to the table ``table_name`` through the module-level ``engine``.

    An existing table with the same name is replaced, and the DataFrame index is
    written as a column labelled ``id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    table_name : str
        Destination table name.

    NOTE(review): the recorded output of the cell that writes ``messages_count``
    ends with ``AttributeError: 'OptionEngine' object has no attribute 'execute'``;
    presumably an installed pandas / SQLAlchemy version mismatch — confirm.
    """
    write_options = {'if_exists': 'replace', 'index_label': 'id'}
    df.to_sql(name=table_name, con=engine, **write_options)

In [8]:
from src.loader import SlackDataLoader

In [9]:
from src.config import cfg

Output File: slack_data.csv
Path: data\Anonymized_B6SlackExport_25Nov23\anonymized
Channel: data\Anonymized_B6SlackExport_25Nov23\anonymized\channels.json
Userfile: data\Anonymized_B6SlackExport_25Nov23\anonymized\users.json


In [10]:
import src.utils as utils

In [11]:
# Build the loader from cfg.path (presumably the export directory printed as
# `Path:` when cfg was imported — confirm).
slack_data_loader = SlackDataLoader(cfg.path)

In [12]:
# Two lookup tables from the loader; going by their names, id -> name and
# name -> id (verify against SlackDataLoader.get_user_map).
# user_names_by_id is read as a module-level global by the helper functions below.
user_names_by_id, user_ids_by_name = slack_data_loader.get_user_map()

In [13]:
def calculate_user_metrics(messages):
    """Aggregate per-user reply, mention and message counts.

    Messages whose ``user`` id is missing, or is not a key of the module-level
    ``user_names_by_id`` map, are skipped.

    Parameters
    ----------
    messages : iterable of dict
        Message records. The keys read are ``user``, ``reply_count``,
        ``mention_count`` and ``text``.

    Returns
    -------
    dict
        ``{user_name: {'Reply count': int, 'Mention count': int, 'Message count': int}}``.
        Counts are attributed to the author of each message.
    """
    # Matches ``<@ID>`` and ``<@ID|label>`` user mentions inside message text.
    # NOTE(review): assumes Slack's standard mention markup — confirm against the export.
    mention_pattern = re.compile(r'<@[A-Z0-9]+(?:\|[^>]*)?>')
    user_metrics = {}

    for message in messages:
        user_id = message.get('user')
        if user_id is None:
            continue
        user_name = user_names_by_id.get(user_id)
        if user_name is None:
            continue

        # A pre-computed 'mention_count' wins when present; otherwise count the
        # mentions in the text so the metric is not stuck at zero.
        mention_count = message.get('mention_count')
        if mention_count is None:
            mention_count = len(mention_pattern.findall(message.get('text') or ''))

        metrics = user_metrics.setdefault(
            user_name,
            {'Reply count': 0, 'Mention count': 0, 'Message count': 0},
        )
        metrics['Reply count'] += message.get('reply_count') or 0
        metrics['Mention count'] += mention_count
        metrics['Message count'] += 1

    return user_metrics

In [14]:
# Collect the messages of every channel known to the loader into one flat list.
all_channel_messages = []
for channel in slack_data_loader.channels:
    all_channel_messages += slack_data_loader.get_channel_messages(channel['name'])

In [15]:
# Aggregate per-user reply, mention and message counts over every collected message.
user_metrics = calculate_user_metrics(all_channel_messages)

In [16]:
# One row per user (the dict keys become the index), one column per metric.
user_metrics_df = pd.DataFrame.from_dict(user_metrics, orient='index')

In [17]:
# Persist the per-user metrics. The helper defined in this notebook is
# `load_dataframe_into_database`; the previous name was never defined and
# raises NameError on a fresh kernel.
load_dataframe_into_database(user_metrics_df, 'messages_count')

2023-11-30 19:57:37,505 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-11-30 19:57:37,507 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-11-30 19:57:37,511 INFO sqlalchemy.engine.Engine select current_schema()
2023-11-30 19:57:37,512 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-11-30 19:57:37,515 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-11-30 19:57:37,517 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-11-30 19:57:37,529 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-11-30 19:57:37,530 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

AttributeError: 'OptionEngine' object has no attribute 'execute'

In [None]:
# Ten users with the highest reply counts, and ten with the lowest.
top_users_reply = user_metrics_df.sort_values('Reply count', ascending=False).iloc[:10]
bottom_users_reply = user_metrics_df.sort_values('Reply count', ascending=True).iloc[:10]

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_users_reply

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
bottom_users_reply

In [None]:
# Ten users with the highest mention counts, and ten with the lowest.
top_users_mention = user_metrics_df.sort_values('Mention count', ascending=False).iloc[:10]
bottom_users_mention = user_metrics_df.sort_values('Mention count', ascending=True).iloc[:10]

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_users_mention

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
bottom_users_mention

In [None]:
# Ten users with the highest message counts, and ten with the lowest.
top_users_message = user_metrics_df.sort_values('Message count', ascending=False).iloc[:10]
bottom_users_message = user_metrics_df.sort_values('Message count', ascending=True).iloc[:10]

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_users_message

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
bottom_users_message

In [None]:
# Per-user tally of reaction entries attached to the messages each user wrote.
def calculate_reaction_count(messages):
    """Sum, per author, the number of entries in each message's ``reactions`` list.

    Messages without a ``user`` id, or whose id is not a key of the module-level
    ``user_names_by_id`` map, are ignored. Authors whose messages carry no
    reactions still appear, with a total of 0.

    Parameters
    ----------
    messages : iterable of dict
        Message records; the keys read are ``user`` and ``reactions``.

    Returns
    -------
    dict
        ``{user_name: int}``.

    NOTE(review): this counts entries of the ``reactions`` list, not any
    per-entry total such a record might hold — confirm that is the intended metric.
    """
    totals = {}

    for msg in messages:
        author_id = msg.get('user')
        if author_id is None:
            continue

        author = user_names_by_id.get(author_id)
        if author is None:
            continue

        totals[author] = totals.get(author, 0) + len(msg.get('reactions', []))

    return totals

In [None]:
# Tally reactions per user, then turn the {user_name: total} dict into a
# one-column frame indexed by user name.
reaction_count = calculate_reaction_count(all_channel_messages)
reaction_count_df = pd.DataFrame.from_dict(reaction_count, orient='index', columns=['Reaction count'])

In [None]:
# Ten users with the highest reaction counts, and ten with the lowest.
top_users_reaction = reaction_count_df.sort_values('Reaction count', ascending=False).iloc[:10]
bottom_users_reaction = reaction_count_df.sort_values('Reaction count', ascending=True).iloc[:10]

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_users_reaction

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
bottom_users_reaction

In [None]:
# Function to extract messages with their metadata
def extract_messages_with_metadata(messages):
    """Flatten message records into one row-dict per message.

    Parameters
    ----------
    messages : iterable of dict
        Message records. The keys read are ``user``, ``ts``, ``text``,
        ``reply_count``, ``reactions`` and ``mention_count``.

    Returns
    -------
    list of dict
        One dict per input message, in input order, with keys ``User``,
        ``Timestamp``, ``Text``, ``Reply count``, ``Reaction count`` and
        ``Mention count``. ``User`` is looked up in the module-level
        ``user_names_by_id`` map and is ``None`` when the id is unknown.
    """
    # Matches ``<@ID>`` and ``<@ID|label>`` user mentions inside message text.
    # NOTE(review): assumes Slack's standard mention markup — confirm against the export.
    mention_pattern = re.compile(r'<@[A-Z0-9]+(?:\|[^>]*)?>')
    extracted_messages = []

    for message in messages:
        # Named message_text (not `text`) so the sqlalchemy `text` import is not shadowed.
        message_text = message.get('text', '')

        # A pre-computed 'mention_count' wins when present; otherwise count the
        # mentions in the text so the column is not stuck at zero.
        mention_count = message.get('mention_count')
        if mention_count is None:
            mention_count = len(mention_pattern.findall(message_text or ''))

        extracted_messages.append({
            'User': user_names_by_id.get(message.get('user')),
            'Timestamp': message.get('ts'),
            'Text': message_text,
            'Reply count': message.get('reply_count', 0),
            'Reaction count': len(message.get('reactions', [])),
            'Mention count': mention_count
        })

    return extracted_messages

In [None]:
# One flat record per collected message (user, timestamp, text and the three counts).
messages_with_metadata = extract_messages_with_metadata(all_channel_messages)

In [None]:
# Tabular view of the per-message records; columns come from the record keys.
messages_df = pd.DataFrame(messages_with_metadata)

In [None]:
# Ten highest-ranked messages under each of the three per-message counts.
top_messages_by_replies = messages_df.sort_values('Reply count', ascending=False).iloc[:10]
top_messages_by_reactions = messages_df.sort_values('Reaction count', ascending=False).iloc[:10]
top_messages_by_mentions = messages_df.sort_values('Mention count', ascending=False).iloc[:10]

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_messages_by_replies

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_messages_by_reactions

In [None]:
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_messages_by_mentions

In [None]:
# Per-channel activity summary: message volume and combined replies + reactions.
def calculate_channel_activity(channels):
    """Summarise message volume and engagement for each channel.

    For every entry in ``channels`` the messages are fetched through the
    module-level ``slack_data_loader`` using the entry's ``name``.

    Parameters
    ----------
    channels : iterable of dict
        Channel records; only the ``name`` key is read.

    Returns
    -------
    dict
        ``{channel_name: {'Message count': int, 'Reply and Reaction count': int}}``
        where the second value adds each message's ``reply_count`` (0 when
        absent) to the length of its ``reactions`` list (empty when absent).
    """
    activity_by_channel = {}

    for entry in channels:
        name = entry['name']
        msgs = slack_data_loader.get_channel_messages(name)

        total_messages = len(msgs)
        replies_total = 0
        reactions_total = 0
        for msg in msgs:
            replies_total += msg.get('reply_count', 0)
            reactions_total += len(msg.get('reactions', []))

        activity_by_channel[name] = {
            'Message count': total_messages,
            'Reply and Reaction count': replies_total + reactions_total
        }

    return activity_by_channel

In [None]:
# Message volume and combined reply + reaction totals for every loader channel.
channel_activity = calculate_channel_activity(slack_data_loader.channels)

In [None]:
# One row per channel (channel names become the index), one column per metric.
channel_activity_df = pd.DataFrame.from_dict(channel_activity, orient='index')

In [None]:
# Index label (channel name) of the row with the largest combined reply + reaction total.
most_active_channel = channel_activity_df['Reply and Reaction count'].idxmax()

In [None]:
# Print the index label selected as the most active channel.
print(most_active_channel)

In [None]:
# Scatter of per-channel message volume against combined replies + reactions.
# Points are coloured by the channel's row position in channel_activity_df.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    channel_activity_df['Message count'],
    channel_activity_df['Reply and Reaction count'],
    c=range(len(channel_activity_df)),
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Channel Index')
ax.set_title('2D Scatter Plot of Channel Activity')
ax.set_xlabel('Number of Messages')
ax.set_ylabel('Sum of Replies and Reactions')
ax.grid(True)
plt.show()

In [None]:
# Signed gap between two timestamps given in seconds, expressed in minutes.
def calculate_time_difference_unix(timestamp1, timestamp2):
    """Return ``timestamp2 - timestamp1`` converted from seconds to minutes.

    Both arguments are passed through ``float()``, so numeric strings are
    accepted. The result is negative when ``timestamp2`` precedes ``timestamp1``.
    """
    later_seconds = float(timestamp2)
    earlier_seconds = float(timestamp1)
    elapsed_seconds = later_seconds - earlier_seconds
    return elapsed_seconds / 60.0

In [None]:
# Function to calculate the fraction of messages replied within the first 5 minutes
def calculate_fraction_replied_within_5_minutes(messages, threshold_minutes=5):
    """Return the share of messages whose first listed reply came quickly.

    A message counts when it has a non-empty ``replies`` list and the gap
    between its ``ts`` and ``replies[0]['ts']`` is at most
    ``threshold_minutes``. The denominator is the total number of messages,
    including those without replies.

    Parameters
    ----------
    messages : sequence of dict
        Message records; the keys read are ``ts`` and ``replies``.
    threshold_minutes : float, optional
        Maximum gap, in minutes, for a reply to count. Defaults to 5.

    Returns
    -------
    float
        Fraction in ``[0, 1]``; ``0.0`` when ``messages`` is empty instead of
        raising ``ZeroDivisionError``.
    """
    total_messages = len(messages)
    if total_messages == 0:
        return 0.0

    replied_in_time = 0
    for message in messages:
        replies = message.get('replies') or []
        if not replies:
            continue

        # NOTE(review): treats replies[0] as the earliest reply — confirm the list is chronological.
        message_timestamp = message.get('ts')
        first_reply_timestamp = replies[0].get('ts')
        if message_timestamp is None or first_reply_timestamp is None:
            # Without both timestamps the gap cannot be measured; do not count it.
            continue

        time_difference = calculate_time_difference_unix(message_timestamp, first_reply_timestamp)
        if time_difference <= threshold_minutes:
            replied_in_time += 1

    return replied_in_time / total_messages


In [None]:
# Share of all collected messages whose first listed reply arrived within five minutes.
fraction_replied_within_5_minutes = calculate_fraction_replied_within_5_minutes(all_channel_messages)

In [None]:
# The `.2%` format spec renders the fraction as a percentage with two decimals.
print(f"\nThe fraction of messages replied within the first 5 minutes is: {fraction_replied_within_5_minutes:.2%}")

In [None]:
# Function to extract data for the 2D scatter plot
def extract_scatter_plot_data(messages):
    """Build one scatter-plot record per message that has at least one reply.

    Parameters
    ----------
    messages : iterable of dict
        Message records; the keys read are ``ts``, ``replies`` and ``channel``.

    Returns
    -------
    list of dict
        Records with keys ``Time Difference (minutes)`` (gap between the
        message and ``replies[0]``), ``Time of Day (hours)`` and
        ``Channel Index`` (position of the message's channel in
        ``slack_data_loader.channels``, or ``-1`` when it cannot be matched).
        Messages without replies, or missing either timestamp, are skipped.
    """
    # Resolve channel id -> list position once instead of scanning the channel
    # list for every message. setdefault keeps the first position for a repeated id.
    channel_index_by_id = {}
    for position, channel in enumerate(slack_data_loader.channels):
        channel_index_by_id.setdefault(channel.get('id'), position)

    scatter_plot_data = []

    for message in messages:
        replies = message.get('replies') or []
        if not replies:
            continue

        message_ts = message.get('ts')
        first_reply_ts = replies[0].get('ts')
        if message_ts is None or first_reply_ts is None:
            continue

        timestamp = float(message_ts)
        time_difference = calculate_time_difference_unix(timestamp, float(first_reply_ts))
        # Seconds since midnight, expressed in hours.
        # NOTE(review): assumes `ts` is Unix epoch seconds, so this is UTC clock
        # time rather than the poster's local time — confirm.
        time_of_day = (timestamp % (24 * 3600)) / 3600.0

        # Use channel index as color.
        # NOTE(review): relies on each message carrying a 'channel' id; when the
        # key is absent every point gets -1 — verify against the loader output.
        channel_id = message.get('channel')
        if channel_id is None:
            channel_index = -1
        else:
            channel_index = channel_index_by_id.get(channel_id, -1)

        scatter_plot_data.append({
            'Time Difference (minutes)': time_difference,
            'Time of Day (hours)': time_of_day,
            'Channel Index': channel_index
        })

    return scatter_plot_data

In [None]:
# Build the per-message records (reply delay, time of day, channel index) for the scatter plot.
scatter_plot_data = extract_scatter_plot_data(all_channel_messages)

In [None]:
# Tabular form of the scatter records; columns come from the record keys.
scatter_plot_df = pd.DataFrame(scatter_plot_data)

In [None]:
# Scatter of first-reply delay against the time of day the message was posted,
# coloured by channel index.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    scatter_plot_df['Time Difference (minutes)'],
    scatter_plot_df['Time of Day (hours)'],
    c=scatter_plot_df['Channel Index'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(scatter, ax=ax, label='Channel Index')
ax.set_title('2D Scatter Plot: Time Difference vs. Time of Day')
ax.set_xlabel('Time Difference (minutes)')
ax.set_ylabel('Time of Day (24hr format)')
ax.grid(True)
plt.show()