In [17]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from pprint import pprint
import string
import os
import re
import sys

In [None]:
# Project import bootstrap.
# The `src` package lives one directory above the notebook's working directory,
# so that parent directory is placed at the front of the module search path
# (only once, even if this cell is executed again).
rpath = os.path.abspath(os.pardir)
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# Project-local helpers, resolvable only after the path tweak above.
from src.loader import SlackDataLoader
import src.utils as utils

In [None]:
path = './../../../anonymized/'
class DataLoader:
    """Collect the messages of every Slack channel under one root directory.

    Parameters
    ----------
    path : str
        Root directory of the Slack export. It is handed to
        ``SlackDataLoader`` and is also the parent of the per-channel
        directories that get parsed.
    """
    def __init__(self, path):
        self.path = path
    def load_data(self):
        """Parse each channel directory and stack the results.

        Channel names come from ``SlackDataLoader.get_channels()`` (each entry
        is indexed by ``'name'``); every ``<root>/<name>/`` directory is passed
        to ``SlackDataLoader.slack_parser``.

        Returns
        -------
        The ``pd.concat`` of the per-channel parser results, in channel order.
        The original index of each part is kept (no ``ignore_index``).

        Raises
        ------
        ValueError
            Raised by ``pd.concat`` when the export lists no channels.
        """
        loader = SlackDataLoader(self.path)
        channel_names = [channel['name'] for channel in loader.get_channels()]

        # Build the channel directory from this instance's root, not from the
        # module-level `path`, so a loader created for another export reads
        # from its own directory. The trailing '' keeps the closing separator
        # that was previously appended by hand.
        frames = [
            loader.slack_parser(os.path.join(self.path, name, ''))
            for name in channel_names
        ]

        return pd.concat(frames)
    

In [None]:
# Build a loader rooted at `path` and read every channel into one frame.
# `df` is the combined message table used by the cells that follow.
data_loader_obj = DataLoader(path)
df = data_loader_obj.load_data()


In [None]:
# Displays a copy of `df` with rows containing any missing value removed.
# The result is not assigned, so `df` itself is left unchanged.
# NOTE(review): if the intent was to clean `df`, restrict the drop with
# `subset=[...]` first - the `channel` column looks empty in the sample
# output, so an unrestricted drop may discard every row. Confirm.
df.dropna()

In [14]:
# A cell renders only its last expression, so a bare `len(df)` placed before
# `df.head()` is evaluated and thrown away. Print the row count explicitly.
print(len(df))
df.head()

Unnamed: 0,msg_type,msg_content,sender_name,msg_sent_time,msg_dist_type,time_thread_start,reply_count,reply_users_count,reply_users,tm_thread_end,channel
0,message,same! what's got a lot of holes but still hold...,Elizabeth Hall,1661420476.890989,text,1661343150.595419,0,0,0,0.0,
1,message,<@U03TEPYRM2P>,Nancy Craig,1661423005.712039,user,1661423005.712039,1,1,U03TEPYRM2P,1661423028.780879,
2,message,I am with you,Garrett Bell,1661423028.780879,text,1661423005.712039,0,0,0,0.0,
3,message,*<!here> Community Building Session REMINDER!*...,Vanessa Norman,1661428200.705819,broadcast,0.0,0,0,0,0.0,
4,message,Join the Call,Vanessa Norman,1661428630.743209,text,0.0,0,0,0,0.0,


In [None]:
# Convert 'msg_sent_time' to a calendar date.
# The column stores Unix epoch *seconds* (e.g. 1661420476.890989). pandas reads
# bare numbers as nanoseconds unless a unit is given, which would place every
# message on 1970-01-01, so the unit is stated explicitly. `to_numeric` also
# covers timestamps that were loaded as strings.
# NOTE: this overwrites the raw epoch column (as before), so reload `df`
# before running this cell a second time.
df['msg_sent_time'] = pd.to_datetime(
    pd.to_numeric(df['msg_sent_time']), unit='s'
).dt.date

# Rows without a timestamp or without text cannot be placed on the timeline,
# and a NaN message would make ' '.join raise a TypeError. Casting to str
# guards against any remaining non-string content.
messages = df.dropna(subset=['msg_sent_time', 'msg_content'])
messages = messages.assign(msg_content=messages['msg_content'].astype(str))

# Group by 'msg_sent_time' and concatenate all messages of the same day
df_grouped = (
    messages.groupby('msg_sent_time')['msg_content']
    .apply(' '.join)
    .reset_index()
)

# Preprocess: strip stop words from each day's combined text
df_grouped['msg_content'] = df_grouped['msg_content'].apply(remove_stopwords)

# Vectorize the per-day text: one row per day, one column per token count
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_grouped['msg_content'])

# Train a logistic regression model
# NOTE(review): the target passed to fit() is 'msg_sent_time' - the same day
# the rows were grouped by - not a sentiment label. No sentiment column is
# used anywhere in this cell, so the model learns "which day is this text
# from". A labelled sentiment column or a sentiment scorer is needed before
# this step is meaningful.
# NOTE(review): scikit-learn may also reject `datetime.date` objects as an
# unknown label type - TODO confirm on the installed version.
model = LogisticRegression()
model.fit(X, df_grouped['msg_sent_time'])

# Predict on the same matrix the model was trained on (no held-out data)
predictions = model.predict(X)

# Pair each day with the model's prediction for that day.
# The 'sentiment' column therefore holds predicted day labels (see the note
# on fit() above), not sentiment scores.
df_predictions = pd.DataFrame({'date': df_grouped['msg_sent_time'], 'sentiment': predictions})

# Plot the predicted value per day as a line chart
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_predictions, x='date', y='sentiment')
plt.title('Time Series Trend of Sentiments')
plt.show()