## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))

In [3]:
df = pd.read_json('../data/result.json')

In [4]:
df.head()

Unnamed: 0,name,type,id,messages
0,Vent Here,public_channel,1137849815,"{'id': 1, 'type': 'service', 'date': '2017-05-..."
1,Vent Here,public_channel,1137849815,"{'id': 2, 'type': 'message', 'date': '2017-05-..."
2,Vent Here,public_channel,1137849815,"{'id': 3, 'type': 'message', 'date': '2017-05-..."
3,Vent Here,public_channel,1137849815,"{'id': 4, 'type': 'message', 'date': '2017-05-..."
4,Vent Here,public_channel,1137849815,"{'id': 5, 'type': 'message', 'date': '2017-05-..."


In [5]:
df_message = df[['messages']]

In [6]:
df_message.head()

Unnamed: 0,messages
0,"{'id': 1, 'type': 'service', 'date': '2017-05-..."
1,"{'id': 2, 'type': 'message', 'date': '2017-05-..."
2,"{'id': 3, 'type': 'message', 'date': '2017-05-..."
3,"{'id': 4, 'type': 'message', 'date': '2017-05-..."
4,"{'id': 5, 'type': 'message', 'date': '2017-05-..."


In [7]:
df_message.iloc[1]['messages']

{'id': 2,
 'type': 'message',
 'date': '2017-05-27T17:36:54',
 'date_unixtime': '1495895814',
 'edited': '2023-01-19T02:15:20',
 'edited_unixtime': '1674083720',
 'from': 'Vent Here',
 'from_id': 'channel1137849815',
 'text': ["A place to ask for advice, look for comfort or simply ramble on about what's on your mind with no judgement.\n\nSend ur thoughts to your admin: ",
  {'type': 'mention', 'text': '@unihorse'}],
 'text_entities': [{'type': 'plain',
   'text': "A place to ask for advice, look for comfort or simply ramble on about what's on your mind with no judgement.\n\nSend ur thoughts to your admin: "},
  {'type': 'mention', 'text': '@unihorse'}]}

In [8]:
# Extract the keys and values from the 'messages' column
messages_list = df_message['messages'].apply(pd.Series)

# Create a new dataframe from the extracted keys and values
df_messages_expanded = pd.DataFrame(messages_list)

# Display the new dataframe
df_messages_expanded.head()

Unnamed: 0,id,type,date,date_unixtime,actor,actor_id,action,title,text,text_entities,...,sticker_emoji,mime_type,width,height,duration_seconds,message_id,photo,forwarded_from,performer,poll
0,1,service,2017-05-27T17:17:47,1495894667,Vent Here,channel1137849815,create_channel,Vent Here,,[],...,,,,,,,,,,
1,2,message,2017-05-27T17:36:54,1495895814,,,,,"[A place to ask for advice, look for comfort o...","[{'type': 'plain', 'text': 'A place to ask for...",...,,,,,,,,,,
2,3,message,2017-05-27T18:37:51,1495899471,,,,,Hey unihorse 🐴.\nHide my identity.\nI have a c...,"[{'type': 'plain', 'text': 'Hey unihorse 🐴. Hi...",...,,,,,,,,,,
3,4,message,2017-05-27T18:48:49,1495900129,,,,,Hey unihorse 🐴\nHide my identity.\nI got a con...,"[{'type': 'plain', 'text': 'Hey unihorse 🐴 Hid...",...,,,,,,,,,,
4,5,message,2017-05-28T05:44:35,1495939475,,,,,Hey unihorse 🐴.\nPlease hide my identity.\nHer...,"[{'type': 'plain', 'text': 'Hey unihorse 🐴. Pl...",...,,,,,,,,,,


In [9]:
df_filtered = df_messages_expanded.set_index('id')[['date', 'text']]
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2017-05-27T17:17:47,
2,2017-05-27T17:36:54,"[A place to ask for advice, look for comfort o..."
3,2017-05-27T18:37:51,Hey unihorse 🐴.\nHide my identity.\nI have a c...
4,2017-05-27T18:48:49,Hey unihorse 🐴\nHide my identity.\nI got a con...
5,2017-05-28T05:44:35,Hey unihorse 🐴.\nPlease hide my identity.\nHer...


In [10]:
df_filtered = df_filtered.drop([1, 2])
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,Hey unihorse 🐴.\nHide my identity.\nI have a c...
4,2017-05-27T18:48:49,Hey unihorse 🐴\nHide my identity.\nI got a con...
5,2017-05-28T05:44:35,Hey unihorse 🐴.\nPlease hide my identity.\nHer...
6,2017-05-28T08:36:58,Hey unihorse 🐴.\nHide my identity.\nJust venti...
8,2017-05-28T09:55:31,Hey unihorse 🐴.\nHide my identity.\nIm so rich...


## Removing Emojis from the text column

In [11]:
import re

def remove_emojis(text):
    if not isinstance(text, str):
        return text
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df_filtered['text'] = df_filtered['text'].apply(remove_emojis)
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,Hey unihorse .\nHide my identity.\nI have a co...
4,2017-05-27T18:48:49,Hey unihorse \nHide my identity.\nI got a conf...
5,2017-05-28T05:44:35,Hey unihorse .\nPlease hide my identity.\nHere...
6,2017-05-28T08:36:58,Hey unihorse .\nHide my identity.\nJust ventin...
8,2017-05-28T09:55:31,Hey unihorse .\nHide my identity.\nIm so rich....


In [12]:
df_filtered['text'] = df_filtered['text'].str.replace(r'Hey unihorse \.\nHide my identity\.\n', '', regex=True)
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,I have a confession to make.\nI am a very free...
4,2017-05-27T18:48:49,Hey unihorse \nHide my identity.\nI got a conf...
5,2017-05-28T05:44:35,Hey unihorse .\nPlease hide my identity.\nHere...
6,2017-05-28T08:36:58,Just venting:\nTrying to set your friend up wi...
8,2017-05-28T09:55:31,"Im so rich. Im not bragging, im just so rich, ..."


In [13]:
df_filtered['text'] = df_filtered['text'].str.replace(r'Hey unihorse \nHide my identity\.\n', '', regex=True)
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,I have a confession to make.\nI am a very free...
4,2017-05-27T18:48:49,I got a confession to make am a dude with diff...
5,2017-05-28T05:44:35,Hey unihorse .\nPlease hide my identity.\nHere...
6,2017-05-28T08:36:58,Just venting:\nTrying to set your friend up wi...
8,2017-05-28T09:55:31,"Im so rich. Im not bragging, im just so rich, ..."


In [16]:
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download stopwords and punkt tokenizer
nltk.download('stopwords')
nltk.download('punkt')

# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess(text):
    if not isinstance(text, str):
        return []
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

df_filtered['tokens'] = df_filtered['text'].apply(preprocess)

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(df_filtered['tokens'])
corpus = [dictionary.doc2bow(tokens) for tokens in df_filtered['tokens']]

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)

# Assign topics to each document
def get_topic(text):
    bow = dictionary.doc2bow(preprocess(text))
    topics = lda_model.get_document_topics(bow)
    if topics:
        return max(topics, key=lambda x: x[1])[0]
    return None

df_filtered['topic'] = df_filtered['text'].apply(get_topic)
df_filtered.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Admin/nltk_data'
    - 'c:\\Templates\\vent\\venv\\nltk_data'
    - 'c:\\Templates\\vent\\venv\\share\\nltk_data'
    - 'c:\\Templates\\vent\\venv\\lib\\nltk_data'
    - 'C:\\Users\\Admin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
