## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))

In [3]:
from preprocessing import *

## Read the data

In [4]:
df = pd.read_json('../data/result.json')

In [5]:
df.head()

Unnamed: 0,name,type,id,messages
0,Vent Here,public_channel,1137849815,"{'id': 1, 'type': 'service', 'date': '2017-05-..."
1,Vent Here,public_channel,1137849815,"{'id': 2, 'type': 'message', 'date': '2017-05-..."
2,Vent Here,public_channel,1137849815,"{'id': 3, 'type': 'message', 'date': '2017-05-..."
3,Vent Here,public_channel,1137849815,"{'id': 4, 'type': 'message', 'date': '2017-05-..."
4,Vent Here,public_channel,1137849815,"{'id': 5, 'type': 'message', 'date': '2017-05-..."


## The message Column is the one we are interested in.

In [6]:
df_message = df[['messages']]

In [7]:
df_message.head()

Unnamed: 0,messages
0,"{'id': 1, 'type': 'service', 'date': '2017-05-..."
1,"{'id': 2, 'type': 'message', 'date': '2017-05-..."
2,"{'id': 3, 'type': 'message', 'date': '2017-05-..."
3,"{'id': 4, 'type': 'message', 'date': '2017-05-..."
4,"{'id': 5, 'type': 'message', 'date': '2017-05-..."


## The target Column is not clean, we will clean it.

### Let's understand it first

In [8]:
df_message.iloc[1]['messages']

{'id': 2,
 'type': 'message',
 'date': '2017-05-27T17:36:54',
 'date_unixtime': '1495895814',
 'edited': '2023-01-19T02:15:20',
 'edited_unixtime': '1674083720',
 'from': 'Vent Here',
 'from_id': 'channel1137849815',
 'text': ["A place to ask for advice, look for comfort or simply ramble on about what's on your mind with no judgement.\n\nSend ur thoughts to your admin: ",
  {'type': 'mention', 'text': '@unihorse'}],
 'text_entities': [{'type': 'plain',
   'text': "A place to ask for advice, look for comfort or simply ramble on about what's on your mind with no judgement.\n\nSend ur thoughts to your admin: "},
  {'type': 'mention', 'text': '@unihorse'}]}

### convert the column's dictionary to a dataframe(first to series then to dataframe)

In [9]:
# Extract the keys and values from the 'messages' column
messages_list = df_message['messages'].apply(pd.Series)

# Create a new dataframe from the extracted keys and values
df_messages_expanded = pd.DataFrame(messages_list)

# Display the new dataframe
df_messages_expanded.head()

Unnamed: 0,id,type,date,date_unixtime,actor,actor_id,action,title,text,text_entities,...,sticker_emoji,mime_type,width,height,duration_seconds,message_id,photo,forwarded_from,performer,poll
0,1,service,2017-05-27T17:17:47,1495894667,Vent Here,channel1137849815,create_channel,Vent Here,,[],...,,,,,,,,,,
1,2,message,2017-05-27T17:36:54,1495895814,,,,,"[A place to ask for advice, look for comfort o...","[{'type': 'plain', 'text': 'A place to ask for...",...,,,,,,,,,,
2,3,message,2017-05-27T18:37:51,1495899471,,,,,Hey unihorse 🐴.\nHide my identity.\nI have a c...,"[{'type': 'plain', 'text': 'Hey unihorse 🐴. Hi...",...,,,,,,,,,,
3,4,message,2017-05-27T18:48:49,1495900129,,,,,Hey unihorse 🐴\nHide my identity.\nI got a con...,"[{'type': 'plain', 'text': 'Hey unihorse 🐴 Hid...",...,,,,,,,,,,
4,5,message,2017-05-28T05:44:35,1495939475,,,,,Hey unihorse 🐴.\nPlease hide my identity.\nHer...,"[{'type': 'plain', 'text': 'Hey unihorse 🐴. Pl...",...,,,,,,,,,,


### we are only interested in the id, date and text columns for now

In [10]:
df_filtered = df_messages_expanded.set_index('id')[['date', 'text']]
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2017-05-27T17:17:47,
2,2017-05-27T17:36:54,"[A place to ask for advice, look for comfort o..."
3,2017-05-27T18:37:51,Hey unihorse 🐴.\nHide my identity.\nI have a c...
4,2017-05-27T18:48:49,Hey unihorse 🐴\nHide my identity.\nI got a con...
5,2017-05-28T05:44:35,Hey unihorse 🐴.\nPlease hide my identity.\nHer...


### the first two rows are not important, so we will drop them

In [11]:
df_filtered = df_filtered.drop([1, 2])
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,Hey unihorse 🐴.\nHide my identity.\nI have a c...
4,2017-05-27T18:48:49,Hey unihorse 🐴\nHide my identity.\nI got a con...
5,2017-05-28T05:44:35,Hey unihorse 🐴.\nPlease hide my identity.\nHer...
6,2017-05-28T08:36:58,Hey unihorse 🐴.\nHide my identity.\nJust venti...
8,2017-05-28T09:55:31,Hey unihorse 🐴.\nHide my identity.\nIm so rich...


### Now we remove emojis from the text column

In [12]:
remove_emojis(df_filtered, 'text')
df_filtered.head()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,Hey unihorse .\nHide my identity.\nI have a co...
4,2017-05-27T18:48:49,Hey unihorse \nHide my identity.\nI got a conf...
5,2017-05-28T05:44:35,Hey unihorse .\nPlease hide my identity.\nHere...
6,2017-05-28T08:36:58,Hey unihorse .\nHide my identity.\nJust ventin...
8,2017-05-28T09:55:31,Hey unihorse .\nHide my identity.\nIm so rich....


### Now let's try to remove the first two sentences from the text column as they all are the same

In [13]:
df_filtered = remove_first_two_sentences(df_filtered, 'text')
df_filtered.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,I have a confession to make. I am a very free ...
4,2017-05-27T18:48:49,Or a lover?
5,2017-05-28T05:44:35,Here is my confession. I am a university stude...
6,2017-05-28T08:36:58,Just venting:\nTrying to set your friend up wi...
8,2017-05-28T09:55:31,"Im so rich. Im not bragging, im just so rich, ..."


### shape of the dataframe before the non-english rows removal

In [14]:
df_filtered.shape

(21127, 2)

## Let's Remove non-english content from the text column

In [15]:
english_df = remove_non_english_rows(df_filtered, 'text')

### shape of the dataframe after removing non-english rows

In [16]:
english_df.shape

(20086, 2)

In [17]:
english_df.head(40)

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-05-27T18:37:51,I have a confession to make. I am a very free ...
5,2017-05-28T05:44:35,Here is my confession. I am a university stude...
6,2017-05-28T08:36:58,Just venting:\nTrying to set your friend up wi...
8,2017-05-28T09:55:31,"Im so rich. Im not bragging, im just so rich, ..."
11,2017-05-28T11:30:20,I need to let this out. I am a normal girl but...
13,2017-05-29T08:23:37,"['📊 ', {'type': 'bold', 'text': 'Please choose..."
14,2017-05-29T14:27:16,I need to get laid so bad right now. Thank you.
15,2017-05-29T22:27:13,['Hello this is unihorse 🐴.\nMoshi Moshi membe...
16,2017-05-30T13:22:39,I have a confession to make. I am a girl with ...
17,2017-06-12T19:34:05,I have a confession to make. I got a call the ...


In [18]:
import emoji
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Apply the function to the 'text' column
english_df['text'] = english_df['text'].apply(lambda x: remove_emojis(x) if isinstance(x, str) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_df['text'] = english_df['text'].apply(lambda x: remove_emojis(x) if isinstance(x, str) else x)


In [19]:
english_df.tail()

Unnamed: 0_level_0,date,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
23804,2024-11-24T05:00:31,"['Hey Unihorse \nI am ', {'type': 'mention_nam..."
23805,2024-11-24T05:00:35,['Hey Unihorse \nI am Ketie\nI need to vent\n...
23806,2024-11-24T05:00:45,"[""Hey Unihorse \nHide my Identity\nI need to v..."
23807,2024-11-24T05:00:46,"[""Hey Unihorse \nHide my Identity\nI need to v..."
23808,2024-11-24T05:00:47,['Hey Unihorse \nHide my Identity\nI need to v...


In [20]:
english_df.to_csv('../data/cleaned_messages.csv', index=False)