## **Cleaning Datasets**

In [None]:
# Mount Google Drive so the dataset CSVs under /content/drive/MyDrive can be
# read and written by path in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the two scraped Reddit post files: df1 <- "normal" posts, df2 <- "happy" posts
normal_posts_csv = '/content/drive/MyDrive/Capstone_Datasets/reddit_normal_posts.csv'
happy_posts_csv = '/content/drive/MyDrive/Capstone_Datasets/reddit_happy_posts.csv'

df1 = pd.read_csv(normal_posts_csv)
df2 = pd.read_csv(happy_posts_csv)

In [None]:
# Shuffle the rows. A fixed random_state makes the shuffled order (and every
# file derived from it) reproducible across kernel restarts.
df1 = df1.sample(frac=1, random_state=42)
# Tag every row of this frame with its class name
df1["label"] = "normal"
df1.head()  # display dataset

Unnamed: 0,Title,Content,Score,URL,Created_UTC,label
4017,Average German life,,8347,https://www.reddit.com/gallery/193xa3k,1704963000.0,normal
3344,Average female reddit experience,,3495,https://i.redd.it/kmha1g0lpfxc1.jpeg,1714403000.0,normal
1604,average gen Z dating experience,,12042,https://i.redd.it/vcu2jpf5pjbb1.jpg,1689173000.0,normal
4636,Thank You [A Story About a Very Ordinary Couple],,6055,https://www.reddit.com/gallery/1cdle0c,1714138000.0,normal
5221,What a kind and normal person!/s,,4489,https://i.redd.it/rccea2hmhmxc1.jpeg,1714486000.0,normal


In [None]:
# Shuffle the rows. A fixed random_state makes the shuffled order (and every
# file derived from it) reproducible across kernel restarts.
df2 = df2.sample(frac=1, random_state=42)
# Tag every row of this frame with its class name
df2["label"] = "happy"
df2.head()  # display dataset

Unnamed: 0,Title,Content,Score,URL,Created_UTC,label
3381,Photo of a North Korean family being delighted...,,7480,http://i.imgur.com/CBTOh.jpg,1345391000.0,happy
2093,Glad he’s back and feeling all the support 💕,,131902,https://i.redd.it/qdaaufwyy1o91.jpg,1663261000.0,happy
6319,Anya Taylor Joy (recent photos),,393,https://www.reddit.com/gallery/16th53l,1695811000.0,happy
3353,Did you also get pregnant from sitting on a pu...,,4208,https://i.redd.it/82k8nf8kdbnc1.jpeg,1709992000.0,happy
8176,Raden is super delighted that she got noticed ...,,1123,https://i.redd.it/dkiaoyyhucec1.png,1706088000.0,happy


**Labeling**

In [None]:
# Keep only the class label and the post title, exposing the title under the
# column name 'cleaned_text' (the text itself is still the raw title here).
# rename() is chained onto the selection and its returned frame is assigned;
# renaming the selection in place is what raised SettingWithCopyWarning.
df1_new = df1[['label', 'Title']].rename(columns={'Title': 'cleaned_text'})
df2_new = df2[['label', 'Title']].rename(columns={'Title': 'cleaned_text'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_new.rename(columns={'Title': 'cleaned_text'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_new.rename(columns={'Title': 'cleaned_text'}, inplace=True)


In [None]:
# Preview the reduced "normal" frame (label + title renamed to 'cleaned_text')
df1_new.head()

Unnamed: 0,label,cleaned_text
4017,normal,Average German life
3344,normal,Average female reddit experience
1604,normal,average gen Z dating experience
4636,normal,Thank You [A Story About a Very Ordinary Couple]
5221,normal,What a kind and normal person!/s


In [None]:
# Preview the reduced "happy" frame (label + title renamed to 'cleaned_text')
df2_new.head()

Unnamed: 0,label,cleaned_text
3381,happy,Photo of a North Korean family being delighted...
2093,happy,Glad he’s back and feeling all the support 💕
6319,happy,Anya Taylor Joy (recent photos)
3353,happy,Did you also get pregnant from sitting on a pu...
8176,happy,Raden is super delighted that she got noticed ...


**Merging the normal and happy datasets**

In [None]:
# Persist each labeled frame to its own CSV (row index is not written)
for labeled_frame, destination in (
    (df1_new, '/content/drive/MyDrive/Capstone_Datasets/reddit_normal_label_new.csv'),
    (df2_new, '/content/drive/MyDrive/Capstone_Datasets/reddit_happy_label_new.csv'),
):
    labeled_frame.to_csv(destination, index=False)

In [None]:
# Stack the "normal" and "happy" frames; ignore_index=True renumbers rows 0..n-1
combined_df = pd.concat([df1_new, df2_new], ignore_index=True)

In [None]:
# Save the stacked normal + happy frame (row index is not written)
combined_df.to_csv('/content/drive/MyDrive/Capstone_Datasets/merged1.csv', index=False)

In [None]:
# Preview the stacked frame
combined_df.head()

Unnamed: 0,label,cleaned_text
0,normal,Average German life
1,normal,Average female reddit experience
2,normal,average gen Z dating experience
3,normal,Thank You [A Story About a Very Ordinary Couple]
4,normal,What a kind and normal person!/s


In [None]:
# Rename 'cleaned_text' to 'post' so the columns line up with the existing
# dataset that is concatenated below. rename() is chained and its result
# assigned; renaming a column selection in place triggers
# SettingWithCopyWarning.
df_combined_new = combined_df[['label', 'cleaned_text']].rename(columns={'cleaned_text': 'post'})

# **Merging with existing datasets**

In [None]:
# Load the existing merged dataset that the new Reddit rows are appended to
df3 = pd.read_csv('/content/drive/MyDrive/Capstone_Datasets/merged_output.csv')
df3.head()

Unnamed: 0,post,label
0,"I can always feel my heartbeat 18M, physically...",anxiety
1,My dad is hospitalized and had to be put in a ...,anxiety
2,Feels like anxiety is turning my brain to stew...,anxiety
3,What do you do when anxiety kicks in around ot...,anxiety
4,Finding a job is a nightmare I want to rant ab...,anxiety


In [None]:
# Append the existing dataset below the new Reddit rows (columns are matched
# by name) and write the result out without the row index.
frames_to_stack = [df_combined_new, df3]
combined_all_df = pd.concat(frames_to_stack, ignore_index=True)

all_merged_csv = '/content/drive/MyDrive/Capstone_Datasets/all_merged_1.csv'
combined_all_df.to_csv(all_merged_csv, index=False)

In [None]:
# Preview the full merged frame
combined_all_df.head()

Unnamed: 0,label,post
0,normal,Average German life
1,normal,Average female reddit experience
2,normal,average gen Z dating experience
3,normal,Thank You [A Story About a Very Ordinary Couple]
4,normal,What a kind and normal person!/s


In [None]:
# Reload the full merged dataset from the CSV written above
df4 = pd.read_csv('/content/drive/MyDrive/Capstone_Datasets/all_merged_1.csv')

# Number of rows per class label
df4['label'].value_counts()

label
depression    21208
anxiety       19976
lonely        11545
normal        10000
happy         10000
Name: count, dtype: int64

In [None]:
# Drop rows that have no post text before casting: astype(str) would turn a
# missing value into the literal string 'nan', which would then be cleaned and
# kept as if it were a real post.
# The cast is chained (not assigned column by column) so the result is a fresh
# frame and later column assignments do not raise SettingWithCopyWarning.
df4 = (
    df4
    .dropna(subset=['post'])
    .astype({'post': str, 'label': str})
)
print(df4.dtypes)

label    object
post     object
dtype: object


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string class names in 'label' with integer codes; the fitted
# encoder is kept in `label_encoder` so the mapping can be looked up later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df4['label'])
df4['label'] = encoded_labels
print(df4.dtypes)

label     int64
post     object
dtype: object


In [None]:
# Remove the 'normal' class. Its integer code is looked up from the fitted
# encoder instead of being hard-coded as 4, so the intended rows are dropped
# even if the set of labels (and therefore the numbering) changes; an unknown
# label makes transform() raise rather than silently dropping another class.
normal_code = label_encoder.transform(['normal'])[0]
df4 = df4.drop(df4.index[df4['label'] == normal_code])
print(df4['label'].value_counts())

label
1    21208
0    19976
3    11545
2    10000
Name: count, dtype: int64


In [None]:
# Check the label distribution
label_counts = df4['label'].value_counts()
print(label_counts)

# Check the text length (number of characters per post)
df4['text_length'] = df4['post'].map(len)
length_summary = df4['text_length'].describe()
print(length_summary)

# Look at a few example posts
sample_posts = df4['post'].head()
print(sample_posts)

label
1    21208
0    19976
3    11545
2    10000
Name: count, dtype: int64
count    62729.000000
mean       735.255847
std        859.415233
min          3.000000
25%        196.000000
50%        504.000000
75%        977.000000
max      32765.000000
Name: text_length, dtype: float64
10000    Photo of a North Korean family being delighted...
10001         Glad he’s back and feeling all the support 💕
10002                      Anya Taylor Joy (recent photos)
10003    Did you also get pregnant from sitting on a pu...
10004    Raden is super delighted that she got noticed ...
Name: post, dtype: object


In [None]:
import re
import string
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK corpora used below
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stopwords, as a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Initialise the stemmer and the lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every clean_text() call.
_ASCII_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one post for modelling.

    Steps, in order: remove URLs, remove punctuation, remove digits,
    lowercase, drop English stopwords and lemmatize the remaining words.

    Args:
        text: The raw post. Anything that is not a ``str`` (e.g. ``None`` or a
            float NaN from a missing cell) is treated as empty.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` object.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Remove URLs, including scheme-less "www." links
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Remove ASCII punctuation
    text = text.translate(_ASCII_PUNCTUATION_TABLE)
    # string.punctuation is ASCII-only, so typographic marks such as curly
    # quotes, dashes and ellipses would survive; drop every character in a
    # Unicode punctuation category too. Pure-ASCII text skips this pass.
    if not text.isascii():
        text = ''.join(
            ch for ch in text if not unicodedata.category(ch).startswith('P')
        )
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Lowercase before the stopword check (the stopword list is lowercase)
    text = text.lower()
    # Drop stopwords and lemmatize what remains
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Apply the text cleaning to the 'post' column and keep the result alongside it
cleaned_posts = df4['post'].apply(clean_text)
df4['cleaned_text'] = cleaned_posts

# Preview the original text next to the cleaned text
preview = df4[['post', 'cleaned_text']].head()
print(preview)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                    post  \
10000  Photo of a North Korean family being delighted...   
10001       Glad he’s back and feeling all the support 💕   
10002                    Anya Taylor Joy (recent photos)   
10003  Did you also get pregnant from sitting on a pu...   
10004  Raden is super delighted that she got noticed ...   

                                            cleaned_text  
10000  photo north korean family delighted meet aweso...  
10001                   glad he’s back feeling support 💕  
10002                       anya taylor joy recent photo  
10003        also get pregnant sitting public toilet joy  
10004  raden super delighted got noticed favorite rak...  


In [None]:
# Preview the frame with the raw post, its length and the cleaned text
df4.head()

Unnamed: 0,label,post,text_length,cleaned_text
10000,2,Photo of a North Korean family being delighted...,76,photo north korean family delighted meet aweso...
10001,2,Glad he’s back and feeling all the support 💕,44,glad he’s back feeling support 💕
10002,2,Anya Taylor Joy (recent photos),31,anya taylor joy recent photo
10003,2,Did you also get pregnant from sitting on a pu...,63,also get pregnant sitting public toilet joy
10004,2,Raden is super delighted that she got noticed ...,76,raden super delighted got noticed favorite rak...


In [None]:
# Discard the raw text and the length helper; only 'label' and 'cleaned_text' remain
helper_columns = ['text_length', 'post']
df4 = df4.drop(columns=helper_columns)
df4.head()

Unnamed: 0,label,cleaned_text
10000,2,photo north korean family delighted meet aweso...
10001,2,glad he’s back feeling support 💕
10002,2,anya taylor joy recent photo
10003,2,also get pregnant sitting public toilet joy
10004,2,raden super delighted got noticed favorite rak...
