# Import libraries

- download stopwords, punkt, and wordnet
- stopwords is for import stopwords
- punkt is for import word_tokenize
- wordnet is for import WordNetLemmatizer

In [53]:
import tensorflow as tf
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load dataset

In [54]:
df = pd.read_csv('Stress.csv')
df.head(10)

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.8,1535935605
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.6,1516429555
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.8,1539809005
5,relationships,7tx7et,"(30, 35)",I think he doesn't want to put in the effort f...,1,1.0,1517274027
6,domesticviolence,7iphly,"[25, 30]",It was a big company so luckily I didn't have ...,0,0.8,1512854409
7,anxiety,5m3k80,"(5, 10)",It cleared up and I was okay but. On Monday ...,1,0.8,1483582174
8,relationships,7nhy1v,"(50, 55)",I actually give an assistant half my emergency...,1,0.6,1514843984
9,assistance,61eiq6,"[15, 20]",I just feel like the street life has fucked my...,1,1.0,1490428087


# Data cleaning
- remove stopwords
- lemmatization
- remove punctuation

```
df['text'] = [word_tokenize(text) for text in df['text']]

for index,text in enumerate(df['text']):
  df['text'][index] = [word for word in text if word.lower() not in stop_words]

```



In [55]:
# tokenize words
df['text'] = df['text'].apply(word_tokenize)

# remove stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda text: [word.lower() for word in text if word.lower() not in stop_words])

# implement lemmatization
# comment this if you want to
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda text: [lemmatizer.lemmatize(word) for word in text])

# remove punctuation
df['text'] = df['text'].apply(lambda text: [word for word in text if word.isalnum()])
print(df['text'])

0       [said, felt, way, suggeted, go, rest, trigger,...
1       [hey, sure, right, place, post, go, currently,...
2       [mom, hit, newspaper, shocked, would, know, li...
3       [met, new, boyfriend, amazing, kind, sweet, go...
4       [october, domestic, violence, awareness, month...
                              ...                        
2833    [week, ago, precious, ignored, jan, 1, happy, ...
2834    [ability, cope, anymore, trying, lot, thing, t...
2835    [case, first, time, reading, post, looking, pe...
2836    [find, normal, good, relationship, main, probl...
2837    [talking, mom, morning, said, sister, trauma, ...
Name: text, Length: 2838, dtype: object


# Split dataset
in this dataset there is a column named text and label
- text : what people said in each subreddit
- label : indicator if someone is stressed based by the text (0 -> not stressed, 1 -> stressed)

In [56]:
kalimat = df['text'].values
label = df['label'].values
print(kalimat[0], "\n", label[0])

['said', 'felt', 'way', 'suggeted', 'go', 'rest', 'trigger', 'ahead', 'youi', 'hypocondriac', 'like', 'decide', 'look', 'feeling', 'doom', 'hope', 'maybe', 'getting', 'sucked', 'rabbit', 'hole', 'ludicrous', 'conspiracy', 'stupid', 'psychic', 'test', 'new', 'age', 'something', 'could', 'even', 'laugh', 'road', 'ended', 'reading', 'sense', 'doom', 'indicative', 'various', 'health', 'ailment', 'one', 'prone', 'top', 'doom', 'gloom', 'worried', 'heart', 'happen', 'physical', '48', 'hour'] 
 1


# Split data into train and test

- random_state is used so that you will get the same result as i do

In [57]:
kalimat_latih, kalimat_test, label_latih, label_test = train_test_split(kalimat, label, test_size=0.2, random_state=42)

# Tokenization

In [61]:
tokenizer = Tokenizer(num_words=150000, oov_token='x')
tokenizer.fit_on_texts(kalimat_latih)

sekuens_latih = tokenizer.texts_to_sequences(kalimat_latih)
sekuens_test = tokenizer.texts_to_sequences(kalimat_test)

padded_latih = pad_sequences(sekuens_latih)
padded_test = pad_sequences(sekuens_test)

print(f"Kalimat latih: {kalimat_latih} \n")
print(f"Sekuens latih: {sekuens_latih} \n")
print(f"Padded latih: {padded_latih} \n")

Kalimat latih: [list(['tell', 'anyone', 'even', 'family', 'felt', 'protect', 'already', 'insolved', 'cps', 'trust', 'though', 'awesome', 'knew', 'brought', 'abuse', 'case', 'would', 'inevitably', 'send', 'unnecessary', 'path', 'social', 'worker', 'blame', 'abuser', 'jump', 'conclusion', 'negligent', 'parent', 'parent', 'irresponsible', 'enough', 'know', 'going', 'know', 'didnt', 'know', 'could', 'know', 'went', 'hour', 'assumed', 'hanging', 'friend', 'least', 'planned', 'assumed', 'went', 'school', 'got', 'education', 'beating', 'teacher', 'room'])
 list(['stop', 'productive', 'thing', 'talking', 'people', 'remembering', 'appointment', 'etc', 'unemployed', 'almost', 'broke', 'looking', 'work', 'causing', 'feel', 'like', 'ca', 'swallow', 'get', 'full', 'breath', 'find', 'played', 'video', 'game', 'day', 'surfed', 'reddit', 'morning', 'read', 'review', 'book', 'coming', 'almost', '2', 'hour', 'med', 'though', 'trying', 'gaba', 'since', 'anyway', 'make', 'even', 'fun', 'sister', 'mad', 'c