# Sentiment Analysis on Economic News (Baseline)

This notebook tokenizes pre-cleaned Indonesian economic news text, removes stopwords,
and applies a lexicon-based baseline sentiment score to understand public narratives on economic issues.


## 1. Load Libraries and Data

In [1]:
import pandas as pd

# Load the pre-cleaned news text from CSV into the working frame `df`.
df = pd.read_csv("clean_text_data.csv")
# Preview the first rows; the recorded output shows a `clean_text` column.
df.head()

Unnamed: 0,clean_text
0,kkp setor pnbp rp m disokong izin pemanfaatan ...
1,bukan rp ribu gus ipul usul purbaya tambah ban...
2,amran bongkar ton bawang bombai ilegal selundu...
3,viva yoga koperasi tingkatkan aktivitas ekonom...
4,purbaya beri kredit rp t untuk industri furnit...


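## 2. Text Cleaning

The text is loaded already cleaned (the `clean_text` column of `clean_text_data.csv`), so no further cleaning is applied here; this section only inspects the loaded DataFrame.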



In [12]:
# NOTE(review): the recorded output of this cell comes from execution count 12, i.e. it
# was run after the scoring cells, and therefore lists `tokens_clean`, `sentiment_score`
# and `sentiment`. On a fresh top-to-bottom run, `df` at this point is the frame loaded
# from clean_text_data.csv, so the summary will differ. Re-run the notebook in order
# before relying on this output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tokens_clean     100 non-null    object
 1   sentiment_score  100 non-null    int64 
 2   sentiment        100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


## 3. Tokenization and Stopword Removal

In [2]:
# Split each cleaned document on whitespace into a list of tokens. The vectorised
# `.str.split()` with no arguments splits exactly like `str.split()`. `fillna("")`
# makes a missing text (pandas reads an empty CSV field as NaN) yield an empty token
# list instead of raising AttributeError on a float.
df["tokens"] = df["clean_text"].fillna("").str.split()
df[["tokens"]].head()

Unnamed: 0,tokens
0,"[kkp, setor, pnbp, rp, m, disokong, izin, pema..."
1,"[bukan, rp, ribu, gus, ipul, usul, purbaya, ta..."
2,"[amran, bongkar, ton, bawang, bombai, ilegal, ..."
3,"[viva, yoga, koperasi, tingkatkan, aktivitas, ..."
4,"[purbaya, beri, kredit, rp, t, untuk, industri..."


In [3]:
!pip install nltk



In [4]:
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords

# Indonesian stopword list, held as a set so membership checks are cheap.
stop_words = set(stopwords.words("indonesian"))


def _without_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept


# Filter every document's token list and store the result in a new column.
df["tokens_clean"] = df["tokens"].apply(_without_stopwords)

df[["tokens_clean"]].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tokens_clean
0,"[kkp, setor, pnbp, rp, m, disokong, izin, pema..."
1,"[rp, ribu, gus, ipul, usul, purbaya, bantuan, ..."
2,"[amran, bongkar, ton, bawang, bombai, ilegal, ..."
3,"[viva, yoga, koperasi, tingkatkan, aktivitas, ..."
4,"[purbaya, kredit, rp, t, industri, furnitur, t..."


In [5]:
# Keep only the stopword-filtered tokens and write them to CSV without the row index.
# CSV has no list type, so each token list is stored as its Python string form
# (e.g. "['kkp', 'setor', ...]") and must be parsed again after the file is read back.
df_tokens = df[["tokens_clean"]]
df_tokens.to_csv("tokenized_text.csv", index=False)


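## 4. Baseline Sentiment Scoring

Each document is scored as the number of tokens found in the positive word list minus the number found in the negative word list, and then labelled by the sign of that score.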


In [6]:
import pandas as pd

# Read the tokenized file back. This rebinds `df`, so the `clean_text` and `tokens`
# columns created in earlier cells are no longer part of it.
# After the CSV round trip `tokens_clean` holds strings such as "['kkp', 'setor', ...]"
# (see the quoted entries in the preview), not Python lists.
df = pd.read_csv("tokenized_text.csv")
df.head()


Unnamed: 0,tokens_clean
0,"['kkp', 'setor', 'pnbp', 'rp', 'm', 'disokong'..."
1,"['rp', 'ribu', 'gus', 'ipul', 'usul', 'purbaya..."
2,"['amran', 'bongkar', 'ton', 'bawang', 'bombai'..."
3,"['viva', 'yoga', 'koperasi', 'tingkatkan', 'ak..."
4,"['purbaya', 'kredit', 'rp', 't', 'industri', '..."


In [7]:
# Seed lexicon of positive cues (English glosses in the trailing comments).
positive_words = [
    "naik",       # rise
    "stabil",     # stable
    "tumbuh",     # grow
    "positif",    # positive
    "baik",       # good
    "meningkat",  # increase
    "aman",       # safe
    "kuat",       # strong
]

# Seed lexicon of negative cues (English glosses in the trailing comments).
negative_words = [
    "turun",    # fall
    "krisis",   # crisis
    "mahal",    # expensive
    "risiko",   # risk
    "negatif",  # negative
    "lemah",    # weak
    "ancam",    # threaten
    "tekan",    # press / pressure
]

In [8]:
import ast

def sentiment_score(tokens, positive=None, negative=None):
    """Score one document as positive-lexicon hits minus negative-lexicon hits.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document. Pass real tokens, not the string form of a
        list: iterating a string yields single characters. One-shot iterators are
        supported because the tokens are traversed only once.
    positive, negative : iterable of str, optional
        Lexicons to match against. When omitted, the notebook-level
        ``positive_words`` / ``negative_words`` are used, so existing
        ``sentiment_score(tokens)`` calls behave as before.

    Returns
    -------
    int
        Count of positive matches minus count of negative matches; every
        occurrence of a lexicon word counts. ``0`` for an empty document.
    """
    # Resolve defaults at call time (no mutable default arguments) and use
    # frozensets so each membership check is a hash lookup.
    pos_lexicon = frozenset(positive_words if positive is None else positive)
    neg_lexicon = frozenset(negative_words if negative is None else negative)

    score = 0
    for word in tokens:
        if word in pos_lexicon:
            score += 1
        if word in neg_lexicon:
            score -= 1
    return score

# Tokens read back from CSV are the string form of a list, so parse them into real
# lists. Values that are not strings are left untouched: ast.literal_eval raises
# ValueError when handed an already-parsed list, which previously made this cell fail
# when it was run a second time.
df["tokens_clean"] = df["tokens_clean"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# One integer score per document (positive hits minus negative hits).
df["sentiment_score"] = df["tokens_clean"].apply(sentiment_score)

df[["sentiment_score"]].head()


Unnamed: 0,sentiment_score
0,0
1,0
2,0
3,0
4,0


In [9]:
def sentiment_label(score):
    """Translate a numeric sentiment score into a label based on its sign.

    Returns ``"positive"`` for scores above zero, ``"negative"`` for scores
    below zero, and ``"neutral"`` for anything else.
    """
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"

# Label every document from the sign of its score, then count documents per label.
# NOTE(review): in the recorded run 92 of 100 documents are neutral; presumably the
# small exact-match word lists rarely fire on these tokens — confirm before
# interpreting the label distribution.
df["sentiment"] = df["sentiment_score"].apply(sentiment_label)
df["sentiment"].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,92
positive,4
negative,4


In [11]:
# Write the scored frame to CSV without the row index. As with any CSV export, the
# `tokens_clean` lists are stored as their string form.
df.to_csv("baseline_sentiment_result.csv", index=False)
