In [30]:
!pip install -q contractions scikit-learn Sastrawi googletrans==4.0.0-rc1 langdetect pandas matplotlib yfinance tensorflow xgboost

# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup
import re
import unicodedata
import nltk
import contractions
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from wordcloud import WordCloud
from collections import Counter
from langdetect import detect
from googletrans import Translator
from sklearn.metrics import r2_score
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit

In [31]:
# Load  dataset
url = 'https://raw.githubusercontent.com/22bayusetia/PyCuan/main/Sentiment%20Analysis/data_finance.csv'
# url = 'https://raw.githubusercontent.com/hairulysin/Algoritma_Academy/main/dataset.csv'
df = pd.read_csv(url, delimiter=',', encoding='latin-1', header=None)
df = df.drop(0)
df.columns = ['label', 'en_text', 'id_text']
df = df[['label', 'id_text', 'en_text']]
# df.info()

In [32]:
nltk.download('stopwords')
nltk.download('punkt')

# Load stopwords for Indonesian
indonesian_stopwords = set(nltk.corpus.stopwords.words('indonesian'))

# Load Sastrawi stemmer and stopword remover
factory1 = StopWordRemoverFactory()
stopword_sastrawi = factory1.create_stop_word_remover()

factory2 = StemmerFactory()
stemmer_sastrawi = factory2.create_stemmer()

# Data preprocessing functions

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def stopwords_removal(words, language):
    if language == 'english':
        list_stopwords = nltk.corpus.stopwords.words('english')
    elif language == 'indonesian':
        list_stopwords = indonesian_stopwords
    return [word for word in words if word not in list_stopwords]

def preprocess_text_sastrawi(text):
    tokens = nltk.word_tokenize(text)
    tokens = [stopword_sastrawi.remove(token) for token in tokens]
    # tokens = [stemmer_sastrawi.stem(token) for token in tokens if token != '']
    return " ".join(tokens)

def pre_process_text(text, language):
    text = text.lower()
    text = strip_html_tags(text)
    text = text.translate(text.maketrans("\n\t\r", "   "))
    text = remove_accented_chars(text)
    text = contractions.fix(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, re.I | re.A)
    text = re.sub(' +', ' ', text)
    text = preprocess_text_sastrawi(text) if language == 'indonesian' else text
    return text

# Apply data preprocessing
df['en_text'] = df['en_text'].apply(lambda x: pre_process_text(x, 'english'))
df['id_text'] = df['id_text'].apply(lambda x: pre_process_text(x, 'indonesian'))
df = df.drop_duplicates(subset=['en_text', 'id_text'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


In [35]:
def perform_sentiment_analysis(df):
    # Upsample the minority class after train-validation-test split
    data_majority = df[df['label'] == "positive"]
    data_minority = df[df['label'] == "negative"]

    data_minority_upsampled = resample(data_minority,
                                       replace=True,
                                       n_samples=data_majority.shape[0],
                                       random_state=123)

    df_balance_upsampled = pd.concat([data_majority, data_minority_upsampled])
    df_balanced_upsampled = df.drop_duplicates(subset=['en_text', 'id_text'])

    # Split data (80:10:10)
    X_train, X_temp, y_train, y_temp = train_test_split(df_balance_upsampled.en_text,
                                                        df_balance_upsampled.label,
                                                        test_size=0.2,
                                                        random_state=42)

    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Convert text sequences to dense feature vectors using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(max_features=3000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the Random Forest classifier with validation set
    rf_classifier.fit(X_train_tfidf, y_train)
    y_pred = rf_classifier.predict(X_test_tfidf)

    # Evaluate the model on training, validation, and test sets
    train_acc = rf_classifier.score(X_train_tfidf, y_train)
    val_acc = rf_classifier.score(X_val_tfidf, y_val)
    test_acc = rf_classifier.score(X_test_tfidf, y_test)

    # Metrics Evaluation
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Sentiment Prediction Distribution
    predicted_sentiments = rf_classifier.predict(X_test_tfidf)
    positive_percentage = (predicted_sentiments == 'positive').sum() / len(predicted_sentiments) * 100
    negative_percentage = 100 - positive_percentage

    return train_acc, val_acc, test_acc, precision, recall, f1, positive_percentage, negative_percentage

sentiment_results = perform_sentiment_analysis(df)

In [40]:
import yfinance as yf
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from math import sqrt

# Function for time series forecasting
def perform_time_series_forecasting(df, stock_symbol, start_date, end_date, seq_length=30, forecast_days=5):
    # Download historical stock data
    df_stock = yf.download(stock_symbol, start=start_date, end=end_date)
    ts = df_stock['Open'].values

    # Standardize the time series data
    scaler = StandardScaler()
    ts_scaled = scaler.fit_transform(np.array(ts).reshape(-1, 1))

    # Prepare training data
    X_train = []
    y_train = []

    for i in range(len(ts_scaled) - seq_length):
        X_train.append(ts_scaled[i:i + seq_length])
        y_train.append(ts_scaled[i + seq_length])

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    train_size = int(len(X_train) * 0.8)
    X_train, X_test = X_train[:train_size], X_train[train_size:]
    y_train, y_test = y_train[:train_size], y_train[train_size:]

    # Build and train LSTM model
    model = keras.Sequential()
    model.add(LSTM(128, activation='relu', return_sequences=True, input_shape=(seq_length, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(128, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

    # Generate forecast for the next 'forecast_days'
    X_forecast = np.copy(X_test[-1])
    forecasted_values = []

    for _ in range(forecast_days):
        forecasted_value = model.predict(X_forecast.reshape(1, seq_length, 1))
        forecasted_values.append(forecasted_value[0, 0])

        X_forecast = np.roll(X_forecast, -1)
        X_forecast[-1] = forecasted_value

    forecasted_values = scaler.inverse_transform(np.array(forecasted_values).reshape(-1, 1))

    # Evaluate the forecasting results
    rmse_test = sqrt(mean_squared_error(y_test, model.predict(X_test)))
    mae_test = mean_absolute_error(y_test, model.predict(X_test))

    # Return the forecasting results
    weighted_metric = (rmse_test + mae_test) / 2
    last_date = df_stock.index[-1]
    forecast_dates = pd.date_range(last_date, periods=forecast_days + 1)[1:]

    return weighted_metric, forecast_dates, forecasted_values

# Definisi variabel-variabel
stock_symbol = 'AAPL'
start_date = '2022-01-01'
end_date = '2022-12-31'

# Panggil fungsi perform_time_series_forecasting dengan data frame df
time_series_results = perform_time_series_forecasting(df, stock_symbol, start_date, end_date)


[*********************100%%**********************]  1 of 1 completed
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50


In [43]:
def combine_weights(sentiment_weight, time_series_weight, sentiment_weight_ratio=0.65):
    combined_weight = (sentiment_weight_ratio * sentiment_weight) + ((1 - sentiment_weight_ratio) * time_series_weight)
    return combined_weight

sentiment_weight = perform_sentiment_analysis(df)

time_series_weight, _, _ = perform_time_series_forecasting(df, stock_symbol, start_date, end_date)

final_weight = combine_weights(sentiment_weight, time_series_weight)

print("Weight from Sentiment Analysis:", sentiment_weight)
print("Weight from Time Series Forecasting:", time_series_weight)
print("Combined Weight:", final_weight)


[*********************100%%**********************]  1 of 1 completed
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Weight from Sentiment Analysis: 0.984287283837274
Weight from Time Series Forecasting: 0.365977414918195
Combined Weight: 0.7678788297155964
