**Sentiment Analysis**

In [None]:
# !pip install datasets

In [None]:
# !huggingface-cli login

In [None]:
# !pip install numpy==1.26.4 scipy==1.11.4 gensim==4.3.1 tsfresh==0.20.0 tensorflow==2.15

In [None]:
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

In [None]:
import pandas as pd

# Hugging Face Hub location of the Indian financial news CSV.
NEWS_CSV_URI = "hf://datasets/kdave/Indian_Financial_News/training_data_26000.csv"

# Read the full CSV into a DataFrame; the sentiment cells below use its
# 'Summary' and 'Sentiment' columns.
df = pd.read_csv(NEWS_CSV_URI)

In [None]:
# pd.set_option('display.max_colwidth', None)
df.head()

In [None]:
# Encode the text sentiment as integer class ids: Negative -> 0, Positive -> 2,
# and every other value (including missing ones) -> 1.
SENTIMENT_TO_ID = {"Negative": 0, "Positive": 2}
FALLBACK_SENTIMENT_ID = 1

# Guard against re-running the cell: once the column holds integers, none of
# them equal "Positive"/"Negative", so a second pass would silently rewrite
# every label to 1.
if not pd.api.types.is_integer_dtype(df["Sentiment"]):
    df["Sentiment"] = (
        df["Sentiment"]
        .map(SENTIMENT_TO_ID)           # unmapped values become NaN ...
        .fillna(FALLBACK_SENTIMENT_ID)  # ... and then fall back to class 1
        .astype("int64")
    )
df.head()

In [None]:
df["Sentiment"].value_counts()

In [None]:
import re
import nltk
import numpy as np

In [None]:
from transformers import AutoConfig

# Checkpoint shared by the configuration and the model weights.
BERT_CHECKPOINT = "bert-base-uncased"

# Three output classes, matching the 0/1/2 sentiment ids encoded earlier.
config = AutoConfig.from_pretrained(BERT_CHECKPOINT, num_labels=3)

# Build the TensorFlow sequence-classification model from the checkpoint,
# using the configuration above.
bert_model = TFAutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT, config=config)

In [None]:
# Load the tokenizer for the same "bert-base-uncased" checkpoint as the model.
bert_tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test split; random_state pins the shuffle so
# the same split is produced on every run.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
def preprocess_function(batch):
    """Tokenize a batch of summaries and attach their sentiment labels.

    Args:
        batch: Mapping with a 'Summary' entry (texts handed to ``bert_tokenizer``)
            and a 'Sentiment' entry (labels, passed through unchanged).

    Returns:
        dict with 'input_ids', 'attention_mask' and 'token_type_ids' taken from
        the tokenizer output, plus 'labels'.
    """
    # truncation + padding="max_length" bring every sequence to exactly 128 tokens.
    encoded = bert_tokenizer(batch["Summary"], max_length=128, padding="max_length", truncation=True)

    features = {name: encoded[name] for name in ("input_ids", "attention_mask", "token_type_ids")}
    features["labels"] = batch["Sentiment"]
    return features

In [None]:
# Install the datasets library if you haven't already
# !pip install datasets

from datasets import Dataset

# Wrap the pandas splits in Hugging Face Datasets under their own names.
# df_train / df_test stay DataFrames, so this cell can be re-run safely:
# rebinding them to Dataset objects would make a second
# Dataset.from_pandas call fail and would reuse one name for two kinds of object.
sentiment_train_ds = Dataset.from_pandas(df_train)
sentiment_test_ds = Dataset.from_pandas(df_test)

# Tokenize in batches with preprocess_function.
df_preprocessed_train = sentiment_train_ds.map(preprocess_function, batched=True)
df_preprocessed_test = sentiment_test_ds.map(preprocess_function, batched=True)

In [None]:
df_preprocessed_train

In [None]:
df_preprocessed_test

In [None]:
import tensorflow as tf
# from tensorflow import keras

In [None]:
def _to_tf_dataset(encoded_split, batch_size=16, shuffle=False, seed=42):
    """Build a batched tf.data.Dataset of (features, labels) from a tokenized split.

    Args:
        encoded_split: Object indexable by column name that provides
            'input_ids', 'attention_mask', 'token_type_ids' and 'labels'.
        batch_size: Number of examples per batch.
        shuffle: If True, shuffle the examples before batching.
        seed: Seed passed to the shuffle, for repeatable runs.

    Returns:
        A tf.data.Dataset yielding ``(feature_dict, labels)`` batches.
    """
    features = {
        name: encoded_split[name]
        for name in ("input_ids", "attention_mask", "token_type_ids")
    }
    labels = encoded_split["labels"]
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # Buffer covers the whole split so the shuffle is uniform; shuffling
        # happens before batching so batch composition changes between epochs.
        dataset = dataset.shuffle(buffer_size=len(labels), seed=seed)
    return dataset.batch(batch_size)


# Keras does not shuffle tf.data inputs itself, so shuffle the training split here.
train_dataset = _to_tf_dataset(df_preprocessed_train, shuffle=True)

# Evaluation order does not matter; keep the test split deterministic.
test_dataset = _to_tf_dataset(df_preprocessed_test)

In [None]:
# Accuracy over integer class labels, reported under the key "accuracy".
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]

# Sparse cross-entropy over integer labels. from_logits=True treats the model
# outputs as unnormalized scores; reduction="none" keeps one loss value per example.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction="none",
    from_logits=True,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

bert_model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

In [None]:
# Fine-tune for 5 epochs. The held-out test split is passed as validation data,
# which is what produces the val_* entries read from `history` below.
history=bert_model.fit(train_dataset,epochs=5,validation_data=test_dataset)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training vs. validation accuracy per epoch, read from the Keras History object.
fig, ax = plt.subplots()
for metric_key in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[metric_key])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch, read from the Keras History object.
fig, ax = plt.subplots()
for metric_key in ('loss', 'val_loss'):
    ax.plot(history.history[metric_key])
ax.set_title('Model loss')
ax.set_ylabel('loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Persist the fine-tuned classifier with Keras' save().
# NOTE(review): the summarization model later in this notebook is saved with
# save_pretrained() instead; confirm this .keras file reloads as expected.
bert_model.save('bert_sentiment_financial_model.keras')

In [None]:
# Save the tokenizer files next to the model so both can be restored together.
bert_tokenizer.save_pretrained('bert_financial_tokenizer')

In [None]:
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     # You can add more metrics here, e.g., accuracy, precision, recall, F1-score
#     from sklearn.metrics import accuracy_score
#     accuracy = accuracy_score(labels, preds)
#     return {"accuracy": accuracy}

In [None]:
# !cp -r bert_sentiment_financial_model.keras /content/drive/MyDrive/

**Text Summarization**

In [None]:
# Load model directly
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer

bart_tokenizer = AutoTokenizer.from_pretrained("ainize/bart-base-cnn")
bart_model = AutoModelForSeq2SeqLM.from_pretrained("ainize/bart-base-cnn")

In [None]:
import pandas as pd

# Re-read the same CSV used in the sentiment section. This rebinds `df` to a
# fresh copy, discarding the integer-encoded 'Sentiment' column created earlier.
df = pd.read_csv("hf://datasets/kdave/Indian_Financial_News/training_data_26000.csv")

In [None]:
# pd.set_option('display.max_colwidth', None)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Same 80/20 split and seed as the sentiment section; df_train / df_test are
# rebound here to pandas DataFrames built from the freshly loaded `df`.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
def summary_preprocess_function(batch):
    """Tokenize article/summary pairs into seq2seq training features.

    Args:
        batch: Mapping with a 'Content' entry (source texts) and a 'Summary'
            entry (target texts), both handed to ``bart_tokenizer``.

    Returns:
        dict with the source 'input_ids' and 'attention_mask', plus 'labels':
        the target token ids with every pad-token id replaced by -100.
    """
    pad_id = bart_tokenizer.pad_token_id
    # Source and target share the same settings: cut/pad to exactly 128 tokens.
    tokenizer_kwargs = dict(truncation=True, padding="max_length", max_length=128)

    encoded_source = bart_tokenizer(batch['Content'], **tokenizer_kwargs)
    encoded_target = bart_tokenizer(batch['Summary'], **tokenizer_kwargs)

    # Swap padding ids for -100 — presumably so padded positions are ignored
    # by the training loss (the usual ignore index); confirm against the model.
    masked_labels = []
    for token_ids in encoded_target["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in token_ids]
        )

    return {
        "input_ids": encoded_source["input_ids"],
        "attention_mask": encoded_source["attention_mask"],
        "labels": masked_labels,
    }


In [None]:
# Install the datasets library if you haven't already
# !pip install datasets

from datasets import Dataset


def _as_hf_dataset(split):
    """Return `split` as a Hugging Face Dataset, converting a pandas DataFrame if needed."""
    return split if isinstance(split, Dataset) else Dataset.from_pandas(split)


# train_test_split returns pandas DataFrames, whose .map is not the batched
# Hugging Face Dataset.map, so each split must be wrapped in a Dataset before
# tokenizing. Separate names leave df_train / df_test untouched so this cell
# can be re-run.
summary_train_ds = _as_hf_dataset(df_train)
summary_test_ds = _as_hf_dataset(df_test)

df_summary_preprocessed_train = summary_train_ds.map(summary_preprocess_function, batched=True)
df_summary_preprocessed_test = summary_test_ds.map(summary_preprocess_function, batched=True)

In [None]:
df_summary_preprocessed_train

In [None]:
df_summary_preprocessed_test

In [None]:
# Trainer configuration for BART fine-tuning.
training_args = TrainingArguments(
    # NOTE(review): absolute, Colab-style path; Trainer output such as
    # checkpoints is written here — consider a relative directory.
    output_dir="/content",
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # Let the Trainer drop dataset columns the model does not consume.
    remove_unused_columns=True,
    # Disable external experiment-tracking integrations.
    report_to="none",
)

In [None]:
# Wire the BART model, the training arguments and the tokenized splits together.
# The held-out split is registered as eval_dataset, so trainer.evaluate() scores it.
trainer=Trainer(
    model=bart_model,
    args=training_args,
    train_dataset=df_summary_preprocessed_train,
    eval_dataset=df_summary_preprocessed_test
)

In [None]:
# Run fine-tuning with the settings in training_args (3 epochs, batch size 16 per device).
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the eval_dataset passed to the Trainer
# (the held-out split).
eval_results = trainer.evaluate()

# Print the evaluation results returned by the Trainer.
print(eval_results)

In [None]:
# Save the fine-tuned BART model into the 'bart_summary_financial_model'
# directory using the save_pretrained method.
bart_model.save_pretrained('bart_summary_financial_model')

In [None]:
# Save the matching tokenizer files into their own directory.
bart_tokenizer.save_pretrained('bart_financial_tokenizer')

In [None]:
# Copy the saved model directory to Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before running.
!cp -r bart_summary_financial_model /content/drive/MyDrive/

In [None]:
# Copy the saved tokenizer directory to Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before running.
!cp -r bart_financial_tokenizer /content/drive/MyDrive/