In [1]:
!pip install transformers[torch] accelerate -U
!pip install datasets
!pip install gradio

Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers[torch]
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers[torch])
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

In [2]:
import gradio as gr
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Download NLTK resources (stop-word list and WordNet data for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the FinBERT pipeline and tokenizer.
# NOTE: `tokenizer` is not referenced by any other code in this cell; it is
# kept so the cell still exposes the same module-level names.
pipe = pipeline("text-classification", model="ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Get the list of stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Preprocess text. Defined BEFORE the SVM is fitted so the training texts and
# the texts seen at inference time go through exactly the same cleaning.
def preprocess_text(text):
    """Normalize raw text before it is vectorized / classified.

    Steps: lowercase, drop URLs, drop @mentions and '#' characters, delete every
    remaining character that is neither ASCII-alphanumeric nor whitespace,
    remove English stop words, and lemmatize the remaining tokens.

    Args:
        text: Input string. Non-string values (e.g. NaN from an empty CSV cell)
            raise AttributeError on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Keep ALL whitespace (\s), not just the space character: the previous
    # pattern deleted newlines/tabs and glued neighbouring words together
    # ("rates\nrise" -> "ratesrise"). split() below handles any whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)


# Initialize vectorizer and SVM model
vectorizer = TfidfVectorizer()
svm_model = SVC(probability=True)

# Dummy data for SVM model initialization (for demonstration purposes)
# Replace this with actual training data
texts = ["stock market is bullish", "stock market is bearish", "neutral market today"]
labels = ["bullish", "bearish", "neutral"]

# Clean the dummy data with the same function hybrid_model applies at
# inference time, so the TF-IDF vocabulary matches what transform() will see.
preprocessed_texts = [preprocess_text(text) for text in texts]
X = vectorizer.fit_transform(preprocessed_texts)

# Keep the fitted encoder so integer predictions can be decoded later.
# LabelEncoder sorts class names, so the codes are
# bearish=0, bullish=1, neutral=2.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
svm_model.fit(X, y)

# Hybrid model for sentiment classification
def hybrid_model(text):
    """Classify ``text`` as "bearish", "neutral" or "bullish".

    The class probabilities of the TF-IDF/SVM model and of FinBERT are averaged
    and the class with the highest combined score wins.

    Args:
        text: Raw input string; it is cleaned with ``preprocess_text`` first.

    Returns:
        One of "bearish", "neutral", "bullish", or the string "Error" if any
        step raises (the exception is printed, not propagated).
    """
    try:
        # Preprocess the text
        preprocessed_text = preprocess_text(text)

        # SVM prediction. predict_proba columns follow svm_model.classes_, i.e.
        # the LabelEncoder codes, which are assigned in sorted (alphabetical)
        # order: 0=bearish, 1=bullish, 2=neutral. Reorder them to the
        # [bearish, neutral, bullish] layout used for FinBERT and sentiment_map;
        # without this the SVM's "bullish" and "neutral" scores are swapped.
        tfidf_text = vectorizer.transform([preprocessed_text])
        svm_raw = np.asarray(svm_model.predict_proba(tfidf_text)[0], dtype=float)
        preds_svm = svm_raw[[0, 2, 1]]

        # FinBERT prediction. top_k=None asks for the score of every label; the
        # default returns only the best label, which the normalisation below
        # would turn into a one-hot vector and discard the model's confidence.
        # truncation/max_length keep over-long inputs from raising.
        finbert_result = pipe(
            preprocessed_text, top_k=None, truncation=True, max_length=512
        )
        # Some pipeline versions wrap the per-label list in an outer list.
        if finbert_result and isinstance(finbert_result[0], list):
            finbert_result = finbert_result[0]

        finbert_index = {'neutral': 1, 'positive': 2}
        preds_finbert = np.zeros(3)
        for res in finbert_result:
            # Any label other than positive/neutral is counted as negative,
            # as in the original if/elif/else chain.
            preds_finbert[finbert_index.get(res['label'], 0)] += res['score']

        # Normalize FinBERT scores to sum to 1
        if preds_finbert.sum() > 0:
            preds_finbert /= preds_finbert.sum()

        # Hybrid prediction: plain average of the two probability vectors
        combined_preds = (preds_svm + preds_finbert) / 2
        sentiment_idx = int(np.argmax(combined_preds))

        # Map the prediction to sentiment labels
        sentiment_map = {0: "bearish", 1: "neutral", 2: "bullish"}
        return sentiment_map[sentiment_idx]
    except Exception as e:
        print(f"Error in hybrid_model: {e}")
        return "Error"

# Process the uploaded CSV file and generate sentiment distribution table
def process_csv(file_path):
    """Score every row of a CSV file and summarise the predicted sentiments.

    Args:
        file_path: Path of a CSV file that must contain a 'text' column.

    Returns:
        A 2-tuple ``(rows_html, summary_html)``: the full frame (with an added
        'predicted_sentiment' column) and a Sentiment/Count table, both as HTML.
        On failure the first element is an error message and the second is None.
    """
    try:
        frame = pd.read_csv(file_path)
        if 'text' in frame.columns:
            frame['predicted_sentiment'] = frame['text'].apply(hybrid_model)

            # Sentiment distribution: one row per predicted label
            counts = frame['predicted_sentiment'].value_counts()
            summary = counts.to_frame().reset_index()
            summary.columns = ['Sentiment', 'Count']

            return frame.to_html(), summary.to_html(index=False)

        return "CSV must contain a 'text' column for sentiment analysis.", None
    except Exception as e:
        print(f"Error in process_csv: {e}")
        return f"Error processing CSV: {e}", None

# Interface for file upload and text input
def upload_page(file):
    """Gradio callback for the "Analyze CSV" button.

    Args:
        file: Value delivered by the ``gr.File`` component: a path string, an
            object exposing the path as ``.name``, or None when the button is
            clicked before any file was uploaded.

    Returns:
        The ``(rows_html, summary_html)`` tuple produced by ``process_csv``, or
        ``(message, None)`` when no file was provided.
    """
    # Clicking the button without an upload passes None; report it in the UI
    # instead of failing with AttributeError on `.name`.
    if file is None:
        return "Please upload a CSV file first.", None

    file_path = file if isinstance(file, str) else file.name
    df_html, table_html = process_csv(file_path)
    return df_html, table_html

def analyze_text(text):
    """Gradio callback for the "Analyze Text" button: return hybrid_model's label for ``text``."""
    return hybrid_model(text)

# Create Gradio interface with tabs for file upload and text input.
# Components are created inside the `with` block of the tab they belong to.
with gr.Blocks() as demo:
    gr.Markdown("# Financial Sentiment Analysis")
    # Tab 1: batch scoring of an uploaded CSV
    with gr.Tab("Upload CSV"):
        file_input = gr.File(label="Upload CSV", type="filepath")
        file_output = gr.HTML()   # receives the first value returned by upload_page
        table_output = gr.HTML()  # receives the second value returned by upload_page
        file_button = gr.Button("Analyze CSV")
        file_button.click(upload_page, inputs=file_input, outputs=[file_output, table_output])
    # Tab 2: scoring of a single piece of text
    with gr.Tab("Analyze Text"):
        text_input = gr.Textbox(label="Enter text to analyze sentiment")
        text_output = gr.Textbox(label="Sentiment Classification")
        text_button = gr.Button("Analyze Text")
        text_button.click(analyze_text, inputs=text_input, outputs=text_output)

# Launch the interface (the recorded cell output shows Colab switching on a
# public share link automatically)
demo.launch()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1264f949d7970f78dc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import io
from PIL import Image

# Download NLTK resources (stop-word list and WordNet data for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the FinBERT pipeline and tokenizer.
# NOTE: `tokenizer` is not referenced by any other code in this cell; it is
# kept so the cell still exposes the same module-level names.
pipe = pipeline("text-classification", model="ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Get the list of stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Preprocess text. Defined BEFORE the SVM is fitted so the training texts and
# the texts seen at inference time go through exactly the same cleaning.
def preprocess_text(text):
    """Normalize raw text before it is vectorized / classified.

    Steps: lowercase, drop URLs, drop @mentions and '#' characters, delete every
    remaining character that is neither ASCII-alphanumeric nor whitespace,
    remove English stop words, and lemmatize the remaining tokens.

    Args:
        text: Input string. Non-string values (e.g. NaN from an empty CSV cell)
            raise AttributeError on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Keep ALL whitespace (\s), not just the space character: the previous
    # pattern deleted newlines/tabs and glued neighbouring words together
    # ("rates\nrise" -> "ratesrise"). split() below handles any whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)


# Initialize vectorizer and SVM model
vectorizer = TfidfVectorizer()
svm_model = SVC(probability=True)

# Dummy data for SVM model initialization (for demonstration purposes)
# Replace this with actual training data
texts = ["stock market is bullish", "stock market is bearish", "neutral market today"]
labels = ["bullish", "bearish", "neutral"]

# Clean the dummy data with the same function hybrid_model applies at
# inference time, so the TF-IDF vocabulary matches what transform() will see.
preprocessed_texts = [preprocess_text(text) for text in texts]
X = vectorizer.fit_transform(preprocessed_texts)

# Keep the fitted encoder so integer predictions can be decoded later.
# LabelEncoder sorts class names, so the codes are
# bearish=0, bullish=1, neutral=2.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
svm_model.fit(X, y)

# Hybrid model for sentiment classification
def hybrid_model(text):
    """Classify ``text`` as "bearish", "neutral" or "bullish".

    The class probabilities of the TF-IDF/SVM model and of FinBERT are averaged
    and the class with the highest combined score wins.

    Args:
        text: Raw input string; it is cleaned with ``preprocess_text`` first.

    Returns:
        One of "bearish", "neutral", "bullish", or the string "Error" if any
        step raises (the exception is printed, not propagated).
    """
    try:
        # Preprocess the text
        preprocessed_text = preprocess_text(text)

        # SVM prediction. predict_proba columns follow svm_model.classes_, i.e.
        # the LabelEncoder codes, which are assigned in sorted (alphabetical)
        # order: 0=bearish, 1=bullish, 2=neutral. Reorder them to the
        # [bearish, neutral, bullish] layout used for FinBERT and sentiment_map;
        # without this the SVM's "bullish" and "neutral" scores are swapped.
        tfidf_text = vectorizer.transform([preprocessed_text])
        svm_raw = np.asarray(svm_model.predict_proba(tfidf_text)[0], dtype=float)
        preds_svm = svm_raw[[0, 2, 1]]

        # FinBERT prediction. top_k=None asks for the score of every label; the
        # default returns only the best label, which the normalisation below
        # would turn into a one-hot vector and discard the model's confidence.
        # truncation/max_length keep over-long inputs from raising.
        finbert_result = pipe(
            preprocessed_text, top_k=None, truncation=True, max_length=512
        )
        # Some pipeline versions wrap the per-label list in an outer list.
        if finbert_result and isinstance(finbert_result[0], list):
            finbert_result = finbert_result[0]

        finbert_index = {'neutral': 1, 'positive': 2}
        preds_finbert = np.zeros(3)
        for res in finbert_result:
            # Any label other than positive/neutral is counted as negative,
            # as in the original if/elif/else chain.
            preds_finbert[finbert_index.get(res['label'], 0)] += res['score']

        # Normalize FinBERT scores to sum to 1
        if preds_finbert.sum() > 0:
            preds_finbert /= preds_finbert.sum()

        # Hybrid prediction: plain average of the two probability vectors
        combined_preds = (preds_svm + preds_finbert) / 2
        sentiment_idx = int(np.argmax(combined_preds))

        # Map the prediction to sentiment labels
        sentiment_map = {0: "bearish", 1: "neutral", 2: "bullish"}
        return sentiment_map[sentiment_idx]
    except Exception as e:
        print(f"Error in hybrid_model: {e}")
        return "Error"

# Process the uploaded CSV file and generate sentiment distribution table and chart
def process_csv(file_path):
    """Score every row of a CSV file and summarise the predicted sentiments.

    Args:
        file_path: Path of a CSV file that must contain a 'text' column.

    Returns:
        A 3-tuple ``(rows_html, summary_html, chart)``: the full frame (with an
        added 'predicted_sentiment' column) as HTML, a Sentiment/Count table as
        HTML, and a PIL image of the distribution bar chart. On failure the
        first element is an error message and the other two are None.
    """
    try:
        df = pd.read_csv(file_path)
        if 'text' not in df.columns:
            return "CSV must contain a 'text' column for sentiment analysis.", None, None

        df['predicted_sentiment'] = df['text'].apply(hybrid_model)

        # Generate sentiment distribution table
        sentiment_counts = df['predicted_sentiment'].value_counts()
        summary = sentiment_counts.to_frame().reset_index()
        summary.columns = ['Sentiment', 'Count']
        table_html = summary.to_html(index=False)

        # Generate sentiment distribution chart. The figure is rendered into an
        # in-memory PNG and then closed: pyplot keeps every figure alive until
        # it is closed, so without this each upload leaks one figure.
        fig, ax = plt.subplots(figsize=(3, 2))
        try:
            sentiment_counts.plot(kind='bar', ax=ax)
            ax.set_xlabel('Sentiment', fontsize=8)
            ax.set_ylabel('Count', fontsize=8)
            ax.set_title('Sentiment Distribution', fontsize=10)
            ax.tick_params(axis='both', which='major', labelsize=6)
            fig.tight_layout()
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
        finally:
            plt.close(fig)
        buf.seek(0)
        img = Image.open(buf)

        return df.to_html(), table_html, img
    except Exception as e:
        print(f"Error in process_csv: {e}")
        return f"Error processing CSV: {e}", None, None

# Interface for file upload and text input
def upload_page(file):
    """Gradio callback for the "Analyze CSV" button.

    Args:
        file: Value delivered by the ``gr.File`` component: a path string, an
            object exposing the path as ``.name``, or None when the button is
            clicked before any file was uploaded.

    Returns:
        The ``(rows_html, summary_html, chart)`` tuple produced by
        ``process_csv``, or ``(message, None, None)`` when no file was provided.
    """
    # Clicking the button without an upload passes None; report it in the UI
    # instead of failing with AttributeError on `.name`.
    if file is None:
        return "Please upload a CSV file first.", None, None

    file_path = file if isinstance(file, str) else file.name
    df_html, table_html, img = process_csv(file_path)
    return df_html, table_html, img

def analyze_text(text):
    """Gradio callback for the "Analyze Text" button: return hybrid_model's label for ``text``."""
    return hybrid_model(text)

# Create Gradio interface with tabs for file upload and text input.
# Components are created inside the `with` block of the tab they belong to.
with gr.Blocks() as demo:
    gr.Markdown("# Financial Sentiment Analysis")
    # Tab 1: batch scoring of an uploaded CSV
    with gr.Tab("Upload CSV"):
        file_input = gr.File(label="Upload CSV", type="filepath")
        file_output = gr.HTML()   # receives the first value returned by upload_page
        table_output = gr.HTML()  # receives the second value returned by upload_page
        img_output = gr.Image()   # receives the third value returned by upload_page
        file_button = gr.Button("Analyze CSV")
        file_button.click(upload_page, inputs=file_input, outputs=[file_output, table_output, img_output])
    # Tab 2: scoring of a single piece of text
    with gr.Tab("Analyze Text"):
        text_input = gr.Textbox(label="Enter text to analyze sentiment")
        text_output = gr.Textbox(label="Sentiment Classification")
        text_button = gr.Button("Analyze Text")
        text_button.click(analyze_text, inputs=text_input, outputs=text_output)

# Launch the interface (the recorded cell output shows Colab switching on a
# public share link automatically)
demo.launch()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://3b13ae9e51d39b6ed8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




HERE

In [None]:
import gradio as gr
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import matplotlib.pyplot as plt
import io
from PIL import Image

# Download NLTK resources (stop-word list and WordNet data for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the FinBERT pipeline and tokenizer.
# NOTE(review): `tokenizer` is not referenced by any other code in this cell.
pipe = pipeline("text-classification", model="ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Get the list of stop words (as a set for O(1) membership tests) and the
# lemmatizer; both are read as globals by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the dataset and turn its 'train' split into a DataFrame.
# The code below reads its 'text' column.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
df_train = pd.DataFrame(dataset['train'])

# Preprocess the training data
def preprocess_text(text):
    """Normalize raw text before it is vectorized / classified.

    Steps: lowercase, drop URLs, drop @mentions and '#' characters, delete every
    remaining character that is neither ASCII-alphanumeric nor whitespace,
    remove English stop words, and lemmatize the remaining tokens.

    Args:
        text: Input string. Non-string values (e.g. NaN from an empty CSV cell)
            raise AttributeError on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Keep ALL whitespace (\s), not just the space character: the previous
    # pattern deleted newlines/tabs and glued neighbouring words together
    # ("rates\nrise" -> "ratesrise"). split() below handles any whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)

# Clean every training text with the same function used at inference time
df_train['preprocessed_text'] = df_train['text'].apply(preprocess_text)

# Initialize vectorizer and SVM model
vectorizer = TfidfVectorizer()
svm_model = SVC(probability=True)

# Train on the dataset's real annotations. The previous version fitted the SVM
# on labels drawn with np.random.choice, so its probabilities were pure noise
# (and changed on every run because no seed was set).
# Per the dataset card the integer ids are 0=Bearish, 1=Bullish, 2=Neutral,
# which is the same order LabelEncoder assigns to the sorted class names, so
# `label_encoder` can still be used to decode integer predictions.
label_encoder = LabelEncoder()
label_encoder.fit(['bearish', 'bullish', 'neutral'])
y_train = df_train['label'].to_numpy()
assert set(np.unique(y_train)) <= {0, 1, 2}, "unexpected label ids in training data"

# Vectorize the preprocessed texts
X = vectorizer.fit_transform(df_train['preprocessed_text'])

# Train the SVM model
svm_model.fit(X, y_train)

# Hybrid model for sentiment classification
def hybrid_model(text):
    """Classify ``text`` as "bearish", "neutral" or "bullish".

    The class probabilities of the TF-IDF/SVM model (weight 0.3) and of FinBERT
    (weight 0.7) are combined and the class with the highest score wins.

    Args:
        text: Raw input string; it is cleaned with ``preprocess_text`` first.

    Returns:
        One of "bearish", "neutral", "bullish", or the string "Error" if any
        step raises (the exception is printed, not propagated).
    """
    try:
        # Preprocess the text
        preprocessed_text = preprocess_text(text)

        # SVM prediction. predict_proba columns follow svm_model.classes_, whose
        # integer codes stand for bearish=0, bullish=1, neutral=2 (LabelEncoder
        # assigns codes in sorted order). Reorder them to the
        # [bearish, neutral, bullish] layout used for FinBERT and sentiment_map;
        # without this the SVM's "bullish" and "neutral" scores are swapped.
        tfidf_text = vectorizer.transform([preprocessed_text])
        svm_raw = np.asarray(svm_model.predict_proba(tfidf_text)[0], dtype=float)
        preds_svm = svm_raw[[0, 2, 1]]

        # FinBERT prediction. top_k=None asks for the score of every label; the
        # default returns only the best label, which the normalisation below
        # would turn into a one-hot vector and discard the model's confidence.
        # truncation/max_length keep over-long inputs from raising.
        finbert_result = pipe(
            preprocessed_text, top_k=None, truncation=True, max_length=512
        )
        # Some pipeline versions wrap the per-label list in an outer list.
        if finbert_result and isinstance(finbert_result[0], list):
            finbert_result = finbert_result[0]

        finbert_index = {'neutral': 1, 'positive': 2}
        preds_finbert = np.zeros(3)
        for res in finbert_result:
            # Any label other than positive/neutral is counted as negative,
            # as in the original if/elif/else chain.
            preds_finbert[finbert_index.get(res['label'], 0)] += res['score']

        # Normalize FinBERT scores to sum to 1
        if preds_finbert.sum() > 0:
            preds_finbert /= preds_finbert.sum()

        # Hybrid prediction with adjusted weighted combination
        weight_svm = 0.3  # Adjusted weight for SVM
        weight_finbert = 0.7  # Adjusted weight for FinBERT
        combined_preds = (weight_svm * preds_svm) + (weight_finbert * preds_finbert)
        sentiment_idx = int(np.argmax(combined_preds))

        # Map the prediction to sentiment labels
        sentiment_map = {0: "bearish", 1: "neutral", 2: "bullish"}
        return sentiment_map[sentiment_idx]
    except Exception as e:
        print(f"Error in hybrid_model: {e}")
        return "Error"

# Process the uploaded CSV file and generate sentiment distribution table and chart
def process_csv(file_path):
    """Score every row of a CSV file and summarise the predicted sentiments.

    Args:
        file_path: Path of a CSV file that must contain a 'text' column.

    Returns:
        A 3-tuple ``(rows_html, summary_html, chart)``: the full frame (with an
        added 'predicted_sentiment' column) as HTML, a Sentiment/Count table as
        HTML, and a PIL image of the distribution bar chart. On failure the
        first element is an error message and the other two are None.
    """
    try:
        df = pd.read_csv(file_path)
        if 'text' not in df.columns:
            return "CSV must contain a 'text' column for sentiment analysis.", None, None

        df['predicted_sentiment'] = df['text'].apply(hybrid_model)

        # Generate sentiment distribution table
        sentiment_counts = df['predicted_sentiment'].value_counts()
        summary = sentiment_counts.to_frame().reset_index()
        summary.columns = ['Sentiment', 'Count']
        table_html = summary.to_html(index=False)

        # Generate sentiment distribution chart. The figure is rendered into an
        # in-memory PNG and then closed: pyplot keeps every figure alive until
        # it is closed, so without this each upload leaks one figure.
        fig, ax = plt.subplots(figsize=(3, 2))
        try:
            sentiment_counts.plot(kind='bar', ax=ax)
            ax.set_xlabel('Sentiment', fontsize=8)
            ax.set_ylabel('Count', fontsize=8)
            ax.set_title('Sentiment Distribution', fontsize=10)
            ax.tick_params(axis='both', which='major', labelsize=6)
            fig.tight_layout()
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
        finally:
            plt.close(fig)
        buf.seek(0)
        img = Image.open(buf)

        return df.to_html(), table_html, img
    except Exception as e:
        print(f"Error in process_csv: {e}")
        return f"Error processing CSV: {e}", None, None

# Interface for file upload and text input
def upload_page(file):
    """Gradio callback for the "Analyze CSV" button.

    Args:
        file: Value delivered by the ``gr.File`` component: a path string, an
            object exposing the path as ``.name``, or None when the button is
            clicked before any file was uploaded.

    Returns:
        The ``(rows_html, summary_html, chart)`` tuple produced by
        ``process_csv``, or ``(message, None, None)`` when no file was provided.
    """
    # Clicking the button without an upload passes None; report it in the UI
    # instead of failing with AttributeError on `.name`.
    if file is None:
        return "Please upload a CSV file first.", None, None

    file_path = file if isinstance(file, str) else file.name
    df_html, table_html, img = process_csv(file_path)
    return df_html, table_html, img

def analyze_text(text):
    """Gradio callback for the "Analyze Text" button: return hybrid_model's label for ``text``."""
    return hybrid_model(text)

# Create Gradio interface with tabs for file upload and text input.
# Components are created inside the `with` block of the tab they belong to.
with gr.Blocks() as demo:
    gr.Markdown("# Financial Sentiment Analysis")
    # Tab 1: batch scoring of an uploaded CSV
    with gr.Tab("Upload CSV"):
        file_input = gr.File(label="Upload CSV", type="filepath")
        file_output = gr.HTML()   # receives the first value returned by upload_page
        table_output = gr.HTML()  # receives the second value returned by upload_page
        img_output = gr.Image()   # receives the third value returned by upload_page
        file_button = gr.Button("Analyze CSV")
        file_button.click(upload_page, inputs=file_input, outputs=[file_output, table_output, img_output])
    # Tab 2: scoring of a single piece of text
    with gr.Tab("Analyze Text"):
        text_input = gr.Textbox(label="Enter text to analyze sentiment")
        text_output = gr.Textbox(label="Sentiment Classification")
        text_button = gr.Button("Analyze Text")
        text_button.click(analyze_text, inputs=text_input, outputs=text_output)

# Launch the interface (the recorded cell output shows Colab switching on a
# public share link automatically)
demo.launch()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://1412b7cb4e4a404a6a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Download NLTK resources (stop-word list and WordNet data for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the FinBERT pipeline and tokenizer.
# NOTE: `tokenizer` is not referenced by any other code in this cell; it is
# kept so the cell still exposes the same module-level names.
pipe = pipeline("text-classification", model="ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Get the list of stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Preprocess text. This definition now comes BEFORE its first use: the cell
# previously called preprocess_text while fitting the SVM and only defined it
# afterwards, which raises NameError on a fresh kernel.
def preprocess_text(text):
    """Normalize raw text before it is vectorized / classified.

    Steps: lowercase, drop URLs, drop @mentions and '#' characters, delete every
    remaining character that is neither ASCII-alphanumeric nor whitespace,
    remove English stop words, and lemmatize the remaining tokens.

    Args:
        text: Input string. Non-string values (e.g. NaN from an empty CSV cell)
            raise AttributeError on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Keep ALL whitespace (\s), not just the space character: the previous
    # pattern deleted newlines/tabs and glued neighbouring words together
    # ("rates\nrise" -> "ratesrise"). split() below handles any whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)


# Initialize vectorizer and SVM model
vectorizer = TfidfVectorizer()
svm_model = SVC()

# Dummy data for SVM model initialization (for demonstration purposes)
# Replace this with actual training data
texts = ["stock market is bullish", "stock market is bearish", "neutral market today"]
labels = ["bullish", "bearish", "neutral"]

# Preprocess the dummy data
preprocessed_texts = [preprocess_text(text) for text in texts]
X = vectorizer.fit_transform(preprocessed_texts)

# Keep the fitted encoder so integer predictions can be decoded later.
# LabelEncoder sorts class names, so the codes are
# bearish=0, bullish=1, neutral=2.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
svm_model.fit(X, y)

# Hybrid model for sentiment classification
def hybrid_model(text):
    """Classify ``text`` as "bearish", "neutral" or "bullish".

    SVM decision scores are turned into a probability-like vector with a
    softmax and averaged with FinBERT's per-label scores; the class with the
    highest combined score wins.

    Args:
        text: Raw input string; it is cleaned with ``preprocess_text`` first.

    Returns:
        One of "bearish", "neutral", "bullish", or the string "Error" if any
        step raises (the exception is printed, not propagated).
    """
    try:
        # Preprocess the text
        preprocessed_text = preprocess_text(text)

        # SVM prediction. svm_model is built without probability=True, so only
        # decision_function is available; a softmax puts its scores on the same
        # 0..1 scale as FinBERT (a heuristic, not calibrated probabilities).
        tfidf_text = vectorizer.transform([preprocessed_text])
        svm_scores = np.atleast_2d(svm_model.decision_function(tfidf_text))
        svm_scores = np.asarray(
            ensure_correct_shape(svm_scores, num_classes=3), dtype=float
        )[0]
        exp_scores = np.exp(svm_scores - svm_scores.max())
        # Columns follow svm_model.classes_, i.e. the LabelEncoder codes in
        # sorted order (0=bearish, 1=bullish, 2=neutral). Reorder them to the
        # [bearish, neutral, bullish] layout of sentiment_map.
        preds_svm = (exp_scores / exp_scores.sum())[[0, 2, 1]]

        # FinBERT prediction. Previously only the top score was read and tiled
        # across all three classes, so it added the same constant everywhere
        # and never influenced the argmax. Request every label's score and
        # place each one in the slot of its own class.
        finbert_result = pipe(
            preprocessed_text, top_k=None, truncation=True, max_length=512
        )
        # Some pipeline versions wrap the per-label list in an outer list.
        if finbert_result and isinstance(finbert_result[0], list):
            finbert_result = finbert_result[0]

        finbert_index = {'negative': 0, 'neutral': 1, 'positive': 2}
        preds_finbert = np.zeros(3)
        for res in finbert_result:
            preds_finbert[finbert_index[res['label']]] += res['score']
        if preds_finbert.sum() > 0:
            preds_finbert /= preds_finbert.sum()

        # Hybrid prediction
        combined_preds = (preds_svm + preds_finbert) / 2
        sentiment_idx = int(np.argmax(combined_preds))

        # Map the prediction to sentiment labels
        sentiment_map = {0: "bearish", 1: "neutral", 2: "bullish"}
        return sentiment_map[sentiment_idx]
    except Exception as e:
        print(f"Error in hybrid_model: {e}")
        return "Error"

# Ensure predictions have the correct shape
def ensure_correct_shape(preds, num_classes=3):
    """Return ``preds`` as a 2-D array with exactly ``num_classes`` columns.

    A scalar becomes a 1x1 array and a 1-D array becomes a single column (one
    row per element). If the column count differs from ``num_classes`` the
    columns are repeated side by side until the width matches.

    Args:
        preds: Array-like of scores (0-D, 1-D or 2-D).
        num_classes: Required number of columns.

    Returns:
        numpy.ndarray of shape ``(n_rows, num_classes)``.

    Raises:
        ValueError: If ``preds`` has more than two dimensions, or its column
            count cannot be repeated a whole number of times to reach
            ``num_classes``. Previously such inputs were returned with the
            wrong width, or as an empty array when wider than ``num_classes``.
    """
    preds = np.asarray(preds)
    if preds.ndim == 0:
        preds = preds.reshape(1, 1)
    elif preds.ndim == 1:
        preds = np.expand_dims(preds, axis=-1)
    elif preds.ndim != 2:
        raise ValueError(f"expected at most 2 dimensions, got shape {preds.shape}")

    width = preds.shape[1]
    if width == num_classes:
        return preds
    if width == 0 or num_classes % width != 0:
        raise ValueError(
            f"cannot expand {width} column(s) to {num_classes} classes"
        )
    return np.tile(preds, (1, num_classes // width))

# Process the uploaded CSV file
def process_csv(file_path):
    """Read a CSV file and label each row's 'text' value with hybrid_model.

    Args:
        file_path: Path of a CSV file that must contain a 'text' column.

    Returns:
        The DataFrame with an added 'predicted_sentiment' column, or an error
        message string when the column is missing or any step raises.
    """
    try:
        frame = pd.read_csv(file_path)
        if 'text' in frame.columns:
            frame['predicted_sentiment'] = frame['text'].apply(hybrid_model)
            return frame

        return "CSV must contain a 'text' column for sentiment analysis."
    except Exception as e:
        print(f"Error in process_csv: {e}")
        return "Error processing CSV"

# Interface for file upload and text input
def upload_page(file):
    """Gradio callback for the "Analyze CSV" button.

    Args:
        file: Value delivered by the ``gr.File`` component: a path string, an
            object exposing the path as ``.name``, or None when the button is
            clicked before any file was uploaded.

    Returns:
        HTML of the scored DataFrame, or a plain message when no file was
        provided or ``process_csv`` returned an error string.
    """
    # Clicking the button without an upload passes None; report it in the UI
    # instead of failing with AttributeError on `.name`.
    if file is None:
        return "Please upload a CSV file first."

    file_path = file if isinstance(file, str) else file.name
    df = process_csv(file_path)
    return df.to_html() if isinstance(df, pd.DataFrame) else df

def analyze_text(text):
    """Gradio callback for the "Analyze Text" button: return hybrid_model's label for ``text``."""
    return hybrid_model(text)

# Create Gradio interface with tabs for file upload and text input.
# Components are created inside the `with` block of the tab they belong to.
with gr.Blocks() as demo:
    gr.Markdown("# Financial Sentiment Analysis")
    # Tab 1: batch scoring of an uploaded CSV
    with gr.Tab("Upload CSV"):
        file_input = gr.File(label="Upload CSV", type="filepath")
        file_output = gr.HTML()  # receives the value returned by upload_page
        file_button = gr.Button("Analyze CSV")
        file_button.click(upload_page, inputs=file_input, outputs=file_output)
    # Tab 2: scoring of a single piece of text
    with gr.Tab("Analyze Text"):
        text_input = gr.Textbox(label="Enter text to analyze sentiment")
        text_output = gr.Textbox(label="Hybrid Sentiment")
        text_button = gr.Button("Analyze Text")
        text_button.click(analyze_text, inputs=text_input, outputs=text_output)

# Launch the interface (the recorded cell output shows Colab switching on a
# public share link automatically)
demo.launch()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://7857995c760ecb35eb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Download NLTK resources (stop-word list and WordNet data for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the FinBERT pipeline and tokenizer.
# NOTE: `tokenizer` is not referenced by any other code in this cell; it is
# kept so the cell still exposes the same module-level names.
pipe = pipeline("text-classification", model="ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Get the list of stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Preprocess text. Defined BEFORE the SVM is fitted so the training texts and
# the texts seen at inference time go through exactly the same cleaning.
def preprocess_text(text):
    """Normalize raw text before it is vectorized / classified.

    Steps: lowercase, drop URLs, drop @mentions and '#' characters, delete every
    remaining character that is neither ASCII-alphanumeric nor whitespace,
    remove English stop words, and lemmatize the remaining tokens.

    Args:
        text: Input string. Non-string values (e.g. NaN from an empty CSV cell)
            raise AttributeError on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Keep ALL whitespace (\s), not just the space character: the previous
    # pattern deleted newlines/tabs and glued neighbouring words together
    # ("rates\nrise" -> "ratesrise"). split() below handles any whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)


# Initialize vectorizer and SVM model
vectorizer = TfidfVectorizer()
svm_model = SVC(probability=True)

# Dummy data for SVM model initialization (for demonstration purposes)
# Replace this with actual training data
texts = ["stock market is bullish", "stock market is bearish", "neutral market today"]
labels = ["bullish", "bearish", "neutral"]

# Clean the dummy data with the same function hybrid_model applies at
# inference time, so the TF-IDF vocabulary matches what transform() will see.
preprocessed_texts = [preprocess_text(text) for text in texts]
X = vectorizer.fit_transform(preprocessed_texts)

# Keep the fitted encoder so integer predictions can be decoded later.
# LabelEncoder sorts class names, so the codes are
# bearish=0, bullish=1, neutral=2.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
svm_model.fit(X, y)

# Hybrid model for sentiment classification
def hybrid_model(text):
    """Classify ``text`` as "bearish", "neutral" or "bullish".

    The class probabilities of the TF-IDF/SVM model and of FinBERT are averaged
    and the class with the highest combined score wins.

    Args:
        text: Raw input string; it is cleaned with ``preprocess_text`` first.

    Returns:
        One of "bearish", "neutral", "bullish", or the string "Error" if any
        step raises (the exception is printed, not propagated).
    """
    try:
        # Preprocess the text
        preprocessed_text = preprocess_text(text)

        # SVM prediction. predict_proba columns follow svm_model.classes_, i.e.
        # the LabelEncoder codes, which are assigned in sorted (alphabetical)
        # order: 0=bearish, 1=bullish, 2=neutral. Reorder them to the
        # [bearish, neutral, bullish] layout used for FinBERT and sentiment_map;
        # without this the SVM's "bullish" and "neutral" scores are swapped.
        tfidf_text = vectorizer.transform([preprocessed_text])
        svm_raw = np.asarray(svm_model.predict_proba(tfidf_text)[0], dtype=float)
        preds_svm = svm_raw[[0, 2, 1]]

        # FinBERT prediction. top_k=None asks for the score of every label; the
        # default returns only the best label, which the normalisation below
        # would turn into a one-hot vector and discard the model's confidence.
        # truncation/max_length keep over-long inputs from raising.
        finbert_result = pipe(
            preprocessed_text, top_k=None, truncation=True, max_length=512
        )
        # Some pipeline versions wrap the per-label list in an outer list.
        if finbert_result and isinstance(finbert_result[0], list):
            finbert_result = finbert_result[0]

        finbert_index = {'neutral': 1, 'positive': 2}
        preds_finbert = np.zeros(3)
        for res in finbert_result:
            # Any label other than positive/neutral is counted as negative,
            # as in the original if/elif/else chain.
            preds_finbert[finbert_index.get(res['label'], 0)] += res['score']

        # Normalize FinBERT scores to sum to 1
        if preds_finbert.sum() > 0:
            preds_finbert /= preds_finbert.sum()

        # Hybrid prediction: plain average of the two probability vectors
        combined_preds = (preds_svm + preds_finbert) / 2
        sentiment_idx = int(np.argmax(combined_preds))

        # Map the prediction to sentiment labels
        sentiment_map = {0: "bearish", 1: "neutral", 2: "bullish"}
        return sentiment_map[sentiment_idx]
    except Exception as e:
        print(f"Error in hybrid_model: {e}")
        return "Error"

# Process the uploaded CSV file and generate sentiment distribution table
def process_csv(file_path):
    """Score every row of a CSV file and summarise the predicted sentiments.

    Args:
        file_path: Path of a CSV file that must contain a 'text' column.

    Returns:
        A 2-tuple ``(rows_html, summary_html)``: the full frame (with an added
        'predicted_sentiment' column) and a Sentiment/Count table, both as HTML.
        On failure the first element is an error message and the second is None.
    """
    try:
        frame = pd.read_csv(file_path)
        if 'text' in frame.columns:
            frame['predicted_sentiment'] = frame['text'].apply(hybrid_model)

            # Sentiment distribution: one row per predicted label
            counts = frame['predicted_sentiment'].value_counts()
            summary = counts.to_frame().reset_index()
            summary.columns = ['Sentiment', 'Count']

            return frame.to_html(), summary.to_html(index=False)

        return "CSV must contain a 'text' column for sentiment analysis.", None
    except Exception as e:
        print(f"Error in process_csv: {e}")
        return f"Error processing CSV: {e}", None

# Interface for file upload and text input
def upload_page(file):
    """Gradio callback for the "Analyze CSV" button.

    Args:
        file: Value delivered by the ``gr.File`` component: a path string, an
            object exposing the path as ``.name``, or None when the button is
            clicked before any file was uploaded.

    Returns:
        The ``(rows_html, summary_html)`` tuple produced by ``process_csv``, or
        ``(message, None)`` when no file was provided.
    """
    # Clicking the button without an upload passes None; report it in the UI
    # instead of failing with AttributeError on `.name`.
    if file is None:
        return "Please upload a CSV file first.", None

    file_path = file if isinstance(file, str) else file.name
    df_html, table_html = process_csv(file_path)
    return df_html, table_html

def analyze_text(text):
    """Gradio callback for the "Analyze Text" button: return hybrid_model's label for ``text``."""
    return hybrid_model(text)

# Create Gradio interface with tabs for file upload and text input.
# Components are created inside the `with` block of the tab they belong to.
with gr.Blocks() as demo:
    gr.Markdown("# Financial Sentiment Analysis")
    # Tab 1: batch scoring of an uploaded CSV
    with gr.Tab("Upload CSV"):
        file_input = gr.File(label="Upload CSV", type="filepath")
        file_output = gr.HTML()   # receives the first value returned by upload_page
        table_output = gr.HTML()  # receives the second value returned by upload_page
        file_button = gr.Button("Analyze CSV")
        file_button.click(upload_page, inputs=file_input, outputs=[file_output, table_output])
    # Tab 2: scoring of a single piece of text
    with gr.Tab("Analyze Text"):
        text_input = gr.Textbox(label="Enter text to analyze sentiment")
        text_output = gr.Textbox(label="Hybrid Sentiment")
        text_button = gr.Button("Analyze Text")
        text_button.click(analyze_text, inputs=text_input, outputs=text_output)

# Launch the interface (the recorded cell output shows Colab switching on a
# public share link automatically)
demo.launch()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://257dbfffb1834f2bdd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


