In [None]:
import pandas as pd
import unicodedata
from io import StringIO

def normalize_text_columns(df):
    """Transliterate every text column of *df* to plain ASCII, in place.

    Each string cell is decomposed with Unicode NFKD and then encoded to
    ASCII with ``errors="ignore"``, so accented letters lose their accents
    ("é" -> "e") and characters with no ASCII decomposition are dropped.
    Non-string cells (None, NaN, numbers, ...) are passed through untouched.

    Both ``object`` columns and columns using pandas' dedicated ``string``
    dtype are processed; selecting only ``object`` would silently skip the
    latter.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, to allow ``df = normalize_text_columns(df)``.
    """
    def _to_ascii(value):
        # Leave missing values and non-text cells exactly as they are.
        if not isinstance(value, str):
            return value
        decomposed = unicodedata.normalize("NFKD", value)
        return decomposed.encode("ascii", "ignore").decode("ascii")

    for col in df.select_dtypes(include=['object', 'string']).columns:
        df[col] = df[col].apply(_to_ascii)
    return df

# Read and fix decoding issues using open() + StringIO
def load_and_clean_csv(file_path):
    """Load a CSV that may contain invalid UTF-8 and return it ASCII-normalised.

    The file is decoded as UTF-8 with ``errors='replace'``, so undecodable
    bytes become replacement characters instead of raising. The decoded text
    is parsed by pandas from an in-memory buffer and the resulting frame is
    passed through ``normalize_text_columns``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame after text normalisation.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
        buffer = StringIO(handle.read())
    return normalize_text_columns(pd.read_csv(buffer))

# Function to check for encoding issues
def find_bad_encoding_line(file_path, encoding='utf-8'):
    """Report every line of a file that cannot be decoded with *encoding*.

    The file is opened in binary mode and each line is decoded on its own.
    Lines that raise ``UnicodeDecodeError`` are collected with their 1-based
    line number, the error text and the raw bytes, and a summary is printed
    to stdout. Nothing is returned.

    Args:
        file_path: Path of the file to scan.
        encoding: Codec used for the trial decode (default ``'utf-8'``).
    """
    failures = []
    with open(file_path, 'rb') as raw:
        for line_no, chunk in enumerate(raw, start=1):
            try:
                chunk.decode(encoding)
            except UnicodeDecodeError as exc:
                failures.append((line_no, str(exc), chunk))

    # Clean file: single confirmation line, then done.
    if not failures:
        print(f" All lines decoded successfully in '{file_path}'.")
        return

    print(f" Found {len(failures)} bad line(s) in '{file_path}':\n")
    for line_no, reason, chunk in failures:
        print(f" Line {line_no}: {reason}")
        print(f" Bytes: {chunk}\n")

# Input CSVs scanned by this cell (absolute paths under /content).
news_file = "/content/hul_only_et.csv"
trading_file = "/content/HUL_2019_to_2024.csv"

# Scan both input files for lines that are not valid UTF-8.
# NOTE(review): the messages below say "cleaned", but the paths checked here
# are the input files defined above, not the cleaned copies produced by the
# disabled code further down - confirm which wording is intended.
print("\nChecking bad encoding in cleaned news file...")
find_bad_encoding_line(news_file)

print("\nChecking bad encoding in cleaned trading file...")
find_bad_encoding_line(trading_file)

# --- Disabled: clean the files, save them, and re-check the cleaned copies ---
# NOTE(review): the output names below say "TCS" while the inputs above are
# HUL files - fix the names before re-enabling this section.

# Load and clean the files
# news_df = load_and_clean_csv(news_file)
# trading_df = load_and_clean_csv(trading_file)

# # Now that the data is cleaned, let's save it and check if the cleaned data has any encoding issues
# cleaned_news_file = "/content/Cleaned_TCS_all_news.csv"
# cleaned_trading_file = "/content/Cleaned_TCS_2019_to_2024.csv"

# # Save the cleaned data to new files
# news_df.to_csv(cleaned_news_file, index=False)
# trading_df.to_csv(cleaned_trading_file, index=False)

# # Check for bad encoding in the cleaned files
# print("\nChecking bad encoding in cleaned news file...")
# find_bad_encoding_line(cleaned_news_file)

# print("\nChecking bad encoding in cleaned trading file...")
# find_bad_encoding_line(cleaned_trading_file)


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from google.colab import drive, files

#  Step 2: Define File Paths
# Inputs handed to process_sentiment() by the main guard at the bottom of
# this cell: the news CSV and the trading-data CSV.
news_file = "/content/hul_only_et.csv"
trading_file = "/content/HUL_2019_to_2024.csv"

#  Step 3: Sentiment Analyzer Class
class SentimentAnalyzer:
    """Wrap a Hugging Face sentiment pipeline and turn labels into signed scores.

    Supported models are listed in ``self.model_configs``; only ``'finbert'``
    (checkpoint ``ProsusAI/finbert``) is configured.
    """

    def __init__(self, model_name="finbert"):
        """Load tokenizer, model and pipeline for *model_name*.

        Args:
            model_name: Key into ``self.model_configs`` (case-insensitive).

        Raises:
            KeyError: If *model_name* is not a configured model.
        """
        self.model_name = model_name
        self.model_configs = {
            'finbert': {
                'model': 'ProsusAI/finbert',
                'labels': ['positive', 'negative', 'neutral'],
            }
        }
        self.config = self.model_configs[model_name.lower()]

        checkpoint = self.config['model']
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.nlp = pipeline(
            "sentiment-analysis",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def analyze_batch(self, texts):
        """Score a list of texts.

        Args:
            texts: List of strings passed to the pipeline; inputs are
                truncated to 512 tokens.

        Returns:
            A list with one signed score per pipeline result
            (see ``_convert_to_score``).
        """
        with torch.no_grad():
            predictions = self.nlp(texts, truncation=True, max_length=512)
        return list(map(self._convert_to_score, predictions))

    def _convert_to_score(self, result):
        """Map one pipeline result to a signed score.

        Args:
            result: Mapping with a ``'label'`` string and a ``'score'`` value.

        Returns:
            ``0`` for a neutral label, minus the score for a negative label,
            and the score itself for any other label.
        """
        label = result['label'].lower()
        if label == 'neutral':
            return 0
        confidence = float(result['score'])
        if label == 'negative':
            return -confidence
        return confidence

#  Step 4: Process News & Trading Data
def process_sentiment(news_file, trading_file):
    """Score news with the sentiment model and attach daily sentiment to trading data.

    Pipeline:
      1. Load both CSVs and parse their date columns to midnight timestamps
         (unparseable dates become NaT).
      2. Score ``title + " " + summary`` of every news row.
      3. Sum the scores per calendar day.
      4. Roll each day's total forward to the first trading day on or after it.
      5. Squash the totals with ``tanh`` and damp them by the spread of the
         totals.
      6. Left-join the result onto the trading rows; trading days without any
         news get a sentiment of 0.

    Only the sentiment column is zero-filled. Missing values in the original
    trading columns are left as they are.

    Args:
        news_file: Path to the news CSV. Must contain the columns
            ``published_date``, ``title`` and ``summary``.
        trading_file: Path to the trading CSV. Must contain a ``Date`` column
            formatted like ``01-Jan-19`` (``%d-%b-%y``).

    Returns:
        The trading DataFrame with an added ``net_sentiment_score`` column.

    Side effects:
        Prints diagnostics and writes ``trading_data_with_net_sentiment.csv``
        and ``news_with_sentiment.csv`` to the current working directory.
    """
    # Load CSVs
    news_df = pd.read_csv(news_file)
    trading_df = pd.read_csv(trading_file)

    print("📅 Sample news dates:")
    print(news_df['published_date'].dropna().head(10).to_list())

    print("\n📅 Sample trading dates:")
    print(trading_df['Date'].dropna().head(10).to_list())

    # Parse both date columns and drop the time-of-day so they compare by day.
    news_df['published_date'] = pd.to_datetime(news_df['published_date'], errors='coerce').dt.normalize()
    trading_df['Date'] = pd.to_datetime(trading_df['Date'], format="%d-%b-%y", errors='coerce').dt.normalize()

    print("📅 Sample news dates (post‑parse):")
    print(news_df['published_date'].head(10).to_list())
    print(news_df['published_date'].dtype)

    print("\n📅 Sample trading dates (post‑parse):")
    print(trading_df['Date'].head(10).to_list())
    print(trading_df['Date'].dtype)

    # Initialize Sentiment Analyzer
    analyzer = SentimentAnalyzer()

    # Combine title and summary for sentiment analysis
    news_df['combined_text'] = news_df['title'].fillna('') + " " + news_df['summary'].fillna('')

    # Apply sentiment analysis
    news_df['sentiment_score'] = analyzer.analyze_batch(news_df['combined_text'].tolist())

    #  Step 5: Aggregate Sentiment Per Day
    # groupby drops rows whose date failed to parse (NaT keys).
    daily_sentiment = news_df.groupby('published_date')['sentiment_score'].sum().reset_index()
    daily_sentiment.columns = ['news_date', 'net_sentiment_score']

    print("📰 daily_sentiment:")
    print(daily_sentiment.head())
    print(daily_sentiment.dtypes)

    #  Step 6: Map Sentiment to the Next Available Trading Day
    sentiment_df = _map_to_next_trading_day(daily_sentiment, trading_df['Date'])

    #  Step 7: Normalize Sentiment with Volatility Adjustment
    sentiment_df = _dampen_sentiment(sentiment_df)

    # Merge with trading data. Zero-fill ONLY the sentiment column: a blanket
    # fillna(0) would also overwrite missing prices/volumes/dates with 0.
    trading_sentiment_df = trading_df.merge(sentiment_df, on='Date', how='left')
    trading_sentiment_df['net_sentiment_score'] = trading_sentiment_df['net_sentiment_score'].fillna(0)

    #  Step 8: Save the Final Dataset
    trading_sentiment_df.to_csv("trading_data_with_net_sentiment.csv", index=False)
    news_df[['published_date', 'combined_text', 'sentiment_score']].to_csv("news_with_sentiment.csv", index=False)

    print(" Sentiment analysis and mapping completed!")

    return trading_sentiment_df


def _map_to_next_trading_day(daily_sentiment, trading_dates):
    """Roll each news day's score forward to the first trading day on or after it.

    Args:
        daily_sentiment: Frame with ``news_date`` and ``net_sentiment_score``.
        trading_dates: Series of trading-day timestamps (NaT entries ignored).

    Returns:
        Frame with ``Date`` and ``net_sentiment_score``, one row per trading
        day that received at least one news day. News dated after the last
        trading day is dropped.
    """
    trading_days = pd.DatetimeIndex(trading_dates.dropna().unique()).sort_values()
    news_days = daily_sentiment['news_date'].to_numpy()

    # side='left' returns the position of the first trading day >= news day;
    # a position equal to len(trading_days) means there is no such day.
    positions = trading_days.searchsorted(news_days, side='left')
    has_next = positions < len(trading_days)

    rolled = pd.DataFrame({
        'Date': trading_days[positions[has_next]],
        'net_sentiment_score': daily_sentiment['net_sentiment_score'].to_numpy()[has_next],
    })
    # Several news days (e.g. a weekend) can land on the same trading day.
    return rolled.groupby('Date', as_index=False)['net_sentiment_score'].sum()


def _dampen_sentiment(sentiment_df):
    """Squash scores with tanh and scale them down by their overall spread.

    Args:
        sentiment_df: Frame with ``Date`` and ``net_sentiment_score``.

    Returns:
        Frame with the same two columns holding the adjusted scores; an empty
        input is returned unchanged.
    """
    if sentiment_df.empty:
        return sentiment_df

    std_sentiment = sentiment_df['net_sentiment_score'].std(ddof=0)
    # Equals 1 / (1 + std): the wider the spread, the stronger the damping.
    damping = 1 - (std_sentiment / (std_sentiment + 1))

    adjusted = sentiment_df[['Date']].copy()
    adjusted['net_sentiment_score'] = np.tanh(sentiment_df['net_sentiment_score']) * damping
    return adjusted

#  Step 9: Main Guard
# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    final_trading_df = process_sentiment(news_file, trading_file)

    #  Step 10: Download Output Files
    # `files` is imported from google.colab at the top of this cell, so this
    # step only works inside a Colab runtime. The two names are the CSVs
    # written by process_sentiment().
    files.download("trading_data_with_net_sentiment.csv")
    files.download("news_with_sentiment.csv")
