In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
sys.path.append(os.path.abspath(''))

In [2]:
last_uid_grok = 0
last_uid_gemini = 0
sentiment_file_path_grok = '../../data/sentiment/news_sentiment_results.csv'
sentiment_file_path_gemini = '../../data/sentiment/news_sentiment_results_gemini.csv'
sentiment_file_path_gemini_hb = '../../data/sentiment/news_sentiment_results_gemini_hb.csv'
processed_news_file_path = '../../data/news/nasdaq_news_data_processed.csv'

sentiment_df_grok = pd.read_csv(sentiment_file_path_grok)
sentiment_df_gemini = pd.read_csv(sentiment_file_path_gemini)
sentiment_df_gemini_hb = pd.read_csv(sentiment_file_path_gemini_hb)

last_uid_grok = sentiment_df_grok['UID'].max()
last_uid_gemini = sentiment_df_gemini['UID'].max()
print(f"Initialized last_uid_grok from existing file: {last_uid_grok}")
print(f"Initialized last_uid_gemini from existing file: {last_uid_gemini}")

print(f"Loading processed news data from {processed_news_file_path}...")
news_df = pd.read_csv(processed_news_file_path, parse_dates=['Date'])
print(f"Loaded {len(news_df)} news items.")


Initialized last_uid_grok from existing file: 20000
Initialized last_uid_gemini from existing file: 209125
Loading processed news data from ../../data/news/nasdaq_news_data_processed.csv...
Loaded 209125 news items.


In [3]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209125 entries, 0 to 209124
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   UID      209125 non-null  int64         
 1   Date     209125 non-null  datetime64[ns]
 2   Ticker   209125 non-null  object        
 3   Title    209125 non-null  object        
 4   Summary  209125 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 8.0+ MB


In [4]:
news_df.isnull().sum()

UID        0
Date       0
Ticker     0
Title      0
Summary    0
dtype: int64

In [5]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209125 entries, 0 to 209124
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   UID      209125 non-null  int64         
 1   Date     209125 non-null  datetime64[ns]
 2   Ticker   209125 non-null  object        
 3   Title    209125 non-null  object        
 4   Summary  209125 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 8.0+ MB


In [6]:
def check_alignment(sentiment_df, news_df):
    """
    Check if sentiment_df and news_df are aligned by UID, Date, and Ticker.
    Handles different date formats by comparing only year-month-day.
    
    Parameters:
    -----------
    sentiment_df : DataFrame
        The sentiment dataframe (smaller one) with Date as year-month-day
    news_df : DataFrame
        The news dataframe (larger one) with Date as datetime including time
    
    Returns:
    --------
    dict
        Dictionary containing alignment stats and mismatches
    """
    # Get common UIDs
    common_uids = set(sentiment_df['UID']).intersection(set(news_df['UID']))
    
    print(f"Total UIDs in sentiment_df: {len(sentiment_df['UID'])}")
    print(f"Common UIDs between dataframes: {len(common_uids)}")
    
    # Check for UIDs in sentiment_df not found in news_df
    missing_uids = set(sentiment_df['UID']) - set(news_df['UID'])
    if missing_uids:
        print(f"Warning: {len(missing_uids)} UIDs in sentiment_df not found in news_df")
    
    # Initialize results
    results = {
        'total_sentiment_uids': len(sentiment_df['UID']),
        'common_uids': len(common_uids),
        'missing_uids': list(missing_uids)[:10] if missing_uids else [],  # Show first 10 only
        'date_mismatches': [],
        'ticker_mismatches': [],
        'both_mismatches': []
    }
    
    # Check alignment for common UIDs
    for uid in common_uids:
        sentiment_row = sentiment_df[sentiment_df['UID'] == uid].iloc[0]
        news_row = news_df[news_df['UID'] == uid].iloc[0]
        
        # Convert news_df datetime to date only for comparison
        if isinstance(news_row['Date'], pd.Timestamp):
            news_date = news_row['Date'].date()
        else:
            # Try to convert string to datetime and then get date part
            try:
                news_date = pd.to_datetime(news_row['Date']).date()
            except:
                news_date = news_row['Date']  # Keep as is if conversion fails
        
        # Convert sentiment_df date if needed
        if not isinstance(sentiment_row['Date'], pd.Timestamp) and not hasattr(sentiment_row['Date'], 'date'):
            try:
                sentiment_date = pd.to_datetime(sentiment_row['Date']).date()
            except:
                sentiment_date = sentiment_row['Date']  # Keep as is if conversion fails
        else:
            sentiment_date = sentiment_row['Date'].date() if hasattr(sentiment_row['Date'], 'date') else sentiment_row['Date']
        
        # Compare date and ticker
        date_match = str(sentiment_date) == str(news_date)
        ticker_match = sentiment_row['Ticker'] == news_row['Ticker']
        
        if not date_match and not ticker_match:
            results['both_mismatches'].append({
                'UID': uid,
                'sentiment_date': sentiment_date,
                'news_date': news_date,
                'sentiment_ticker': sentiment_row['Ticker'],
                'news_ticker': news_row['Ticker']
            })
        elif not date_match:
            results['date_mismatches'].append({
                'UID': uid,
                'sentiment_date': sentiment_date,
                'news_date': news_date
            })
        elif not ticker_match:
            results['ticker_mismatches'].append({
                'UID': uid,
                'sentiment_ticker': sentiment_row['Ticker'],
                'news_ticker': news_row['Ticker']
            })
    
    # Summary stats
    results['date_mismatch_count'] = len(results['date_mismatches'])
    results['ticker_mismatch_count'] = len(results['ticker_mismatches'])
    results['both_mismatch_count'] = len(results['both_mismatches'])
    results['total_mismatches'] = results['date_mismatch_count'] + results['ticker_mismatch_count'] + results['both_mismatch_count']
    results['alignment_percentage'] = 100 * (len(common_uids) - results['total_mismatches']) / len(common_uids) if common_uids else 0
    
    print(f"\nAlignment Results:")
    print(f"Date mismatches: {results['date_mismatch_count']}")
    print(f"Ticker mismatches: {results['ticker_mismatch_count']}")
    print(f"Both Date and Ticker mismatches: {results['both_mismatch_count']}")
    print(f"Total mismatches: {results['total_mismatches']}")
    print(f"Alignment percentage: {results['alignment_percentage']:.2f}%")
    
    return results

# Example usage:
results = check_alignment(sentiment_df_gemini, news_df)

Total UIDs in sentiment_df: 142028
Common UIDs between dataframes: 142028

Alignment Results:
Date mismatches: 0
Ticker mismatches: 0
Both Date and Ticker mismatches: 0
Total mismatches: 0
Alignment percentage: 100.00%


In [8]:
def get_news_uids_by_years(df, years=[2023]):
    """
    Find UIDs for news data from specified years.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing news data with 'Date' and 'UID' columns
    years : list, default [2022, 2023]
        List of years to filter data by
        
    Returns:
    --------
    dict : Dictionary with years as keys and lists of UIDs as values
    """
    # Make sure Date column is in datetime format
    if not pd.api.types.is_datetime64_dtype(df['Date']):
        df = df.copy()
        df['Date'] = pd.to_datetime(df['Date'])
    
    # Extract year from Date
    df['Year'] = df['Date'].dt.year
    
    # Create result dictionary
    result = {}
    
    # Get UIDs for each year
    for year in years:
        year_uids = df[df['Year'] == year]['UID'].tolist()
        result[year] = year_uids
        print(f"Found {len(year_uids)} news items from {year}")
    
    return result


uids_by_year = get_news_uids_by_years(news_df)
    
uids_by_year[2023]

Found 63327 news items from 2023


[145799,
 145800,
 145801,
 145802,
 145803,
 145804,
 145805,
 145806,
 145807,
 145808,
 145809,
 145810,
 145811,
 145812,
 145813,
 145814,
 145815,
 145816,
 145817,
 145818,
 145819,
 145820,
 145821,
 145822,
 145823,
 145824,
 145825,
 145826,
 145827,
 145828,
 145829,
 145830,
 145831,
 145832,
 145833,
 145834,
 145835,
 145836,
 145837,
 145838,
 145839,
 145840,
 145841,
 145842,
 145843,
 145844,
 145845,
 145846,
 145847,
 145848,
 145849,
 145850,
 145851,
 145852,
 145853,
 145854,
 145855,
 145856,
 145857,
 145858,
 145859,
 145860,
 145861,
 145862,
 145863,
 145864,
 145865,
 145866,
 145867,
 145868,
 145869,
 145870,
 145871,
 145872,
 145873,
 145874,
 145875,
 145876,
 145877,
 145878,
 145879,
 145880,
 145881,
 145882,
 145883,
 145884,
 145885,
 145886,
 145887,
 145888,
 145889,
 145890,
 145891,
 145892,
 145893,
 145894,
 145895,
 145896,
 145897,
 145898,
 145899,
 145900,
 145901,
 145902,
 145903,
 145904,
 145905,
 145906,
 145907,
 145908,
 145909,
 

In [None]:
def analyze_daily_unique_tickers_per_year(df, date_column = 'Date', ticker_column = 'Ticker'):
    """
    Calculates the average number of unique tickers with news per day, for each year.

    Args:
        df (pd.DataFrame): The input DataFrame.
        date_column (str): The name of the column containing the dates.
        ticker_column (str): The name of the column containing the tickers.

    Returns:
        dict: A dictionary where keys are the years (int) and values are the
              average number of unique tickers appearing per day in that year.
              Returns None if a critical error occurs, or an empty dict if no valid data.
    """
    # Make a copy to avoid modifying the original DataFrame
    df_copy = df.copy()

    # Convert the date column to datetime objects, coercing errors
    df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')

    # Drop rows where date conversion failed or ticker is missing
    initial_rows = len(df_copy)
    df_copy.dropna(subset=[date_column, ticker_column], inplace=True)
    dropped_rows = initial_rows - len(df_copy)
    if dropped_rows > 0:
        print(f"Warning: Dropped {dropped_rows} rows with invalid dates or missing tickers.")

    if df_copy.empty:
        print("DataFrame is empty after handling invalid dates/tickers.")
        return {}

    # Extract year and date
    df_copy['Year'] = df_copy[date_column].dt.year
    df_copy['DateOnly'] = df_copy[date_column].dt.date

    # --- Calculate Number of Unique Tickers Per Day ---
    # Group by Year and DateOnly, then count the number of unique tickers within each day's group.
    unique_tickers_per_day = df_copy.groupby(['Year', 'DateOnly'])[ticker_column].nunique()

    if unique_tickers_per_day.empty:
        print("No valid data found to group by Year and Date.")
        return {}

    # --- Calculate the Average of Daily Unique Ticker Counts per Year ---
    # Group the daily unique counts by Year and calculate the mean.
    avg_unique_tickers_per_day_by_year = unique_tickers_per_day.groupby('Year').mean()

    yearly_results = {}
    print("\n--- Average Unique Tickers Per Day By Year ---")
    for year, avg_count in avg_unique_tickers_per_day_by_year.items():
        yearly_results[year] = avg_count
        print(f"  Year {year}: {avg_count:.2f} unique tickers per day (on average)")

    return yearly_results

In [None]:
analyze_daily_unique_tickers_per_year(sentiment_df, 'Date', 'Ticker')

In [17]:
def get_sentiment_direction(sentiment):
    """Determine the direction of a sentiment value."""
    sentiment_mapping = {
        "Strongly Bearish": -3, "Bearish": -2, "Slightly Bearish": -1,
        "Neutral": 0,
        "Slightly Bullish": 1, "Bullish": 2, "Strongly Bullish": 3
    }
    
    if isinstance(sentiment, int) and 1 <= sentiment <= 7:
        # Convert 1-7 scale to direction
        if sentiment < 4:
            return "Bearish"
        elif sentiment == 4:
            return "Neutral"
        else:
            return "Bullish"
    elif isinstance(sentiment, str) and sentiment in sentiment_mapping:
        # String sentiment
        if sentiment_mapping[sentiment] < 0:
            return "Bearish"
        elif sentiment_mapping[sentiment] == 0:
            return "Neutral"
        else:
            return "Bullish"
    
    # Return None if sentiment format is unexpected
    return None

def compare_sentiment_in_range(df1: pd.DataFrame, df2: pd.DataFrame, 
                               start_uid: int, end_uid: int,
                               uid_col: str = 'UID',
                               date_col: str = 'Date',
                               ticker_col: str = 'Ticker',
                               sentiment_col: str = 'Sentiment',
                               df1_name: str = 'Model1', 
                               df2_name: str = 'Model2') -> dict:
    """
    Compares two sentiment dataframes within a specified UID range,
    checking if the Date and Ticker values match for each common UID.
    
    Parameters:
    -----------
    df1 : DataFrame
        First sentiment dataframe
    df2 : DataFrame
        Second sentiment dataframe
    start_uid : int
        Start of UID range to evaluate
    end_uid : int
        End of UID range to evaluate
    uid_col : str
        Name of the UID column
    date_col : str
        Name of the Date column
    ticker_col : str
        Name of the Ticker column
    sentiment_col : str
        Name of the Sentiment column
    df1_name : str
        Name identifier for the first dataframe
    df2_name : str
        Name identifier for the second dataframe
    
    Returns:
    --------
    dict
        Dictionary containing alignment stats and mismatches
    """
    # Filter dataframes by UID range
    df1_filtered = df1[(df1[uid_col] >= start_uid) & (df1[uid_col] <= end_uid)].copy()
    df2_filtered = df2[(df2[uid_col] >= start_uid) & (df2[uid_col] <= end_uid)].copy()
    
    print(f"UID Range Analysis: {start_uid} to {end_uid}")
    print(f"Records in {df1_name}: {len(df1_filtered)}")
    print(f"Records in {df2_name}: {len(df2_filtered)}")
    
    # Find common UIDs in the range
    common_uids = set(df1_filtered[uid_col]).intersection(set(df2_filtered[uid_col]))
    print(f"Common UIDs between dataframes in range: {len(common_uids)}")
    
    # Check for UIDs in range present in one df but not the other
    only_in_df1 = set(df1_filtered[uid_col]) - set(df2_filtered[uid_col])
    only_in_df2 = set(df2_filtered[uid_col]) - set(df1_filtered[uid_col])
    
    if only_in_df1:
        print(f"UIDs present only in {df1_name}: {len(only_in_df1)}")
    if only_in_df2:
        print(f"UIDs present only in {df2_name}: {len(only_in_df2)}")
    
    # Initialize results
    results = {
        'uid_range': {'start': start_uid, 'end': end_uid},
        'total_records': {df1_name: len(df1_filtered), df2_name: len(df2_filtered)},
        'common_uids': len(common_uids),
        'only_in_first_df': list(only_in_df1)[:10] if only_in_df1 else [],  # First 10 examples
        'only_in_second_df': list(only_in_df2)[:10] if only_in_df2 else [],  # First 10 examples
        'date_mismatches': [],
        'ticker_mismatches': [],
        'both_mismatches': [],
        'sentiment_comparison': {}
    }
    
    # Check alignment for common UIDs
    date_mismatch_count = 0
    ticker_mismatch_count = 0
    both_mismatch_count = 0
    exact_match_count = 0
    
    # Fine-grained sentiment agreement categories
    exact_agreement = 0
    moderate_agreement = 0
    slight_disagreement = 0
    disagreement = 0
    unknown_comparison = 0
    sentiment_comparison_examples = {
        'exact_agreement': [],
        'moderate_agreement': [],
        'slight_disagreement': [],
        'disagreement': []
    }
    
    for uid in common_uids:
        row1 = df1_filtered[df1_filtered[uid_col] == uid].iloc[0]
        row2 = df2_filtered[df2_filtered[uid_col] == uid].iloc[0]
        
        # Convert dates to compatible format for comparison
        if isinstance(row1[date_col], pd.Timestamp) or isinstance(row2[date_col], pd.Timestamp):
            date1 = pd.to_datetime(row1[date_col]).date() if hasattr(pd.to_datetime(row1[date_col]), 'date') else pd.to_datetime(row1[date_col])
            date2 = pd.to_datetime(row2[date_col]).date() if hasattr(pd.to_datetime(row2[date_col]), 'date') else pd.to_datetime(row2[date_col])
            date_match = str(date1) == str(date2)
        else:
            date_match = str(row1[date_col]) == str(row2[date_col])
            
        ticker_match = row1[ticker_col] == row2[ticker_col]
        
        # Check sentiment agreement if metadata matches
        if date_match and ticker_match:
            exact_match_count += 1
            
            # Compare sentiment if both have it
            if sentiment_col in row1.index and sentiment_col in row2.index:
                sentiment1 = row1[sentiment_col]
                sentiment2 = row2[sentiment_col]
                
                # Exact match
                if sentiment1 == sentiment2:
                    exact_agreement += 1
                    sentiment_comparison_examples['exact_agreement'].append({
                        'UID': uid,
                        f'{df1_name}_sentiment': sentiment1,
                        f'{df2_name}_sentiment': sentiment2,
                        'category': 'Exact agreement'
                    })
                else:
                    # Determine directions
                    direction1 = get_sentiment_direction(sentiment1)
                    direction2 = get_sentiment_direction(sentiment2)
                    
                    if direction1 is None or direction2 is None:
                        unknown_comparison += 1
                    # Disagreement: opposite directions
                    elif (direction1 == "Bearish" and direction2 == "Bullish") or (direction1 == "Bullish" and direction2 == "Bearish"):
                        disagreement += 1
                        sentiment_comparison_examples['disagreement'].append({
                            'UID': uid,
                            f'{df1_name}_sentiment': sentiment1,
                            f'{df2_name}_sentiment': sentiment2,
                            'category': 'Disagreement (opposite directions)'
                        })
                    # Slight disagreement: neutral vs bullish/bearish
                    elif (direction1 == "Neutral" and direction2 in ["Bullish", "Bearish"]) or (direction2 == "Neutral" and direction1 in ["Bullish", "Bearish"]):
                        slight_disagreement += 1
                        sentiment_comparison_examples['slight_disagreement'].append({
                            'UID': uid,
                            f'{df1_name}_sentiment': sentiment1,
                            f'{df2_name}_sentiment': sentiment2,
                            'category': 'Slight disagreement (neutral vs directional)'
                        })
                    # Moderate agreement: same direction but different intensity
                    elif direction1 == direction2:
                        moderate_agreement += 1
                        sentiment_comparison_examples['moderate_agreement'].append({
                            'UID': uid,
                            f'{df1_name}_sentiment': sentiment1,
                            f'{df2_name}_sentiment': sentiment2,
                            'category': 'Moderate agreement (same direction)'
                        })
        
        # Track mismatches
        if not date_match and not ticker_match:
            both_mismatch_count += 1
            results['both_mismatches'].append({
                'UID': uid,
                f'{df1_name}_date': row1[date_col],
                f'{df2_name}_date': row2[date_col],
                f'{df1_name}_ticker': row1[ticker_col],
                f'{df2_name}_ticker': row2[ticker_col]
            })
        elif not date_match:
            date_mismatch_count += 1
            results['date_mismatches'].append({
                'UID': uid,
                f'{df1_name}_date': row1[date_col],
                f'{df2_name}_date': row2[date_col]
            })
        elif not ticker_match:
            ticker_mismatch_count += 1
            results['ticker_mismatches'].append({
                'UID': uid,
                f'{df1_name}_ticker': row1[ticker_col],
                f'{df2_name}_ticker': row2[ticker_col]
            })
    
    # Summary stats
    total_mismatches = date_mismatch_count + ticker_mismatch_count + both_mismatch_count
    alignment_percentage = 100 * exact_match_count / len(common_uids) if common_uids else 0
    
    results['summary'] = {
        'exact_matches': exact_match_count,
        'date_mismatches': date_mismatch_count,
        'ticker_mismatches': ticker_mismatch_count,
        'both_mismatches': both_mismatch_count,
        'total_mismatches': total_mismatches,
        'alignment_percentage': alignment_percentage
    }
    
    # Calculate total sentiment comparisons
    total_sentiment_comparisons = exact_agreement + moderate_agreement + slight_disagreement + disagreement + unknown_comparison
    
    if total_sentiment_comparisons > 0:
        results['sentiment_comparison'] = {
            'exact_agreement': exact_agreement,
            'moderate_agreement': moderate_agreement,
            'slight_disagreement': slight_disagreement,
            'disagreement': disagreement,
            'unknown': unknown_comparison,
            'total': total_sentiment_comparisons,
            'examples': sentiment_comparison_examples
        }
    
    # Print summary
    print(f"\nAlignment Results:")
    print(f"Exact matches: {exact_match_count} ({alignment_percentage:.2f}%)")
    print(f"Date mismatches: {date_mismatch_count}")
    print(f"Ticker mismatches: {ticker_mismatch_count}")
    print(f"Both Date and Ticker mismatches: {both_mismatch_count}")
    print(f"Total mismatches: {total_mismatches}")
    
    if total_sentiment_comparisons > 0:
        print(f"\nSentiment Comparison (for matched records):")
        print(f"Exact agreement: {exact_agreement} ({100*exact_agreement/total_sentiment_comparisons:.2f}%)")
        print(f"Moderate agreement (same direction): {moderate_agreement} ({100*moderate_agreement/total_sentiment_comparisons:.2f}%)")
        print(f"Slight disagreement (neutral vs directional): {slight_disagreement} ({100*slight_disagreement/total_sentiment_comparisons:.2f}%)")
        print(f"Disagreement (opposite directions): {disagreement} ({100*disagreement/total_sentiment_comparisons:.2f}%)")
        if unknown_comparison > 0:
            print(f"Unknown comparisons: {unknown_comparison}")
    
    # Show sample mismatches if any
    if results['date_mismatches']:
        print("\nSample Date mismatches:")
        for i, mismatch in enumerate(results['date_mismatches'][:3]):
            print(f"  {i+1}. UID: {mismatch['UID']}, {df1_name} Date: {mismatch[f'{df1_name}_date']}, {df2_name} Date: {mismatch[f'{df2_name}_date']}")
    
    if results['ticker_mismatches']:
        print("\nSample Ticker mismatches:")
        for i, mismatch in enumerate(results['ticker_mismatches'][:3]):
            print(f"  {i+1}. UID: {mismatch['UID']}, {df1_name} Ticker: {mismatch[f'{df1_name}_ticker']}, {df2_name} Ticker: {mismatch[f'{df2_name}_ticker']}")
    
    # Show sample sentiment comparisons
    for category, examples in sentiment_comparison_examples.items():
        if examples:
            print(f"\nSample {category.replace('_', ' ').title()}:")
            for i, example in enumerate(examples[:2]):  # Show only 2 examples per category
                print(f"  {i+1}. UID: {example['UID']}, {df1_name}: {example[f'{df1_name}_sentiment']}, {df2_name}: {example[f'{df2_name}_sentiment']}")
    
    return results



In [18]:
# %% 
# Example usage:
results = compare_sentiment_in_range(sentiment_df_gemini_hb, sentiment_df_gemini, 
                                    start_uid=1, end_uid=1000, 
                                    df1_name='Sentiment', df2_name='News')

UID Range Analysis: 1 to 1000
Records in Sentiment: 1000
Records in News: 1000
Common UIDs between dataframes in range: 1000

Alignment Results:
Exact matches: 1000 (100.00%)
Date mismatches: 0
Ticker mismatches: 0
Both Date and Ticker mismatches: 0
Total mismatches: 0

Sentiment Comparison (for matched records):
Exact agreement: 587 (58.70%)
Moderate agreement (same direction): 166 (16.60%)
Slight disagreement (neutral vs directional): 171 (17.10%)
Disagreement (opposite directions): 16 (1.60%)
Unknown comparisons: 60

Sample Exact Agreement:
  1. UID: 1, Sentiment: Bullish, News: Bullish
  2. UID: 3, Sentiment: Neutral, News: Neutral

Sample Moderate Agreement:
  1. UID: 6, Sentiment: Slightly Bullish, News: Bullish
  2. UID: 9, Sentiment: Bullish, News: Slightly Bullish

Sample Slight Disagreement:
  1. UID: 2, Sentiment: Bullish, News: Neutral
  2. UID: 8, Sentiment: Neutral, News: Slightly Bullish

Sample Disagreement:
  1. UID: 29, Sentiment: Slightly Bearish, News: Slightly Bull

In [19]:
sentiment_df_gemini_hb[sentiment_df_gemini_hb['Sentiment'].isnull()]


Unnamed: 0,UID,Date,Ticker,Sentiment,Reason
1020,1021,2015-02-09,NTES,,
1021,1022,2015-02-09,VRSN,,
1022,1023,2015-02-09,AKAM,,
1023,1024,2015-02-10,PEP,,
1024,1025,2015-02-10,NTES,,
...,...,...,...,...,...
43194,43195,2018-04-27,INTC,,
43195,43196,2018-04-27,ADI,,
43196,43197,2018-04-28,PYPL,,
43197,43198,2018-04-28,EBAY,,


In [27]:
sentiment_df_gemini['Date'] = pd.to_datetime(sentiment_df_gemini['Date'], errors='coerce')
sentiment_2022_df = sentiment_df_gemini[sentiment_df_gemini['Date'].dt.year == 2022]
sentiment_2022_df[sentiment_2022_df['Sentiment'].isnull()]


Unnamed: 0,UID,Date,Ticker,Sentiment,Reason
40280,107038,2022-01-04,NVDA,,
40281,107039,2022-01-04,ZS,,
40282,107040,2022-01-04,INTC,,
40283,107041,2022-01-04,INTC,,
40284,107042,2022-01-04,EXC,,
...,...,...,...,...,...
77206,144114,2022-12-15,TMUS,,
77207,144115,2022-12-15,AMGN,,
77208,144116,2022-12-15,AMGN,,
77209,144117,2022-12-15,PDD,,


In [28]:
sentiment_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38731 entries, 40100 to 78830
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   UID        38731 non-null  int64         
 1   Date       38731 non-null  datetime64[ns]
 2   Ticker     38731 non-null  object        
 3   Sentiment  37621 non-null  object        
 4   Reason     37621 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 1.8+ MB


In [23]:
sentiment_df_gemini.tail()

Unnamed: 0,UID,Date,Ticker,Sentiment,Reason
78826,145734,2022-12-30,AEP,Slightly Bearish,American Electric Power closed slightly lower ...
78827,145735,2022-12-30,TSLA,Bearish,Tesla's self-driving system requires a human b...
78828,145736,2022-12-30,TSLA,Bullish,Tesla's shares surged after Elon Musk reassure...
78829,145737,2022-12-30,BIIB,Neutral,Biogen saw high options trading volume.
78830,145738,2022-12-30,TSLA,Neutral,Tesla was among the best performing components...
