In [1]:
import pandas as pd
import joblib
from sentence_transformers import SentenceTransformer, util
import numpy as np
import warnings

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# --- Load all necessary artifacts and data ---
# Populates the notebook-level names used by the scoring functions:
# `embedding_model`, `model_1`, `advisor_df` and `press_df`.
print("Loading models and data...")
try:
    # Load the sentence transformer model
    # This will download the model the first time it's run
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load our trained Model 1
    # NOTE(review): joblib.load unpickles the file, so only load artifacts
    # that come from a trusted source.
    model_1 = joblib.load('model_data/model_1_market_financial.joblib')

    # Load the data files
    advisor_df = pd.read_csv('data/advisor_data_labeled.csv', parse_dates=['date'])
    press_df = pd.read_csv('data/press_release_data_labeled.csv', parse_dates=['date'])

    # Replace missing release texts with '' so every value is a string
    press_df['press_release_text'] = press_df['press_release_text'].fillna('')

except FileNotFoundError as e:
    print(f"Error loading artifact: {e}. Please ensure all model and data files are in the directory.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down (losing the traceback), and in a script it would terminate
    # with a success status even though loading failed.
    raise

Loading models and data...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [4]:
def get_contradiction_score(post_text, company, post_date):
    """Score how far a post diverges from the company's latest press release.

    The most recent row of ``press_df`` for ``company`` dated on or before
    ``post_date`` is selected, both texts are embedded with
    ``embedding_model`` and their cosine similarity is mapped to a score with
    ``(1 - similarity) / 2``.

    Args:
        post_text: Text of the social media post.
        company: Company name, matched exactly against ``press_df['company']``.
        post_date: Timestamp compared against ``press_df['date']``.

    Returns:
        A float in ``[0, 1]``; ``0.5`` (neutral) when there is nothing
        meaningful to compare, i.e. no release on or before ``post_date`` or
        either text is missing/blank.
    """
    neutral_score = 0.5

    # Only releases published on or before the post can be contradicted by it.
    candidates = press_df[
        (press_df['company'] == company) &
        (press_df['date'] <= post_date)
    ]
    if candidates.empty:
        return neutral_score

    latest_release_text = candidates.sort_values(by='date', ascending=False).iloc[0]['press_release_text']

    # Missing release texts are stored as '' by the loading cell; embedding an
    # empty string would give an arbitrary similarity, so treat it as "no data".
    for text in (post_text, latest_release_text):
        if not isinstance(text, str) or not text.strip():
            return neutral_score

    # Generate embeddings for both texts
    post_embedding = embedding_model.encode(post_text, convert_to_tensor=True)
    release_embedding = embedding_model.encode(latest_release_text, convert_to_tensor=True)

    # Calculate cosine similarity
    cosine_similarity = util.pytorch_cos_sim(post_embedding, release_embedding).item()

    # Map the [-1, 1] similarity range onto a [0, 1] contradiction range.
    # Floating-point error can push the similarity slightly past +/-1, so
    # clamp the result to keep the documented range.
    contradiction_score = (1 - cosine_similarity) / 2
    return min(1.0, max(0.0, contradiction_score))

In [5]:
def get_advisor_risk(company):
    """Return a risk score derived from the advisor status on file for ``company``.

    Only the first row of ``advisor_df`` whose ``company`` column equals
    ``company`` is consulted. A company with no rows, or a status that is not
    in the table below, yields the neutral score ``0.5``.
    """
    rows_for_company = advisor_df.loc[advisor_df['company'] == company]
    if rows_for_company.empty:
        # Nothing known about this company's advisor.
        return 0.5

    first_status = rows_for_company.iloc[0]['advisor_status']

    # Status -> risk, listed from lowest to highest risk.
    score_by_status = dict([
        ('Active', 0.1),
        ('Not Found', 0.5),
        ('Resigned', 0.7),
        ('Under Investigation', 0.8),
        ('Terminated', 0.9),
    ])

    try:
        return score_by_status[first_status]
    except KeyError:
        # Unrecognised status: fall back to neutral.
        return 0.5

In [6]:
def validate_social_media_post(post_text, company, date_str):
    """Score a social media post and print a step-by-step report.

    Combines two signals into one "genuinity" score:
    ``0.6 * contradiction_score + 0.4 * advisor_risk``, where the first comes
    from ``get_contradiction_score`` and the second from ``get_advisor_risk``.
    A verdict line is printed: above 0.65 is "high", above 0.4 is "moderate",
    anything else is "low".

    Args:
        post_text: Text of the post.
        company: Company the post is about.
        date_str: Post date; parsed with ``pd.to_datetime``.

    Returns:
        The combined genuinity score as a float.
    """
    parsed_date = pd.to_datetime(date_str)

    print(f"\n--- Validating Post for '{company}' on {date_str} ---")
    print(f"Post Text: \"{post_text}\"")

    # Signal 1: disagreement with the latest official statement.
    contradiction = get_contradiction_score(post_text, company, parsed_date)
    print(f"Contradiction with Press Release: {contradiction:.2f}")

    # Signal 2: risk implied by the advisor's status.
    advisor = get_advisor_risk(company)
    print(f"Advisor Risk Score: {advisor:.2f}")

    # Weighted blend; the weights express how much each signal is trusted.
    contradiction_weight, advisor_weight = 0.6, 0.4
    genuinity_score = (contradiction_weight * contradiction) + (advisor_weight * advisor)

    print(f"--- FINAL GENUINITY SCORE: {genuinity_score:.2f} ---")

    verdict = (
        "Result: High likelihood the post is a genuine warning."
        if genuinity_score > 0.65
        else "Result: Moderate likelihood. Worth monitoring."
        if genuinity_score > 0.4
        else "Result: Low likelihood. May be unsubstantiated rumor."
    )
    print(verdict)

    return genuinity_score

In [None]:
# --- Example Usage ---
if __name__ == '__main__':
    example_posts = [
        # Example 1 (intended scenario): a negative post about a company
        # with a terminated advisor.
        dict(
            post_text="This company is hiding massive losses, it's a total scam!",
            company="Innovate Corp",
            date_str="2023-09-12",
        ),
        # Example 2 (intended scenario): a negative post about a company with
        # an active advisor and a positive press release.
        dict(
            post_text="Source tells me their product is failing and they are faking sales numbers.",
            company="FutureTech",
            date_str="2023-09-05",
        ),
    ]

    # Each example prints its own report; the returned score is not kept.
    for example in example_posts:
        validate_social_media_post(**example)

In [28]:
import pandas as pd
import joblib
from sentence_transformers import SentenceTransformer, util
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [29]:
# --- Load all necessary artifacts and data ---
# Populates the notebook-level names used by the functions below:
# `embedding_model`, `model_1`, `model_1_features`, `advisor_df`, `press_df`,
# `market_df` and `financial_df`.
print("Loading models and data...")
try:
    # Load models
    # NOTE(review): joblib.load unpickles the files, so only load artifacts
    # that come from a trusted source.
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    model_1 = joblib.load('model_data/model_1_market_financial.joblib')
    model_1_features = joblib.load('model_data/model_1_market_financial_features.joblib')

    # Load data files
    advisor_df = pd.read_csv('data/advisor_data_labeled.csv', parse_dates=['date'])
    press_df = pd.read_csv('data/press_release_data_labeled.csv', parse_dates=['date'])
    market_df = pd.read_csv('data/market_data_labeled.csv', parse_dates=['date'])
    financial_df = pd.read_csv('data/financial_data_labeled.csv', parse_dates=['date'])

    # Replace missing release texts with '' so every value is a string
    press_df['press_release_text'] = press_df['press_release_text'].fillna('')

except FileNotFoundError as e:
    print(f"Error loading artifact: {e}.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down (losing the traceback), and in a script it would terminate
    # with a success status even though loading failed.
    raise

Loading models and data...


In [30]:
# --- Recreate the same feature engineering from Model 1 training ---
def create_model_1_features(company, post_date):
    """Build the single-row feature frame Model 1 expects for a company and date.

    Market rows for ``company`` dated on or before ``post_date`` are sorted
    chronologically, joined with the company's financial record, and rolling
    z-scores (30-row window, at least 5 observations) are added for
    ``price_change_1d``, ``volume_spike`` and ``abnormal_return`` when those
    columns exist. The row whose date equals ``post_date`` is then projected
    onto ``model_1_features``.

    Args:
        company: Company name, matched exactly against the ``company`` columns
            of ``market_df`` and ``financial_df``.
        post_date: Date of the post; anything ``pd.to_datetime`` accepts.

    Returns:
        A one-row DataFrame whose columns are exactly ``model_1_features``
        (missing values and missing columns are filled with 0), or ``None``
        when the company has no market history up to ``post_date``, no
        financial record, or no market row dated exactly ``post_date``.
    """
    post_date = pd.to_datetime(post_date)

    # Use history up to and including the post date. Selecting rows *after*
    # the post would leak future data and would leave the post-date row at the
    # start of the rolling window, where its z-score is always NaN.
    history = market_df[(market_df['company'] == company) & (market_df['date'] <= post_date)]
    company_financials = financial_df[financial_df['company'] == company]
    if history.empty or company_financials.empty:
        return None

    # Rolling statistics are order dependent, so sort chronologically
    # (stable sort keeps the file order of same-day rows).
    history = history.sort_values(by='date', kind='mergesort').reset_index(drop=True)

    # Join exactly one financial record so market rows are not duplicated by
    # the company-only merge key (duplicates would distort the rolling stats).
    # NOTE(review): when a company has several financial rows the first one in
    # file order is used, which is the row the previous implementation ended
    # up selecting; confirm whether the latest record should be used instead.
    financial_row = company_financials.drop(columns=['date'], errors='ignore').iloc[0:1]
    merged = pd.merge(history, financial_row, on='company', how='left')

    # Create rolling features
    window = 30
    for feature in ('price_change_1d', 'volume_spike', 'abnormal_return'):
        if feature in merged.columns:
            rolling_stats = merged[feature].rolling(window=window, min_periods=5)
            # The small epsilon avoids division by zero for constant series.
            merged[f'{feature}_zscore'] = (merged[feature] - rolling_stats.mean()) / (rolling_stats.std() + 1e-6)

    # Pick the row for the post date itself
    on_post_date = merged[merged['date'] == post_date]
    if on_post_date.empty:
        return None

    final_row = on_post_date.iloc[0:1].fillna(0)  # Fill any NaNs

    # Project onto the model's feature list: extra columns are dropped and
    # features absent from the data are created as 0. reindex keeps numeric
    # dtypes, unlike concatenating onto an empty frame.
    features_df = (
        final_row
        .reindex(columns=list(model_1_features), fill_value=0)
        .fillna(0)
        .reset_index(drop=True)
    )

    return features_df

In [37]:
def get_contradiction_score(post_text, company, post_date):
    """Score how far a post diverges from the company's latest press release.

    Picks the most recent ``press_df`` row for ``company`` dated on or before
    ``post_date``, embeds both texts with ``embedding_model`` and converts
    their cosine similarity into ``(1 - similarity) / 2``.

    Args:
        post_text: Text of the social media post.
        company: Company name, matched exactly against ``press_df['company']``.
        post_date: Timestamp compared against ``press_df['date']``.

    Returns:
        A float in ``[0, 1]``. ``0.5`` (neutral) is returned when no release
        exists on or before ``post_date`` or when either text is missing or
        blank, because there is then nothing meaningful to compare.
    """
    relevant_releases = press_df[(press_df['company'] == company) & (press_df['date'] <= post_date)]
    if relevant_releases.empty:
        return 0.5

    latest_release_text = relevant_releases.sort_values(by='date', ascending=False).iloc[0]['press_release_text']

    # The loading cell stores missing release texts as ''. Embedding an empty
    # string gives an arbitrary similarity, so fall back to the neutral score.
    has_release_text = isinstance(latest_release_text, str) and latest_release_text.strip() != ''
    has_post_text = isinstance(post_text, str) and post_text.strip() != ''
    if not (has_release_text and has_post_text):
        return 0.5

    embedding1 = embedding_model.encode(post_text, convert_to_tensor=True)
    embedding2 = embedding_model.encode(latest_release_text, convert_to_tensor=True)
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2).item()

    # Similarity lies in [-1, 1] up to floating-point error; clamp so the
    # score never leaves [0, 1].
    return max(0.0, min(1.0, (1 - cosine_similarity) / 2))

def get_advisor_risk(company):
    """Return a risk score derived from the advisor status on file for ``company``.

    Looks at the first ``advisor_df`` row whose ``company`` equals
    ``company``. Recognised statuses are 'Revoked' (0.9), 'Not Found' (0.5)
    and 'Active' (0.1); a missing company or any other status gives the
    neutral score 0.5.
    """
    matching_rows = advisor_df[advisor_df['company'] == company]

    if matching_rows.empty:
        return 0.5

    status = matching_rows.iloc[0]['advisor_status']

    score_by_status = {
        'Revoked': 0.9,
        'Not Found': 0.5,
        'Active': 0.1,
    }

    # Anything outside the table is treated as neutral.
    return score_by_status[status] if status in score_by_status else 0.5

def validate_social_media_post(post_text, company, date_str, company_cat, model_1_input=None):
    """Validates a post using Model 1, contradiction, and advisor risk.

    The final "genuinity" score is
    ``0.4 * market_financial_risk + 0.3 * contradiction_score + 0.3 * advisor_risk``.
    Each component and a verdict line are printed along the way.

    Args:
        post_text: Text of the post.
        company: Company the post is about.
        date_str: Post date; parsed with ``pd.to_datetime``.
        company_cat: Value matched against the ``company_cat`` column of the
            Model 1 feature rows to pick the row scored for this company.
        model_1_input: Optional DataFrame of Model 1 feature rows (must
            include ``company_cat``). When omitted, the rows are read from
            ``test_data/model_1_X_test_data.csv``.

    Returns:
        The combined genuinity score as a float.
    """
    post_date = pd.to_datetime(date_str)

    print(f"\n--- Validating Post for '{company}' on {date_str} ---")
    print(f"Post Text: \"{post_text}\"")

    # Step 1: Get Market & Financial Risk from Model 1
    # NOTE(review): the features come from a saved test set and are selected
    # by company category only, not by post date; confirm this is intended
    # (create_model_1_features is the date-aware alternative).
    if model_1_input is None:
        # Forward slash keeps the path portable and avoids the invalid
        # "\m" escape sequence a backslash would introduce.
        model_1_input = pd.read_csv('test_data/model_1_X_test_data.csv')

    company_rows = model_1_input[model_1_input['company_cat'] == company_cat]
    if company_rows.empty:
        market_financial_risk = 0.5  # Neutral score if no data for this category
    else:
        # Only the first matching row is used, so score just that row.
        market_financial_risk = model_1.predict_proba(company_rows.iloc[[0]])[:, 1][0]
    print(f"Market/Financial Risk (Model 1): {market_financial_risk:.2f}")

    # Step 2: Get Contradiction Score
    contradiction_score = get_contradiction_score(post_text, company, post_date)
    print(f"Contradiction with Press Release: {contradiction_score:.2f}")

    # Step 3: Get Advisor Risk Score
    advisor_risk = get_advisor_risk(company)
    print(f"Advisor Risk Score: {advisor_risk:.2f}")

    # Step 4: Calculate Final Genuinity Score with all three components
    w1 = 0.4  # Weight for market/financial risk
    w2 = 0.3  # Weight for contradiction
    w3 = 0.3  # Weight for advisor risk

    genuinity_score = (w1 * market_financial_risk) + (w2 * contradiction_score) + (w3 * advisor_risk)

    print(f"--- FINAL GENUINITY SCORE: {genuinity_score:.2f} ---")

    if genuinity_score > 0.65:
        print("Result: High likelihood the post is a genuine warning.")
    elif genuinity_score > 0.4:
        print("Result: Moderate likelihood. Worth monitoring.")
    else:
        print("Result: Low likelihood. May be unsubstantiated rumor.")

    return genuinity_score

In [32]:
# Load the labelled social posts and keep only the rows for Reliance.
# NOTE: despite its name, `icici_posts` holds Reliance posts; the name is kept
# because later cells reference it.
social_df = pd.read_csv('data/raw_social_data_labeled (1).csv')
icici_posts = social_df.loc[social_df['company'].eq('Reliance')]
icici_posts

Unnamed: 0,date,company,advisor_name,username,post_text,likes,retweets,comments,label
3,2024-12-13,Reliance,Peter Brown,travisbenton,Possible cell service customer under last stil...,4731,1425,589,0
4,2024-07-05,Reliance,Mandy Gonzalez,rebecca25,Nor author important lot half oil skin three t...,2802,1575,366,0
16,2023-09-21,Reliance,Logan Webster MD,christopheraguirre,Determine board produce pretty night civil wei...,2200,1849,751,0
19,2025-04-19,Reliance,Corey Johnson,heathdavid,Mention huge arrive government else off street...,3255,1430,538,0
29,2024-01-09,Reliance,Ryan Burke,trodriguez,Pay low spend wear reduce century country part...,2113,916,549,0
...,...,...,...,...,...,...,...,...,...
4977,2024-03-04,Reliance,James Allen,susan77,Make technology government guy account action ...,4,568,258,0
4982,2024-07-11,Reliance,Russell Meza,madison20,Future according wish attorney mind internatio...,1464,1422,854,0
4991,2024-05-05,Reliance,Theresa Jones,kmoody,Effect pay system whole dark source government...,393,895,130,0
4993,2025-03-20,Reliance,Brian Freeman,michael05,Produce perhaps institution western pay even a...,100,141,87,0


In [33]:
# Lookup table from company name to the `company_cat` code; displayed below.
companyWithCat = pd.read_csv(filepath_or_buffer='data/company_to_category_map.csv')
companyWithCat

Unnamed: 0,company,company_cat
0,Infosys,0
1,Reliance,1
2,Tata Motors,2
3,Wipro,3


In [41]:
# --- Example Usage ---
if __name__ == '__main__':
    # Score the third Reliance post (`icici_posts` holds Reliance rows).
    example_company = "Reliance"
    example_post = icici_posts.iloc[2]

    # Category code for the company, taken from the company -> category table.
    example_cat = companyWithCat.loc[
        companyWithCat['company'] == example_company, 'company_cat'
    ].iloc[0]

    validate_social_media_post(
        post_text=example_post['post_text'],
        company=example_company,
        date_str=example_post['date'],
        company_cat=example_cat,
    )


--- Validating Post for 'Reliance' on 2023-09-21 ---
Post Text: "Determine board produce pretty night civil weight successful."
Market/Financial Risk (Model 1): 0.99
Contradiction with Press Release: 0.47
Advisor Risk Score: 0.50
--- FINAL GENUINITY SCORE: 0.69 ---
