In [None]:
# Turn time of post into time ranges
# Try with date removed and with days from election

# clean text
# Try removing all tag annotations
# Need to keep if post is retweet
# count number of mentions? turn all mentions into same name (is this like stemming?)
# Stem http links?

# Need to consider bad spelling
# line 109  ðŸ‘ clean bad characters

'''
Please write each output in a .txt file. The first line in an output file must be
      (setf x *(
and the last line must be
      ) )

The lines in between are the prediction results in the format of

            (tweet_number  predict_label)

The tweet number here must correspond to the line/tweet number in each Excel file. E.g.,

(setf x *(
(1 0)
(2 -1)
(3 1)
) )
'''


'\nPlease write each output in a .txt file. The first line in an output file must be\n      (setf x *(\nand the last line must be\n      ) )\n\nThe lines in between are the prediction results in the format of\n\n            (tweet_number  predict_label)\n\nThe tweet number here must correspond to the line/tweet number in each Excel file. E.g.,\n\n(setf x *(\n(1 0)\n(2 -1)\n(3 1)\n) )\n'

In [None]:
import pandas as pd
import re
from datetime import datetime
from scipy.sparse import hstack
import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os
import torch


In [None]:
# Install huggingface_hub if not already installed
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login, HfApi, create_repo

# Login to Hugging Face (this will prompt for your token in Colab)
notebook_login()

# Define your Hugging Face repository name (change to your username/repo)
hf_repo_name = "Ajknight/obama-romney-sentiment-model"  # Replace with your actual username and desired repo name

# Create the repo if it doesn't exist
try:
    create_repo(hf_repo_name, repo_type="model")
except Exception as e:
    print(f"Repo already exists or error: {e}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Repo already exists or error: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-692bd9eb-7c1d4f1674e67dff77b650ee;9c072f22-59e8-4f39-8d76-261816437ec4)

You already created this model repo: Ajknight/obama-romney-sentiment-model


# Parse Data

In [None]:
# Function to clean tweets
def clean_tweet_for_new_llm(text, genericize_mentions_in_retweets=False):
    if not isinstance(text, str):
        return ""

    original_text = text

    # Preserve 'RT' if it starts the tweet (case-insensitive)
    is_retweet = original_text.strip().lower().startswith('rt')
    retweet_prefix = 'rt ' if is_retweet else ''

    # If it's a retweet, remove the 'RT' prefix from the *original_text* before further processing
    if is_retweet:
        text = re.sub(r'^[Rr][Tt]\\s*', '', original_text).strip()
        # NEW: If it's a retweet, remove the mention that immediately follows it
        # This targets the first @mention in the string after 'RT' is removed.
        text = re.sub(r'^@\\w+\\s*', '', text).strip()
    else:
        text = original_text.strip()

    # Remove HTML/XML tags like <e>...</e> or <a>...</a>
    text = re.sub(r'<[^>]+>', '', text)

    # Replace URLs with a generic link indicator
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '_url_', text)
    text = re.sub(r'www\\.(?:[a-zA-Z]|[0-9]|[$_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '_url_', text)

    # Handle *other* mentions conditionally (the first one after RT is already handled if it was a retweet)
    if genericize_mentions_in_retweets:
        # Replace any remaining mentions with generic _mention_ indicator
        text = re.sub(r'@\\w+', '_mention_', text)
    else:
        # Remove any remaining mentions
        text = re.sub(r'@\\w+', '', text)

    # Remove non-alphanumeric characters (keeping spaces and underscore for _url_ and _mention_)
    text = re.sub(r'[^a-zA-Z0-9\\s_]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra spaces
    text = re.sub(r'\\s+', ' ', text).strip()

    # Add the retweet prefix back
    return (retweet_prefix + text).strip()

In [None]:
# Time formats for parsing
time_formats = [
    '%H:%M:%S%z',  # e.g., 10:02:57-05:00
    '%I:%M:%S %p', # e.g., 1:22:46 PM
    '%p %I:%M:%S', # e.g., PM 9:44:54
    '%I:%M %p',    # e.g., 1:22 PM
    '%H:%M',       # e.g., 10:02
    '%p %I:%M'     # e.g., PM 9:44
]

def parse_mixed_time_formats(time_str):
    if not isinstance(time_str, str):
        return pd.NaT
    time_str = time_str.strip() # Remove leading/trailing whitespace
    for fmt in time_formats:
        try:
            return datetime.strptime(time_str, fmt).time()
        except ValueError:
            continue
    return pd.NaT # Return NaT if no format matches

# Train Model

In [None]:
# Initialize sentiment analyzer (use device=0 for GPU if available)
sentiment_analyzer = pipeline('sentiment-analysis', device=0 if torch.cuda.is_available() else -1)

def get_sentiment_batch(df, text_col, label_col, score_col):
    texts = df[text_col].tolist()
    non_empty_indices = [i for i, t in enumerate(texts) if t.strip()]
    texts_non_empty = [texts[i] for i in non_empty_indices]

    labels = ['NEUTRAL'] * len(texts)
    scores = [0.0] * len(texts)

    if texts_non_empty:
        results = sentiment_analyzer(texts_non_empty, batch_size=32)
        for j, i in enumerate(non_empty_indices):
            labels[i] = results[j]['label']
            scores[i] = results[j]['score']

    df[label_col] = labels
    df[score_col] = scores

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [None]:
# Training function (adapted for multi-class)
def train_model(excel_file_path):
    # Load data
    df_obama = pd.read_excel(excel_file_path, sheet_name='Obama')
    df_romney = pd.read_excel(excel_file_path, sheet_name='Romney')


    # Preprocess (drop first row, rename, etc.)
    df_obama = df_obama.drop(index=0).reset_index(drop=True)
    df_romney = df_romney.drop(index=0).reset_index(drop=True)

    df_obama = df_obama.rename(columns={'Unnamed: 4': 'class'})
    df_romney = df_romney.rename(columns={'Unnamed: 4': 'class'})

    merged_df = pd.concat([df_obama, df_romney], ignore_index=True)

    columns_to_drop = [col for col in ['Unnamed: 0', 'Unnamed: 5'] if col in merged_df.columns]
    if columns_to_drop:
        merged_df = merged_df.drop(columns=columns_to_drop)

    print("Excel Loaded")

    # Clean tweets
    merged_df['tweet_clean_default'] = merged_df['Anootated tweet'].apply(clean_tweet_for_new_llm)
    merged_df['tweet_clean_generic_mentions'] = merged_df['Anootated tweet'].apply(lambda x: clean_tweet_for_new_llm(x, genericize_mentions_in_retweets=True))

    print("batching sentiment inital analysis")
    # Batch sentiment analysis
    get_sentiment_batch(merged_df, 'tweet_clean_default', 'sentiment_label_default', 'sentiment_score_default')
    get_sentiment_batch(merged_df, 'tweet_clean_generic_mentions', 'sentiment_label_generic', 'sentiment_score_generic')

    # Parse time
    merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')
    merged_df['time_parsed'] = merged_df['time'].astype(str).apply(parse_mixed_time_formats)
    merged_df['hour_of_day'] = merged_df['time_parsed'].apply(lambda x: x.hour if pd.notna(x) else pd.NaT)

    # Map sentiments
    sentiment_llm_mapping = {'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0, 'ERROR': np.nan}
    merged_df['llm_sentiment_numeric_default'] = merged_df['sentiment_label_default'].map(sentiment_llm_mapping)
    merged_df['llm_sentiment_numeric_generic'] = merged_df['sentiment_label_generic'].map(sentiment_llm_mapping)

    merged_df['llm_sentiment_numeric_default'] = merged_df['llm_sentiment_numeric_default'].fillna(0)
    merged_df['llm_sentiment_numeric_generic'] = merged_df['llm_sentiment_numeric_generic'].fillna(0)
    merged_df['hour_of_day'] = merged_df['hour_of_day'].fillna(0).astype(int)

    print("DONE- batching sentiment inital analysis")

    # Include all classes: -1, 0, 1
    valid_classes = [-1, 0, 1]
    merged_df = merged_df[merged_df['class'].isin(valid_classes)]

    # Map labels to 0,1,2 for categorical: -1 -> 0 (negative), 0 -> 1 (neutral), 1 -> 2 (positive)
    label_map = {-1: 0, 0: 1, 1: 2}
    y_mapped = merged_df['class'].map(label_map)
    y = to_categorical(y_mapped, num_classes=3)

    classes = np.unique(y_mapped)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_mapped)
    class_weight_dict = dict(zip(classes, class_weights))
    print("Class weights:", class_weight_dict)

    # TF-IDF
    tfidf_vectorizer_default = TfidfVectorizer(max_features=10000)
    tfidf_vectorizer_generic = TfidfVectorizer(max_features=10000)

    X_text_default = tfidf_vectorizer_default.fit_transform(merged_df['tweet_clean_default'])
    X_text_generic = tfidf_vectorizer_generic.fit_transform(merged_df['tweet_clean_generic_mentions'])

    X_numeric = merged_df[['llm_sentiment_numeric_default', 'llm_sentiment_numeric_generic', 'hour_of_day']].values

    X = hstack([X_text_default, X_text_generic, X_numeric])
    X_dense = X.toarray()

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42, stratify=y_mapped)

    print("Data split training model")
    # Build model for multi-class
    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes

    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train with class weights
    model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2, class_weight=class_weight_dict, callbacks=[early_stop])

    print("DONE- Data split training model")
    # Evaluate
    y_pred_proba = model.predict(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)

    # Map back for report
    reverse_map = {0: -1, 1: 0, 2: 1}
    y_test_orig = [reverse_map[label] for label in y_test_labels]
    y_pred_orig = [reverse_map[label] for label in y_pred]

    print("Accuracy:", accuracy_score(y_test_orig, y_pred_orig))
    print(classification_report(y_test_orig, y_pred_orig, target_names=['Negative (-1)', 'Neutral (0)', 'Positive (1)']))

    # Save model locally
    model.save('sentiment_model.h5')

    # Save vectorizers
    joblib.dump(tfidf_vectorizer_default, 'tfidf_default.pkl')
    joblib.dump(tfidf_vectorizer_generic, 'tfidf_generic.pkl')

    # Upload to Hugging Face
    api = HfApi()
    api.upload_file(path_or_fileobj='sentiment_model.h5', path_in_repo='sentiment_model.h5', repo_id=hf_repo_name, repo_type="model")
    api.upload_file(path_or_fileobj='tfidf_default.pkl', path_in_repo='tfidf_default.pkl', repo_id=hf_repo_name, repo_type="model")
    api.upload_file(path_or_fileobj='tfidf_generic.pkl', path_in_repo='tfidf_generic.pkl', repo_id=hf_repo_name, repo_type="model")

    print(f"Model and vectorizers saved to Hugging Face: https://huggingface.co/{hf_repo_name}")

    return model, tfidf_vectorizer_default, tfidf_vectorizer_generic

# Predict New input


In [None]:
# Prediction function (adapted for multi-class)
def predict_on_new_data(excel_file_path, model_path='sentiment_model.h5', tfidf_default_path='tfidf_default.pkl', tfidf_generic_path='tfidf_generic.pkl'):
    from tensorflow.keras.models import load_model
    import joblib

    # Load model and vectorizers
    neural_network_model = load_model(model_path)
    tfidf_vectorizer_default = joblib.load(tfidf_default_path)
    tfidf_vectorizer_generic = joblib.load(tfidf_generic_path)

    # Load new data (same as training logic)
    try:
        df_obama_new = pd.read_excel(excel_file_path, sheet_name='Obama')
        df_romney_new = pd.read_excel(excel_file_path, sheet_name='Romney')
    except FileNotFoundError:
        print(f"Error: Excel file '{excel_file_path}' not found.")
        return None
    except ValueError as e:
        print(f"Error loading Excel sheets: {e}. Check sheet names.")
        return None

    df_obama_new = df_obama_new.drop(index=0).reset_index(drop=True)
    df_romney_new = df_romney_new.drop(index=0).reset_index(drop=True)

    df_obama_new = df_obama_new.rename(columns={'Unnamed: 4': 'class'})
    df_romney_new = df_romney_new.rename(columns={'Unnamed: 4': 'class'})

    new_data_df = pd.concat([df_obama_new, df_romney_new], ignore_index=True)

    columns_to_drop_new = [col for col in ['Unnamed: 0', 'Unnamed: 5'] if col in new_data_df.columns]
    if columns_to_drop_new:
        new_data_df = new_data_df.drop(columns=columns_to_drop_new)

    new_data_df['tweet_clean_default'] = new_data_df['Anootated tweet'].apply(clean_tweet_for_new_llm)
    new_data_df['tweet_clean_generic_mentions'] = new_data_df['Anootated tweet'].apply(lambda x: clean_tweet_for_new_llm(x, genericize_mentions_in_retweets=True))

    # Batch sentiment analysis
    get_sentiment_batch(new_data_df, 'tweet_clean_default', 'sentiment_label_default', 'sentiment_score_default')
    get_sentiment_batch(new_data_df, 'tweet_clean_generic_mentions', 'sentiment_label_generic', 'sentiment_score_generic')

    new_data_df['date'] = pd.to_datetime(new_data_df['date'], errors='coerce')
    new_data_df['time_parsed'] = new_data_df['time'].astype(str).apply(parse_mixed_time_formats)
    new_data_df['hour_of_day'] = new_data_df['time_parsed'].apply(lambda x: x.hour if pd.notna(x) else pd.NaT)

    sentiment_llm_mapping = {'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0, 'ERROR': np.nan}
    new_data_df['llm_sentiment_numeric_default'] = new_data_df['sentiment_label_default'].map(sentiment_llm_mapping)
    new_data_df['llm_sentiment_numeric_generic'] = new_data_df['sentiment_label_generic'].map(sentiment_llm_mapping)

    new_data_df['llm_sentiment_numeric_default'] = new_data_df['llm_sentiment_numeric_default'].fillna(0)
    new_data_df['llm_sentiment_numeric_generic'] = new_data_df['llm_sentiment_numeric_generic'].fillna(0)
    new_data_df['hour_of_day'] = new_data_df['hour_of_day'].fillna(0).astype(int)

    X_text_default_new = tfidf_vectorizer_default.transform(new_data_df['tweet_clean_default'])
    X_text_generic_new = tfidf_vectorizer_generic.transform(new_data_df['tweet_clean_generic_mentions'])
    X_numeric_new = new_data_df[['llm_sentiment_numeric_default', 'llm_sentiment_numeric_generic', 'hour_of_day']].values

    X_new = hstack([X_text_default_new, X_text_generic_new, X_numeric_new])
    X_new_dense = X_new.toarray()

    y_pred_proba_new = neural_network_model.predict(X_new_dense)
    y_pred = np.argmax(y_pred_proba_new, axis=1)

    # Map back to original labels: 0 -> -1, 1 -> 0, 2 -> 1
    reverse_map = {0: -1, 1: 0, 2: 1}
    new_data_df['pred_class'] = [reverse_map[label] for label in y_pred]

    return new_data_df

In [None]:
# Usage - Training
excel_file_path_train = '/content/drive/MyDrive/cs583/training-Obama-Romney-tweets.xlsx' #Training

# Train and save
model, tfidf_default, tfidf_generic = train_model(excel_file_path_train)


Excel Loaded
batching sentiment inital analysis


  merged_df['hour_of_day'] = merged_df['hour_of_day'].fillna(0).astype(int)


DONE- batching sentiment inital analysis
Class weights: {np.int64(0): np.float64(0.7697473174108688), np.int64(1): np.float64(1.0364466815809097), np.int64(2): np.float64(1.3586265884652982)}
Data split training model


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.3226 - loss: 1.1088 - val_accuracy: 0.3401 - val_loss: 1.0972
Epoch 2/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.3719 - loss: 1.0917 - val_accuracy: 0.3856 - val_loss: 1.0916
Epoch 3/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.3778 - loss: 1.0930 - val_accuracy: 0.3997 - val_loss: 1.0835
Epoch 4/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4586 - loss: 1.0494 - val_accuracy: 0.3716 - val_loss: 1.0843
Epoch 5/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6108 - loss: 0.9317 - val_accuracy: 0.3687 - val_loss: 1.0934
Epoch 6/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7996 - loss: 0.5855 - val_accuracy: 0.3418 - val_loss: 1.1237
Epoch 7/50
[1m112/112[0m 



Accuracy: 0.4087230215827338
               precision    recall  f1-score   support

Negative (-1)       0.50      0.37      0.43       963
  Neutral (0)       0.38      0.55      0.45       715
 Positive (1)       0.33      0.30      0.31       546

     accuracy                           0.41      2224
    macro avg       0.41      0.40      0.40      2224
 weighted avg       0.42      0.41      0.41      2224



Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  sentiment_model.h5          :   4%|3         | 1.12MB / 30.9MB            

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  tfidf_default.pkl           :  59%|#####9    |  616kB / 1.04MB            

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  tfidf_generic.pkl           : 100%|##########| 1.04MB / 1.04MB            

Model and vectorizers saved to Hugging Face: https://huggingface.co/Ajknight/obama-romney-sentiment-model


In [None]:
# Usage - Prediction
from sklearn.metrics import accuracy_score, classification_report

excel_file_path_pred = '/content/drive/MyDrive/cs583/training-Obama-Romney-tweets.xlsx' #Testing

# Predict
predicted_df = predict_on_new_data(excel_file_path_pred)

if predicted_df is not None:
    print("Predictions on new data generated successfully. Head of predicted_df:")
    print(predicted_df[['Anootated tweet', 'pred_class']].head())

    # Calculate and print Accuracy and F-score
    y_true = predicted_df['class']
    y_pred = predicted_df['pred_class']

    # Ensure both are numeric and handle potential non-numeric values if any slipped through
    y_true = pd.to_numeric(y_true, errors='coerce').dropna()
    y_pred = pd.to_numeric(y_pred, errors='coerce').dropna()

    # Align indices after dropping NaNs
    common_indices = y_true.index.intersection(y_pred.index)
    y_true = y_true.loc[common_indices]
    y_pred = y_pred.loc[common_indices]

    if not y_true.empty and not y_pred.empty:
        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred, target_names=['Negative (-1)', 'Neutral (0)', 'Positive (1)'], zero_division=0)

        print("\n--- Prediction Metrics ---")
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
    else:
        print("\nWarning: Not enough valid data points to calculate metrics.")

    # Generate the .txt output file
    output_filename = 'predictions.txt'
    with open(output_filename, 'w') as f:
        f.write('(setf x *(\n')
        for original_index, row in predicted_df.iterrows():
            # The tweet number here must correspond to the line/tweet number in each Excel file.
            # We use original_index + 1 to get 1-based indexing for tweet number
            f.write(f'  ({original_index + 1} {row["pred_class"]})\n')
        f.write(') )\n')
    print(f"Predictions saved to {output_filename} in the specified format.")


  new_data_df['hour_of_day'] = new_data_df['hour_of_day'].fillna(0).astype(int)


[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Predictions on new data generated successfully. Head of predicted_df:
                                     Anootated tweet  pred_class
0  Kirkpatrick, who wore a baseball cap embroider...           0
1  Question: If <e>Romney</e> and <e>Obama</e> ha...           0
2  #<e>obama</e> debates that Cracker Ass Cracker...           0
3  RT @davewiner Slate: Blame <e>Obama</e> for fo...          -1
4  @Hollivan @hereistheanswer  Youre missing the ...           0


ValueError: Number of classes, 4, does not match size of target_names, 3. Try specifying the labels parameter