# Trump

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import pandas as pd

In [None]:
# Build a sentiment-analysis pipeline backed by the multilingual DistilBERT
# sentiment checkpoint.
# NOTE(review): top_k=None presumably makes the pipeline return a score for
# every label instead of only the best one — confirm against the output below.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    top_k=None,
)

# Smoke-test the pipeline on a sentence that mixes a positive and a negative claim
result = sentiment_analyzer("improves GDP by 5% and increases homelessness by 5%")
print(result)

In [None]:
# Hub identifier of the checkpoint shared by the model and tokenizer loaded below
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

In [None]:
# Load the pre-trained sequence-classification model identified by `model_name`.
# This `model` object is created by its own from_pretrained call (independently
# of `sentiment_analyzer`) and is the one whose token embeddings are resized
# later in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Show the loaded model's repr as the cell output to inspect its structure
model

## Final data

In [None]:
# Load the training set from a CSV file; the path is relative, so it is
# resolved against the notebook's current working directory.
df_train = pd.read_csv("train_trump.csv")

In [None]:
# Preview the first rows to check which columns were loaded
df_train.head()

In [None]:
# Remove the 'candidate' and 'publishedAt' columns. drop() returns a new
# frame, so df_train itself is left unchanged.
df_train_final = df_train.drop(['candidate', 'publishedAt'], axis="columns")

# Preview the trimmed frame
df_train_final.head()

## Tokens

In [None]:
# Check whether every word in the articles can be tokenized

In [None]:
# Load the tokenizer published under the same checkpoint name as the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

def check_tokens_in_vocab(text, tokenizer):
    """Return the words of ``text`` that the tokenizer cannot represent.

    ``text`` is split on whitespace and each word is tokenized on its own. A
    word is reported (printed and collected) when the tokenizer returns no
    pieces for it, or when any returned piece is the tokenizer's unknown-token
    marker. WordPiece-style tokenizers map out-of-vocabulary input to that
    marker instead of returning an empty list, so checking only for an empty
    result would never report such words.

    Args:
        text: Article text. Non-string values (for example NaN for a missing
            article) are treated as empty text.
        tokenizer: Object exposing ``tokenize(str) -> list[str]`` and,
            optionally, an ``unk_token`` attribute naming its unknown marker.

    Returns:
        list[str]: The reported words in order of appearance; a word that
        occurs several times is listed once per occurrence.
    """
    # Missing articles arrive as non-strings (e.g. float NaN) and have no .split()
    if not isinstance(text, str):
        return []

    # Looked up once; stays None for tokenizers that define no unknown marker
    unk_token = getattr(tokenizer, "unk_token", None)

    missing_tokens = []
    for word in text.split():
        pieces = tokenizer.tokenize(word)
        has_unknown_piece = unk_token is not None and unk_token in pieces
        if not pieces or has_unknown_piece:
            print(f"Word not in tokenizer: {word}")
            missing_tokens.append(word)
    return missing_tokens

# Run the vocabulary check on every article and store each row's result
# (a list of flagged words) in a new 'missing_tokens' column.
df_train_final['missing_tokens'] = df_train_final['article'].apply(
    check_tokens_in_vocab, args=(tokenizer,)
)

# Flatten the per-article lists to one row per word, drop the NaN rows that
# empty lists explode into, and keep each distinct word once.
missing_tokens = (
    df_train_final['missing_tokens']
    .explode()
    .dropna()
    .unique()
)
print(f"Unique missing tokens: {missing_tokens}")

# Report how many distinct words were flagged
print(f"Total unique missing tokens: {len(missing_tokens)}")

# Check whether important words from the word cloud are tokens; if not, add them

In [None]:
# Lower-case words and multi-word phrases (taken from the word cloud) whose
# tokenization is inspected below and which may be added as new tokens.
words = ['candidate', 'battleground', 'state', 'presidential', 'new york',
        'cnn', 'mark robinson', 'report', 'jd vance',
        'democratic', 'post', 'win', 'news', 
        'poll', 'among', 'voter', 'polling', 'support', 'former', 'president',
        'election', 'georgia', 'vote', 'race', 'democrat', 'texas', 
        'jury', 'bidenharris', 'federal', 'republican', 
        'vice', 'united', 'nominee', 'white', 'biden',
        'rhetoric', 'assassination',
        'administration', 'interview', 'debate',
        'campaign','gop', 'house', 'supporter', 
        'called', 'claim', 'analyst', 'rally', 'lie', 'call',
        'lead', 'vp', 'key', 'show', 'town hall', 'fox', 'policy', 
        'tim walz', 'attack', 'american', 'tariff', 'country', 'helene', 'host',
        'swing', 'pennsylvania', 'joe', 'hurricane', 'woman', 'political','obama', 
        'economy', 'issue', 'plan', 'record', 'elon musk']

def check_multiple_words_tokenized(words_list, tokenizer):
    """Print how the tokenizer splits each entry of ``words_list``.

    Args:
        words_list: Iterable of strings to tokenize one at a time.
        tokenizer: Object exposing ``tokenize(str)``; its result is printed
            as-is next to the entry it was produced from.

    Returns:
        None. The function only writes one line per entry to stdout.
    """
    # Tokenize lazily so each entry is tokenized immediately before it is printed
    tokenized_pairs = ((entry, tokenizer.tokenize(entry)) for entry in words_list)
    for entry, pieces in tokenized_pairs:
        print(f"Tokenized version of '{entry}': {pieces}")
        
# Print the tokenization of every entry in `words` for manual inspection
check_multiple_words_tokenized(words, tokenizer)

# Add tokens

In [None]:
# Collect every word that is not already a single whole-word token, i.e. the
# tokenizer splits it into several pieces or emits a '##' continuation piece.
missing_tokens = []
for word in words:
    tokenized_word = tokenizer.tokenize(word)
    if len(tokenized_word) <= 1 and not any(t.startswith("##") for t in tokenized_word):
        continue
    missing_tokens.append(word)

# Register the collected words with the tokenizer, or report that nothing had to change
if not missing_tokens:
    print("No new tokens needed.")
else:
    tokenizer.add_tokens(missing_tokens)
    print(f"Added {len(missing_tokens)} new tokens: {missing_tokens}")

# Resize the model's token embeddings to the tokenizer's current length; as the
# last expression of the cell, the call's return value is shown as the output.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Inspect the embeddings submodule after the resize in the previous cell.
# NOTE(review): presumably the word-embedding row count shown here should now
# equal len(tokenizer) — confirm in the output.
model.distilbert.embeddings