# Trump

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import pandas as pd

In [5]:
# Build a sentiment-analysis pipeline on the multilingual DistilBERT student
# checkpoint. top_k=None is passed so that the printed result below carries a
# score for each label instead of only the best one.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    top_k=None,
)

# Smoke-test the pipeline on a sentence that mixes good and bad news
result = sentiment_analyzer(
    "improves GDP by 5% and increases homelessness by 5%"
)
print(result)

[[{'label': 'positive', 'score': 0.7101832628250122}, {'label': 'negative', 'score': 0.17953164875507355}, {'label': 'neutral', 'score': 0.11028500646352768}]]


In [3]:
# Hugging Face checkpoint id shared by the model and tokenizer loaded below
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

In [4]:
# Load the pre-trained sequence-classification model identified by `model_name`
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [5]:
# Display the module structure of the loaded model
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Final data

In [6]:
# Read the training articles; the path is relative to the notebook's working directory
df_train = pd.read_csv("train_trump_sentiment.csv")

In [7]:
# Preview the first rows to check which columns were read
df_train.head()

Unnamed: 0,article,candidate,score,publishedAt
0,gutwrenching ad show horror abortion ban new w...,Donald Trump,-0.9274,2024-09-19T13:00:01Z
1,campaign influence operation say former state ...,Donald Trump,0.296,2024-10-11T15:19:22Z
2,new powerful ad going break powerful new ad ha...,Donald Trump,0.5267,2024-10-07T22:02:33Z
3,surrounded disaster mayhem entering final week...,Donald Trump,-0.9517,2024-10-03T00:14:05Z
4,agrees charlamagne tha god assessment trump ca...,Donald Trump,0.4404,2024-10-16T01:37:10Z


In [8]:
# Remove the 'candidate' and 'publishedAt' columns, leaving the article text
# and its score; the original frame `df_train` is left untouched.
df_train_final = df_train.drop(['candidate', 'publishedAt'], axis="columns")

df_train_final.head()

Unnamed: 0,article,score
0,gutwrenching ad show horror abortion ban new w...,-0.9274
1,campaign influence operation say former state ...,0.296
2,new powerful ad going break powerful new ad ha...,0.5267
3,surrounded disaster mayhem entering final week...,-0.9517
4,agrees charlamagne tha god assessment trump ca...,0.4404


# Save the modified dataset (Once)

In [9]:
#df_train_final.to_csv("trump_train_final.csv", index=False)

## Tokens

In [10]:
# Check whether every word in the articles can be tokenized by the model's tokenizer

In [11]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Print out words not found in tokenizer
def check_tokens_in_vocab(text, tokenizer):
    """Return the words of ``text`` that the tokenizer cannot represent.

    A word counts as missing when ``tokenizer.tokenize(word)`` yields no
    tokens at all, or when any returned token is the tokenizer's
    unknown-token placeholder (``tokenizer.unk_token``, when the tokenizer
    defines one). Every missing word is also printed.

    Args:
        text: The article text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as containing
            no words.
        tokenizer: Object exposing ``tokenize(str) -> list[str]`` and,
            optionally, an ``unk_token`` attribute.

    Returns:
        list[str]: The missing words in order of appearance; repeated words
        are reported once per occurrence.
    """
    if not isinstance(text, str):
        # A float NaN (or None) has no .split(); there is nothing to check.
        return []

    unk_token = getattr(tokenizer, "unk_token", None)
    missing_tokens = []
    for word in text.split():  # whitespace-separated words
        tokenized_word = tokenizer.tokenize(word)
        # WordPiece-style tokenizers report an out-of-vocabulary word as the
        # unknown token rather than as an empty list, so check both cases.
        is_unknown = unk_token is not None and unk_token in tokenized_word
        if not tokenized_word or is_unknown:
            print(f"Word not in tokenizer: {word}")
            missing_tokens.append(word)
    return missing_tokens

# Run the vocabulary check on every article and store the per-article result
df_train_final['missing_tokens'] = df_train_final['article'].apply(
    check_tokens_in_vocab, args=(tokenizer,)
)

# Flatten the per-article lists into one array of distinct missing words
# (explode() turns an empty list into NaN, which dropna() then removes)
missing_tokens = (
    df_train_final['missing_tokens']
    .explode()
    .dropna()
    .unique()
)
print(f"Unique missing tokens: {missing_tokens}")

# Report how many distinct words were missing
print(f"Total unique missing tokens: {len(missing_tokens)}")

Unique missing tokens: []
Total unique missing tokens: 0


# Check whether important words from the word cloud are single tokens; if not, add them

In [12]:
# Prominent terms from the word cloud whose tokenization is inspected below.
# A few entries are two-word phrases (e.g. 'new york', 'town hall').
words = [
    'candidate', 'battleground', 'state', 'presidential', 'new york', 'cnn',
    'mark robinson', 'report', 'jd vance', 'democratic', 'post', 'win',
    'news', 'poll', 'among', 'voter', 'polling', 'support',
    'former', 'president', 'election', 'georgia', 'vote', 'race',
    'democrat', 'texas', 'jury', 'bidenharris', 'federal', 'republican',
    'vice', 'united', 'nominee', 'white', 'biden', 'rhetoric',
    'assassination', 'administration', 'interview', 'debate', 'campaign', 'gop',
    'house', 'supporter', 'called', 'claim', 'analyst', 'rally',
    'lie', 'call', 'lead', 'vp', 'key', 'show',
    'town hall', 'fox', 'policy', 'tim walz', 'attack', 'american',
    'tariff', 'country', 'helene', 'host', 'swing', 'pennsylvania',
    'joe', 'hurricane', 'woman', 'political', 'obama', 'economy',
    'issue', 'plan', 'record', 'elon musk', 'endorse', 'washington',
    'israel', 'carolina', 'abc',
]

# Function to check if each word in the array is tokenized
def check_multiple_words_tokenized(words_list, tokenizer):
    """Print how the tokenizer splits each entry of ``words_list``.

    Args:
        words_list: Iterable of strings to inspect.
        tokenizer: Object exposing ``tokenize(str)``.

    Returns:
        None. One line is printed per entry, in input order.
    """
    # Pair each entry with its tokenization lazily, so every word is tokenized
    # immediately before its line is printed.
    tokenized_pairs = ((word, tokenizer.tokenize(word)) for word in words_list)
    for word, tokenized_word in tokenized_pairs:
        print(f"Tokenized version of '{word}': {tokenized_word}")
        
# Print the tokenization of every entry in `words`
check_multiple_words_tokenized(words, tokenizer)

Tokenized version of 'candidate': ['candidate']
Tokenized version of 'battleground': ['battle', '##ground']
Tokenized version of 'state': ['state']
Tokenized version of 'presidential': ['presidential']
Tokenized version of 'new york': ['new', 'yo', '##rk']
Tokenized version of 'cnn': ['cn', '##n']
Tokenized version of 'mark robinson': ['mark', 'ro', '##bin', '##son']
Tokenized version of 'report': ['report']
Tokenized version of 'jd vance': ['j', '##d', 'van', '##ce']
Tokenized version of 'democratic': ['democratic']
Tokenized version of 'post': ['post']
Tokenized version of 'win': ['win']
Tokenized version of 'news': ['news']
Tokenized version of 'poll': ['poll']
Tokenized version of 'among': ['among']
Tokenized version of 'voter': ['voter']
Tokenized version of 'polling': ['poll', '##ing']
Tokenized version of 'support': ['support']
Tokenized version of 'former': ['former']
Tokenized version of 'president': ['president']
Tokenized version of 'election': ['election']
Tokenized version

# Add tokens

In [13]:
# Collect the words that the tokenizer does not keep as one whole token
missing_tokens = []
for word in words:
    tokenized_word = tokenizer.tokenize(word)
    # Skip words that come back as a single piece with no "##" continuation
    if len(tokenized_word) <= 1 and not any(t.startswith("##") for t in tokenized_word):
        continue
    missing_tokens.append(word)

# Register the collected words with the tokenizer, if there are any
if not missing_tokens:
    print("No new tokens needed.")
else:
    tokenizer.add_tokens(missing_tokens)
    print(f"Added {len(missing_tokens)} new tokens: {missing_tokens}")

# Resize the model's embedding matrix to the tokenizer's new length.
# NOTE(review): the rows created for the added tokens are presumably untrained
# until the model is fine-tuned - confirm that fine-tuning follows.
model.resize_token_embeddings(len(tokenizer))

Added 30 new tokens: ['battleground', 'new york', 'cnn', 'mark robinson', 'jd vance', 'polling', 'georgia', 'democrat', 'texas', 'bidenharris', 'republican', 'biden', 'rhetoric', 'gop', 'analyst', 'vp', 'town hall', 'fox', 'tim walz', 'tariff', 'helene', 'pennsylvania', 'joe', 'obama', 'elon musk', 'endorse', 'washington', 'israel', 'carolina', 'abc']


Embedding(119577, 768, padding_idx=0)

In [14]:
# Inspect the embedding layers to confirm the word-embedding table was enlarged
model.distilbert.embeddings

Embeddings(
  (word_embeddings): Embedding(119577, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

# Save pretrained model 

In [15]:
# One constant for the export location, so the model weights and the tokenizer
# files are always written to the same directory.
# NOTE(review): this is an absolute, machine-specific path - consider a path
# relative to the project so the notebook also runs on other machines.
MODEL_WITH_TOKENS_DIR = "C:/Users/User/Documents/3rd year/Datsci346/Project/DS346proj/model_with_tokens"

# Save the resized model and the extended tokenizer side by side. The tokenizer
# call stays last so its returned file paths are shown as the cell output.
model.save_pretrained(MODEL_WITH_TOKENS_DIR)
tokenizer.save_pretrained(MODEL_WITH_TOKENS_DIR)

('C:/Users/User/Documents/3rd year/Datsci346/Project/DS346proj/model_with_tokens\\tokenizer_config.json',
 'C:/Users/User/Documents/3rd year/Datsci346/Project/DS346proj/model_with_tokens\\special_tokens_map.json',
 'C:/Users/User/Documents/3rd year/Datsci346/Project/DS346proj/model_with_tokens\\vocab.txt',
 'C:/Users/User/Documents/3rd year/Datsci346/Project/DS346proj/model_with_tokens\\added_tokens.json',
 'C:/Users/User/Documents/3rd year/Datsci346/Project/DS346proj/model_with_tokens\\tokenizer.json')