In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import nltk
from nltk.corpus import twitter_samples

In [None]:

# Fetch the twitter_samples corpus through NLTK
nltk.download('twitter_samples')

# Read the raw tweet strings of each sentiment class
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Concatenate both classes into one corpus; label 1 = positive, 0 = negative
tweets = positive_tweets + negative_tweets
labels = [1 for _ in positive_tweets] + [0 for _ in negative_tweets]

# Sanity check: sizes first, then container types
print(f'Number of Tweets : {len(tweets)}')
print(f'Number of Labels : {len(labels)}\n')

print(f'Type of Tweets : {type(tweets)}')
print(f'Type of Labels : {type(labels)}')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


Number of Tweets : 10000
Number of Labels : 10000

Type of Tweets : <class 'list'>
Type of Labels : <class 'list'>


In [None]:
# Preview the first five tweets of each class (positive first, then negative)

(positive_tweets[:5], negative_tweets[:5])

(['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
  '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
  '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
  '@97sides CONGRATS :)',
  'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days'],
 ['hopeless for tmr :(',
  "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
  '@Hegelbon That heart sliding into the waste basket. :(',
  '“@ketchBurning: I hate Japanese call him "bani" :( :(”\n\nMe too',
  'Dang starting next week I have "work" :('])

In [None]:

# Load the tokenizer published with the pretrained sentiment checkpoint
tokenizer = RobertaTokenizer.from_pretrained(
    'siebert/sentiment-roberta-large-english'
)

# Encode every tweet with truncation and padding enabled (max_length=128)
encodings = tokenizer(
    tweets,
    max_length=128,
    padding=True,
    truncation=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# PREPARE DATASET

class TwitterSentimentDataset(Dataset):
    """Dataset pairing tokenizer encodings with their sentiment labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; each field is indexed by example position.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create the dataset over all encoded tweets and their labels
dataset = TwitterSentimentDataset(encodings=encodings, labels=labels)

# Create a shuffled dataloader yielding batches of 8 examples
train_dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)


In [None]:
# TRAIN TEST SPLIT

from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation; the fixed random_state keeps the split repeatable
train_tweets, eval_tweets, train_labels, eval_labels = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize the held-out tweets with the same settings used for the full corpus
eval_encodings = tokenizer(
    eval_tweets,
    max_length=128,
    padding=True,
    truncation=True,
)

# Wrap the held-out encodings and labels as the validation dataset
eval_dataset = TwitterSentimentDataset(eval_encodings, eval_labels)

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# DEFINE PRE-TRAINED MODEL
model = RobertaForSequenceClassification.from_pretrained('siebert/sentiment-roberta-large-english', num_labels=2)

# Build the training set from the training split ONLY.
# `dataset` wraps every tweet, including all tweets in `eval_dataset`, so
# training on it leaks the validation examples into training and makes the
# validation loss meaningless as a measure of generalization.
train_encodings = tokenizer(train_tweets, truncation=True, padding=True, max_length=128)
train_dataset = TwitterSentimentDataset(train_encodings, train_labels)


def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (per-class logits) and
            `label_ids` (true labels), as passed by `Trainer`.

    Returns:
        Dict with a single 'accuracy' entry (fraction of correct predictions).
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer: train on the training split, validate on the held-out split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.0236,0.002047
2,0.0157,4.5e-05
3,0.0045,1.9e-05


TrainOutput(global_step=3750, training_loss=0.015433593972524006, metrics={'train_runtime': 2948.2831, 'train_samples_per_second': 10.175, 'train_steps_per_second': 1.272, 'total_flos': 6607247751720000.0, 'train_loss': 0.015433593972524006, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the eval_dataset that was passed to the Trainer;
# the returned metrics dict is displayed as the cell output.
trainer.evaluate()


{'eval_loss': 1.8797140000970103e-05,
 'eval_runtime': 31.9591,
 'eval_samples_per_second': 62.58,
 'eval_steps_per_second': 7.822,
 'epoch': 3.0}

In [None]:
# NOTE(review): the Drive mount below is commented out; presumably Drive was
# already mounted at /content/drive when this cell ran — confirm before re-running.
#from google.colab import drive
#drive.mount('/content/drive')


# Save the fine-tuned model and its tokenizer into the same directory
drive_dir = '/content/drive/MyDrive/Twitter-Sentiment-Analysis'
model.save_pretrained(drive_dir)
tokenizer.save_pretrained(drive_dir)

('/content/drive/MyDrive/Twitter-Sentiment-Analysis/tokenizer_config.json',
 '/content/drive/MyDrive/Twitter-Sentiment-Analysis/special_tokens_map.json',
 '/content/drive/MyDrive/Twitter-Sentiment-Analysis/vocab.json',
 '/content/drive/MyDrive/Twitter-Sentiment-Analysis/merges.txt',
 '/content/drive/MyDrive/Twitter-Sentiment-Analysis/added_tokens.json')

In [None]:
# Save the fine-tuned model and its tokenizer side by side in a local directory
local_dir = './sentiment_model'
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.json',
 './sentiment_model/merges.txt',
 './sentiment_model/added_tokens.json')

In [None]:
# Reload the tokenizer and the fine-tuned model from the local save directory
tokenizer = RobertaTokenizer.from_pretrained('./sentiment_model')
model = RobertaForSequenceClassification.from_pretrained('./sentiment_model')

# CREATE FUNCTION TO PREDICT A TWEET
def predict_sentiment(text):
    """Classify a single text as 'positive' or 'negative'.

    Uses the module-level `tokenizer` and `model`.

    Args:
        text: The text to classify. Must yield exactly one prediction,
            because the result is read with `.item()`.

    Returns:
        'positive' when the highest-scoring class index is 1, otherwise
        'negative'.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    # Keep the inputs on the same device as the model weights
    inputs = inputs.to(model.device)
    # Inference only: disable autograd so no computation graph is built
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return 'positive' if prediction.item() == 1 else 'negative'

In [None]:
# Try the fine-tuned model on hand-written examples

# Plain examples
text_1 = 'Couldn’t be happier with my new phone! 📱 Fast, sleek, and the camera is everything I hoped for. #WorthIt'  # Label : Positive
text_2 = 'This movie was so boring. The plot was flat and there was nothing interesting about it.'  # Label : Negative
text_3 = 'This app is so easy to use and really helps me organize my daily schedule'  # Label : Positive
text_4 = 'The customer service here is nonexistent. I’ve been on hold for over an hour. @BrandName, this is ridiculous. #Frustrated'  # Label : Negative

# Sarcastic examples
text_5 = 'This product is absolutely perfect, if by perfect you mean completely useless and broken.'  # Label : Negative
text_6 = "Just finished this book. It's an absolute masterpiece... if you enjoy reading the same thing over and over again. #BestBookEver 📚"  # Label : Positive
text_7 = 'Oh, what a wonderful surprise! My coffee shop order was wrong, just the way I like it. #BestMistakeEver'  # Label : Positive
text_8 = 'I love how my gym is always crowded, makes me feel like I’m part of something bigger than myself. #Goals'  # Label : Positive
text_9 = 'Just watched the latest episode. Incredible... if you love wasting your time on bad TV. #BestWasteOfTime'  # Label : Negative
text_10 = 'Oh great, another update! My phone is even slower now! Can’t wait for the next one. #TechGoals'  # Label : Negative

# Print each example next to the model's prediction, in order
for sample_text in (
    text_1, text_2, text_3, text_4, text_5,
    text_6, text_7, text_8, text_9, text_10,
):
    print(f'{sample_text} ---> {predict_sentiment(sample_text)}')


Couldn’t be happier with my new phone! 📱 Fast, sleek, and the camera is everything I hoped for. #WorthIt ---> positive
This movie was so boring. The plot was flat and there was nothing interesting about it. ---> negative
This app is so easy to use and really helps me organize my daily schedule ---> positive
The customer service here is nonexistent. I’ve been on hold for over an hour. @BrandName, this is ridiculous. #Frustrated ---> negative
This product is absolutely perfect, if by perfect you mean completely useless and broken. ---> negative
Just finished this book. It's an absolute masterpiece... if you enjoy reading the same thing over and over again. #BestBookEver 📚 ---> positive
Oh, what a wonderful surprise! My coffee shop order was wrong, just the way I like it. #BestMistakeEver ---> positive
I love how my gym is always crowded, makes me feel like I’m part of something bigger than myself. #Goals ---> positive
Just watched the latest episode. Incredible... if you love wasting you