## Importing libraries 

In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [46]:
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Load the pretrained FLAN-T5 tokenizer and model; both come from the same
# checkpoint, so the identifier is spelled out once.
checkpoint = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Cleaning Dataset and EDA

In [3]:
# Read the labelled tweets (columns used later: 'tweet', 'label_majority').
csv_path = 'Q2_20230202_majority 1.csv'
df = pd.read_csv(csv_path)

In [5]:
# Class distribution of the majority-vote label; shown as the cell output.
label_counts = df['label_majority'].value_counts()
label_counts

label_majority
in-favor              2907
against               1804
neutral-or-unclear    1040
Name: count, dtype: int64

In [6]:
# One frame per stance label, selected with a boolean mask on the label column.
majority_label = df['label_majority']
df_infavor = df.loc[majority_label == 'in-favor']
df_against = df.loc[majority_label == 'against']
df_neutral = df.loc[majority_label == 'neutral-or-unclear']

In [7]:
# Balance the classes: downsample 'in-favor' and 'against' to the row count of
# 'neutral-or-unclear' (the smallest class in the value_counts output above).
# A fixed random_state makes the sampled rows identical on every
# Restart & Run All; without it each run would fine-tune on a different subset.
df_infavor = df_infavor.sample(n=len(df_neutral), random_state=42)
df_against = df_against.sample(n=len(df_neutral), random_state=42)

In [8]:
# Stack the three per-class frames and keep only the text and label columns.
class_frames = [df_infavor, df_against, df_neutral]
df_finetuning = pd.concat(class_frames)[['tweet', 'label_majority']]

In [None]:
# Pull the text and label columns out as plain Python lists, in row order.
list_tweets, list_labels = (
    list(df_finetuning[column]) for column in ('tweet', 'label_majority')
)

In [72]:
# Map the string labels to integer class ids; `le` is kept so the ids can be
# mapped back to label names later.
le = LabelEncoder()
le.fit(list_labels)
y = le.transform(list_labels)

In [74]:
# Tokenize each tweet on its own and keep only its list of token ids.
list_tweets_tokenized = [
    tokenizer(text)["input_ids"]
    for text in list_tweets
]

In [77]:
# Right-pad every tokenized tweet to the length of the longest one so the
# sequences form a rectangular array that can be turned into a single tensor.
# The length must be measured per tweet: len() of the outer list is the number
# of tweets, which would pad every sequence to the dataset size instead.
max_token_length = max((len(tweet) for tweet in list_tweets_tokenized), default=0)
# Pad with the tokenizer's dedicated padding id rather than the end-of-sequence
# id, so padding stays distinguishable from each tweet's real EOS token.
pad_token_id = tokenizer.pad_token_id
X = [
    tweet_tokenized + [pad_token_id] * (max_token_length - len(tweet_tokenized))
    for tweet_tokenized in list_tweets_tokenized
]

## splitting training and testing set

In [86]:
def split_data(feature_arr, label_arr, train_size):
    """Shuffle and split features and labels into train and test partitions.

    Args:
        feature_arr: Collection of samples to split.
        label_arr: Labels aligned with ``feature_arr``.
        train_size: Forwarded to ``train_test_split`` as ``train_size``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        feature_arr,
        label_arr,
        train_size=train_size,
        shuffle=True,
        random_state=42,  # fixed seed so the split is the same on every run
    )
    features_train, features_test, labels_train, labels_test = partitions
    return features_train, features_test, labels_train, labels_test

In [87]:
# Fraction of the balanced data used for training; the remainder is held out.
train_size = 0.8
X_train, X_test, y_train, y_test = split_data(
    feature_arr=X,
    label_arr=y,
    train_size=train_size,
)

## Loading the final tensor dataset 

In [None]:
# Convert each split to a torch tensor, rebinding the same four names.
X_train, X_test, y_train, y_test = (
    torch.tensor(split) for split in (X_train, X_test, y_train, y_test)
)

In [90]:
# Wrap each split in its own dataset. The test loader must read the held-out
# tensors (X_test, y_test); building it from the training dataset would make
# any evaluation run on data the model was trained on.
dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Shuffle only the training batches; evaluation order does not affect metrics,
# so the test loader keeps a fixed order.
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [4]:
# Learnings: 

# model.eval()
# 	• Disables dropout and sets batch norm to inference mode.
# 	• Does NOT disable gradient tracking.

# torch.no_grad()
# 	Disables gradient tracking (no .grad, saves memory, speeds up inference).

# model.train()
# 	• Enables dropout and sets batch norm to training mode (the opposite of model.eval()).

# optimizer.zero_grad()
# 	• Resets the gradients of all parameters held by the optimizer, so they do not accumulate across backward passes.


In [None]:
# TODO: refactor this into an object-oriented (OOP) structure

# Do we need a validation set, and why? What are the different hyperparameters (e.g. epochs, batch size)?
# Note: the "validation set" in the code is actually a test set.