In [1]:
from datasets import load_dataset
import pandas as pd

# Download the review corpus via the `datasets` library and pick out both splits
ds = load_dataset("fewshot-goes-multilingual/sk_csfd-movie-reviews")
train_csfd, test_csfd = ds["train"], ds["test"]

# Materialise each split as a pandas DataFrame
train_df_csfd, test_df_csfd = (
    pd.DataFrame(split) for split in (train_csfd, test_csfd)
)

# Keep raw CSV snapshots on disk (row index omitted) so later cells can reload them
for frame, csv_path in (
    (train_df_csfd, "sk_csfd-movie-reviews_raw_train.csv"),
    (test_df_csfd, "sk_csfd-movie-reviews_raw_test.csv"),
):
    frame.to_csv(csv_path, index=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reload the training split from the CSV snapshot on disk (replaces the in-memory
# frame built from the downloaded dataset) and preview the first rows.
train_df_csfd = pd.read_csv("sk_csfd-movie-reviews_raw_train.csv")
train_df_csfd.head()

Unnamed: 0,review_id,rating_str,rating_int,date,comment_language,comment,item_title,item_year,item_kind,item_genres,item_directors,item_screenwriters,item_cast
0,jcfcxXbNBSmREbS9Dc8lKw,2/5,2,2015-01-03,sk,"Vsetko je pekne a krasne, ked si odmyslime hl...",The F Word,2013,film,"['Komedie', 'Drama', 'Romantický']",['Michael Dowse'],['Elan Mastai'],"['Daniel Radcliffe', 'Zoe Kazan', 'Rafe Spall'..."
1,cOlLqZ72s8kku0Y-e_tH4A,2/5,2,2009-07-28,sk,Zbytočne veľa nechutností (nie som žiadna roz...,Saw 3,2006,film,"['Horor', 'Thriller', 'Mysteriózní']",['Darren Lynn Bousman'],['Leigh Whannell'],"['Tobin Bell', 'Shawnee Smith', 'Angus Macfady..."
2,nhp0hFOE7bWOfQZETgoE0A,5/5,5,2013-05-07,sk,ďalší výborný Almodóvar. Krásne precítené vní...,Mluv s ní,2002,film,"['Drama', 'Romantický', 'Komedie']",['Pedro Almodóvar'],['Pedro Almodóvar'],"['Javier Cámara', 'Darío Grandinetti', 'Leonor..."
3,ZlHLO98bkmmQoL8GsWRJiQ,1/5,1,2014-05-08,sk,Diky hercum by mohl mit serial dobry potencia...,První krok,2009,seriál,"['Drama', 'Komedie']","['Ján Sebechlebský', 'Jiří Vejdělek']","['Jiří Vejdělek', 'Marek Kopčaj', 'Marek Hlavi...","['Klára Issová', 'Dominik Turza', 'Pavel Kříž'..."
4,MmGDbK90cvLj7eVwg8c7mw,4/5,4,2011-01-06,sk,"Podobné filmy dokazujú, že aj romantické film...",Ondine,2009,film,"['Drama', 'Romantický']",['Neil Jordan'],['Neil Jordan'],"['Colin Farrell', 'Alicja Bachleda Curuś', 'St..."


### Data preparation -- sk_csfd-movie-reviews

In [3]:
# Everything except the numeric rating and the review text is metadata we do not model
metadata_columns = [
    "review_id", "rating_str", "date", "comment_language",
    "item_title", "item_year", "item_kind", "item_genres",
    "item_directors", "item_screenwriters", "item_cast",
]
df = train_df_csfd.drop(columns=metadata_columns)
df.tail()


Unnamed: 0,rating_int,comment
24995,4,"Zábava to bola skvelá, ale chcem vidieť babu,..."
24996,2,Nudne spracovaný príbeh s dosť naivným scenár...
24997,3,2 hodiny cakania na skvele rozuzlenie....a ko...
24998,2,"ide o lepsiu zalezitost nez je jednotka, no s..."
24999,0,Toto je k smiechu! Všetci čo na tomto robili ...


In [4]:
def _rating_to_sentiment(score):
    """Bucket a numeric rating: <= 2 is negative, exactly 3 is neutral, anything else positive."""
    if score <= 2:
        return 'negative'
    if score == 3:
        return 'neutral'
    return 'positive'


# Derive the string label, then drop the numeric column and rename the text column
df = (
    df.assign(rating=df["rating_int"].apply(_rating_to_sentiment))
      .drop(columns=["rating_int"])
      .rename(columns={'comment': 'review'})
)


In [5]:
# Sanity check: the frame should now expose only the `review` and `rating` columns
df.head()

Unnamed: 0,review,rating
0,"Vsetko je pekne a krasne, ked si odmyslime hl...",negative
1,Zbytočne veľa nechutností (nie som žiadna roz...,negative
2,ďalší výborný Almodóvar. Krásne precítené vní...,positive
3,Diky hercum by mohl mit serial dobry potencia...,negative
4,"Podobné filmy dokazujú, že aj romantické film...",positive


In [6]:
import global_var as gvar
print(dir(gvar))

# Run the text passes in a fixed order; each pass consumes the output of the previous one
for text_step in (gvar.clean_text, gvar.handle_negations, gvar.preprocess_for_sentiment):
    df['review'] = df['review'].apply(text_step)
df.tail()

['STOP_WORDS_SK', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'clean_text', 'handle_negations', 'preprocess_for_sentiment', 're', 'simple_slovak_stemmer']


Unnamed: 0,review,rating
24995,zábav skvel chcem vidieť babu dá prednosť nich...,positive
24996,nudn spracovan príbeh dosť naivn scenár správa...,negative
24997,hodin cakan skvel rozuzlenie koniec absolutn o...,neutral
24998,ide leps zalezitost nez neg_jednotk neg_stal n...,negative
24999,smiech rob hanbiť barb pobehujúc zbraň dá opís...,negative


#### Model definition, training, and inference

In [7]:
# First, ensure you have necessary imports
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# If you need to import your preprocessing function
import global_var as gvar


# 1. Features are the preprocessed review strings; targets are the sentiment labels
#    ('positive', 'neutral', 'negative')
X = df['review'].values
y = df['rating'].values

# 2. Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# 3. Define model parameters
#    These module-level names are read directly by the model-construction,
#    DataLoader and training-loop cells further down.
vocab_size = 5000  # Max words to consider
embedding_dim = 150  # Size of each word-embedding vector
hidden_dim = 128  # LSTM hidden-state size (per direction)
output_dim = 3  # positive, neutral, negative
n_layers = 2  # Number of stacked LSTM layers
bidirectional = True  # Run the LSTM in both directions
dropout = 0.2  # Dropout probability used inside the model
batch_size = 32  # Samples per DataLoader batch
n_epochs = 10  # Full passes over the training set


In [9]:

# 4. Create a simple tokenizer for the text
from collections import Counter

class SimpleTokenizer:
    """Whitespace tokenizer backed by a frequency-capped vocabulary.

    Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'; the remaining ids are
    handed out to the most frequent words of the corpus, most frequent first.

    Attributes:
        max_words: Upper bound on the vocabulary size, reserved tokens included.
        word_index: Mapping from token string to integer id.
        word_counts: Counter with the corpus frequency of every word seen.
        vocab_size: Number of entries actually present in ``word_index``.
    """

    def __init__(self, texts, max_words=5000):
        """Build the vocabulary from an iterable of whitespace-separable strings."""
        self.max_words = max_words
        self.word_index = {'<PAD>': 0, '<UNK>': 1}

        # Tally every whitespace-delimited token across the whole corpus
        self.word_counts = Counter(
            word for text in texts for word in text.split()
        )

        # Two slots are already taken by the reserved tokens
        for word, _freq in self.word_counts.most_common(max_words - 2):
            # setdefault leaves an existing entry (e.g. a reserved token) untouched
            self.word_index.setdefault(word, len(self.word_index))

        self.vocab_size = len(self.word_index)

    def encode_plus(self, text, max_length=100):
        """Convert ``text`` to a list of ids, truncated or right-padded to ``max_length``.

        Words missing from the vocabulary are mapped to the '<UNK>' id; padding
        uses the '<PAD>' id.
        """
        unk_id = self.word_index['<UNK>']
        pad_id = self.word_index['<PAD>']

        ids = [self.word_index.get(word, unk_id) for word in text.split()]
        ids = ids[:max_length]
        # A non-positive multiplier yields an empty list, so full-length inputs gain nothing
        ids.extend([pad_id] * (max_length - len(ids)))
        return ids


In [10]:
# Build the vocabulary from the training split only, so held-out words stay unseen.
# The cap is taken from the `vocab_size` hyperparameter instead of the tokenizer's
# built-in default, so changing the hyperparameter actually changes the vocabulary.
tokenizer = SimpleTokenizer(X_train, max_words=vocab_size)
print(f"Vocabulary size: {tokenizer.vocab_size}")

Vocabulary size: 5000


In [11]:

# 6. Define LSTM model
class LSTMSentimentModel(torch.nn.Module):
    """Sentence classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    The classifier reads the final hidden state of the last LSTM layer
    (both directions concatenated when the LSTM is bidirectional).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
            hidden_dim: LSTM hidden size per direction.
            output_dim: Number of output classes (logits).
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Dropout probability for the LSTM (only when stacked)
                and for the hidden state fed to the classifier.
        """
        super().__init__()
        nn = torch.nn
        directions = 2 if bidirectional else 1
        # The LSTM's own dropout is only requested when there is more than one layer
        lstm_dropout = dropout if n_layers > 1 else 0

        # Index 0 is treated as padding by the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch-first tensor of token ids."""
        _, (h_n, _) = self.lstm(self.embedding(text))

        if self.lstm.bidirectional:
            # Last two slices of h_n are joined along the feature axis
            final_state = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            final_state = h_n[-1]

        return self.fc(self.dropout(final_state))


In [12]:

from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    """Pre-tokenised (token-id tensor, integer label) pairs for a DataLoader.

    Args:
        texts: Iterable of review strings.
        labels: Iterable of string labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus(text)`` that returns a list of ids.
        label_map: Optional ``{label: int}`` mapping to reuse (for example the
            mapping of the training dataset). When omitted, the mapping is
            derived from the sorted distinct labels.

    Attributes:
        texts: List of 1-D ``torch.long`` tensors, one per review.
        label_map: The ``{label: int}`` mapping used by this dataset.
        labels: List of integer class ids, aligned with ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, label_map=None):
        self.texts = [
            torch.tensor(tokenizer.encode_plus(text), dtype=torch.long)
            for text in texts
        ]

        # Convert string labels to integers.
        # Sorting makes the ids deterministic: iterating a bare set() of strings
        # has no guaranteed order, so two datasets (train vs. test) or two kernel
        # sessions could otherwise assign different ids to the same label.
        if label_map is None:
            label_map = {label: i for i, label in enumerate(sorted(set(labels)))}
        self.label_map = dict(label_map)
        self.labels = [self.label_map[label] for label in labels]

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label_id)`` pair at position ``idx``."""
        return self.texts[idx], self.labels[idx]

In [13]:
# Wrap each split in a Dataset, then batch it; only the training batches are shuffled
train_dataset = SentimentDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:

# 8. Initialize the model
# Seed PyTorch so weight initialisation and DataLoader shuffling are repeatable
torch.manual_seed(42)

# Pick the best available accelerator and fall back to CPU, so the notebook also
# runs on machines that have neither CUDA nor MPS.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model = LSTMSentimentModel(tokenizer.vocab_size, embedding_dim, hidden_dim,
                           output_dim, n_layers, bidirectional, dropout).to(device)

# 9. Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# 10. Training loop
# NOTE(review): the held-out split is used both to select the best checkpoint and
# to report accuracy, so the reported number is optimistic — consider a separate
# validation split.
best_accuracy = 0

for epoch in range(n_epochs):
    # Training
    model.train()
    train_loss = 0
    train_correct = 0

    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(predictions, 1)
        train_correct += (predicted == labels).sum().item()

    # Evaluation
    model.eval()
    test_loss = 0
    test_correct = 0

    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)

            predictions = model(texts)
            loss = criterion(predictions, labels)

            test_loss += loss.item()
            _, predicted = torch.max(predictions, 1)
            test_correct += (predicted == labels).sum().item()

    # Print statistics
    # Losses are averaged per batch; accuracies are averaged per sample.
    train_loss /= len(train_loader)
    test_loss /= len(test_loader)
    train_accuracy = train_correct / len(train_dataset)
    test_accuracy = test_correct / len(test_dataset)

    print(f'Epoch: {epoch+1}/{n_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

    # Save best model
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'sentiment_model.pt')
        print('Saved best model')

# 11. Load the best model
# map_location keeps the load working when the checkpoint was written on another device
model.load_state_dict(torch.load('sentiment_model.pt', map_location=device))

# 12. Function to predict sentiment for new texts
def predict_sentiment(text, preprocess_func=None):
    """Classify a single review with the notebook-level ``model``.

    Args:
        text: Review text to classify.
        preprocess_func: Optional callable applied to ``text`` before tokenisation.

    Returns:
        A ``(sentiment, confidence)`` tuple: the predicted string label (looked up
        through ``train_dataset.label_map``) and the softmax probability of that class.
    """
    if preprocess_func:
        text = preprocess_func(text)

    model.eval()
    with torch.no_grad():
        # Add a leading batch axis of size one before moving to the model's device
        batch = torch.tensor(tokenizer.encode_plus(text)).unsqueeze(0).to(device)
        logits = model(batch)
        probs = torch.nn.functional.softmax(logits, dim=1)
        predicted_class = torch.argmax(logits, dim=1).item()
        confidence = probs[0][predicted_class].item()

    # Invert the label -> id mapping to recover the string label
    id_to_label = dict((idx, label) for label, idx in train_dataset.label_map.items())
    return id_to_label[predicted_class], confidence



In [None]:
# 13. Test the model
def preprocess_review(text):
    """Apply the same cleaning steps, in the same order, as were applied to the training reviews.

    The training column was passed through clean_text, handle_negations and
    preprocess_for_sentiment in sequence; running only the last step at inference
    time would feed the model tokens in a different form than it was trained on.
    """
    text = gvar.clean_text(text)
    text = gvar.handle_negations(text)
    return gvar.preprocess_for_sentiment(text)


test_review = "Film bol veľmi zaujímavý a príbeh bol výborný."
sentiment, confidence = predict_sentiment(test_review, preprocess_func=preprocess_review)
print(f"Review: {test_review}")
print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.4f})")