In [None]:
import pandas as pd

df = pd.read_csv("wiki_movie_plots_deduped_with_summaries.csv")

In [4]:
df = df[['Title', 'Genre', 'Plot', 'PlotSummary', 'Release Year']]
df = df.rename(columns={
    'Title': 'title',
    'Genre': 'genre',
    'Plot': 'plot',
    'PlotSummary': 'summary',
    'Release Year': 'release_year'
})
print(df.columns)

Index(['title', 'genre', 'plot', 'summary', 'release_year'], dtype='object')


In [6]:
df = df.dropna(subset=['title', 'genre', 'plot', 'summary', 'release_year'])

# Remove rows with empty strings or just spaces
df = df[
    (df['title'].str.strip() != '') &
    (df['genre'].str.strip() != '') &
    (df['plot'].str.strip() != '') &
    (df['summary'].str.strip() != '')
]

In [7]:
import re

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces/newlines
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"-]', '', text)  # remove weird symbols
    return text.strip()

df['plot'] = df['plot'].apply(clean_text)
df['summary'] = df['summary'].apply(clean_text)

In [8]:
df = df.drop_duplicates(subset=['title'])

In [9]:
df.to_csv("cleaned_movie_plots.csv", index=False)
print("Data cleaned and saved to cleaned_movie_plots.csv")

Data cleaned and saved to cleaned_movie_plots.csv


In [2]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load your dataset
df = pd.read_csv('cleaned_movie_plots.csv')

# Load MarianMT English->Russian model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-ru'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_marionmt(text):
    if not isinstance(text, str) or text.strip() == '':
        return ''
    # Tokenize input
    batch = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
    # Generate translation ids
    generated_ids = model.generate(**batch)
    # Decode translation
    translated = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return translated

# Add a new column for Russian translations
df['plot_ru'] = ''

# Translate each plot (you can add a progress bar or just print status)
for idx, plot in enumerate(df['plot']):
    if idx % 50 == 0:
        print(f'Translating plot {idx}/{len(df)}')
    df.at[idx, 'plot_ru'] = translate_marionmt(plot)

# Save the dataset with Russian plots
df.to_csv('movie_plots_with_russian_offline.csv', index=False)
print("Offline translation done! Saved as movie_plots_with_russian_offline.csv")



Translating plot 0/32406


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Translating plot 50/32406
Translating plot 100/32406
Translating plot 150/32406
Translating plot 200/32406
Translating plot 250/32406
Translating plot 300/32406
Translating plot 350/32406
Translating plot 400/32406
Translating plot 450/32406
Translating plot 500/32406
Translating plot 550/32406
Translating plot 600/32406
Translating plot 650/32406
Translating plot 700/32406
Translating plot 750/32406
Translating plot 800/32406
Translating plot 850/32406
Translating plot 900/32406
Translating plot 950/32406
Translating plot 1000/32406
Translating plot 1050/32406
Translating plot 1100/32406
Translating plot 1150/32406
Translating plot 1200/32406
Translating plot 1250/32406
Translating plot 1300/32406
Translating plot 1350/32406
Translating plot 1400/32406
Translating plot 1450/32406
Translating plot 1500/32406
Translating plot 1550/32406
Translating plot 1600/32406
Translating plot 1650/32406
Translating plot 1700/32406
Translating plot 1750/32406
Translating plot 1800/32406
Translating 


KeyboardInterrupt



In [3]:
df.to_csv('checkpoint_before_stop.csv', index=False)
print("✅ Saved progress to checkpoint_before_stop.csv")

✅ Saved progress to checkpoint_before_stop.csv
