In [1]:
import pandas as pd
import numpy as np 

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
import pandas as pd
import numpy as np 

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
def clean_data(df):
    """Return a cleaned copy of ``df`` with a normalised ``Review`` column.

    Steps:
      1. Drop every row that contains a missing value in any column.
      2. Strip punctuation (anything that is neither a word character nor
         whitespace) from ``Review``.
      3. Lower-case ``Review``.

    Args:
        df: DataFrame with at least a ``Review`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified. The original
        row index of the surviving rows is preserved.
    """
    # Work on an explicit copy so the column assignment below never writes
    # into (or warns about) a view of the caller's frame.
    cleaned = df.dropna().copy()
    cleaned['Review'] = (
        cleaned['Review']
        .astype(str)
        # regex=True is required: since pandas 2.0 the default is a literal
        # match, which would leave all punctuation in place. The raw string
        # avoids Python's invalid-escape warning for \w and \s.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.lower()
    )

    return cleaned

In [4]:
# Read the labelled training CSV and normalise it in a single step.
train_data = clean_data(pd.read_csv('train.csv'))
train_data

In [5]:
# Read the test CSV and apply the same cleaning as the training data.
# NOTE(review): clean_data() calls dropna(), so any test row with a missing
# value is removed and will get no prediction — confirm the expected
# submission format tolerates that.
test_data = clean_data(pd.read_csv('test.csv'))
test_data

In [6]:
# Report (rows, columns) of both cleaned frames.
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

In [7]:
# do some preprocessing 

class IMDbDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Sequence of review texts (list, NumPy array or pandas
            Series/Index).
        ratings: Sequence of numeric ratings aligned with ``reviews``, or
            ``None`` when no labels are available (e.g. the test set).
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every review is padded / truncated to.
    """

    def __init__(self, reviews, ratings, tokenizer, max_length):
        self.reviews = self._as_positional(reviews)
        self.ratings = self._as_positional(ratings)
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _as_positional(values):
        """Return ``values`` in a form where ``values[i]`` is positional.

        pandas objects resolve ``obj[i]`` by *label*, which raises ``KeyError``
        on a shuffled or filtered index (exactly what ``train_test_split``
        returns). Converting them to a NumPy array makes indexing positional;
        lists, arrays and ``None`` are passed through unchanged.
        """
        if values is not None and hasattr(values, 'to_numpy'):
            return values.to_numpy()
        return values

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx``.

        Returns:
            dict with
              * ``'review_text'``: the review as ``str``
              * ``'input_ids'``: flattened (1-D) token-id tensor
              * ``'attention_mask'``: flattened (1-D) attention-mask tensor
              * ``'rating'``: float tensor holding the rating, or ``0`` when
                the dataset was built without ratings.
        """
        review = str(self.reviews[idx])
        rating = self.ratings[idx] if self.ratings is not None else 0

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus``; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'rating': torch.tensor(rating, dtype=torch.float)
        }

# Every review is padded / truncated to this many tokens.
MAX_LENGTH = 256
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Hold out 10% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.
train_reviews, val_reviews, train_ratings, val_ratings = train_test_split(
    train_data['Review'],
    train_data['Rating'],
    test_size=0.1,
    random_state=42,
)

# Pass plain arrays (.values) to the datasets. The test set has no ratings.
train_dataset = IMDbDataset(
    reviews=train_reviews.values,
    ratings=train_ratings.values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
val_dataset = IMDbDataset(
    reviews=val_reviews.values,
    ratings=val_ratings.values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_dataset = IMDbDataset(
    reviews=test_data['Review'].values,
    ratings=None,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [8]:
# modeling 
class IMDbRegressionModel(torch.nn.Module):
    """Thin wrapper around XLM-RoBERTa configured with a single output label.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        # num_labels=1 requests a one-unit classification head.
        self.xlm_roberta = XLMRobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=1,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only its ``logits``."""
        backbone_out = self.xlm_roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return backbone_out.logits

# Instantiate the regressor and move it to the GPU when one is available.
model = IMDbRegressionModel('xlm-roberta-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers
# version used by default, so the optimisation hyper-parameters are unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

def train_epoch(model, data_loader, optimizer, device):
    """Train ``model`` for one pass over ``data_loader`` using MSE loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable (with ``len``) of batches; each batch is a
            mapping with ``'input_ids'``, ``'attention_mask'`` and ``'rating'``
            tensors.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values (``float``).
    """
    model.train()
    # Build the loss module once instead of on every iteration.
    loss_fn = torch.nn.MSELoss()
    total_loss = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        ratings = batch['rating'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # reshape(-1) always yields a 1-D tensor. A bare squeeze() collapses a
        # batch of one to a 0-d tensor, which mismatches ``ratings`` and makes
        # MSELoss fall back to broadcasting (with a warning).
        loss = loss_fn(outputs.reshape(-1), ratings)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` with the R² metric.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable (with ``len``) of batches; each batch is a
            mapping with ``'input_ids'``, ``'attention_mask'`` and ``'rating'``
            tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``r2_score(actual, predictions)`` over all batches.
    """
    model.eval()
    predictions = []
    actual = []
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ratings = batch['rating'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # reshape(-1) keeps a list even for a batch of one. With a bare
            # squeeze(), tolist() returns a plain float for a single-sample
            # batch and list.extend() raises TypeError.
            predictions.extend(outputs.reshape(-1).tolist())
            actual.extend(ratings.reshape(-1).tolist())

    return r2_score(actual, predictions)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 4
best_r2 = -float('inf')
best_model = None

# Train for num_epochs and remember the weights of the epoch with the highest
# validation R².
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_r2 = evaluate(model, val_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val R2: {val_r2:.4f}")

    if val_r2 > best_r2:
        best_r2 = val_r2
        # state_dict() hands back references to the live parameter tensors,
        # which later epochs keep updating in place. Clone them so this is a
        # real snapshot of the best epoch rather than an alias of the final
        # weights.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# best_model stays None if no epoch produced a comparable score (e.g. NaN R²).
if best_model is not None:
    model.load_state_dict(best_model)

In [10]:
# evaluate model
from sklearn.metrics import r2_score

In [11]:
# Predict a rating for every review in the test loader and write them to
# submission.csv (single 'Rating' column, no index).
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # reshape(-1) keeps a list even when the final batch holds a single
        # review; squeeze().tolist() would return a bare float there and
        # extend() would raise TypeError.
        predictions.extend(outputs.reshape(-1).tolist())

submission = pd.DataFrame({'Rating': predictions})

submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

In [12]:
import zipfile
import joblib
import os

# Only export when ChandMidi.ipynb is not already present in the working
# directory. The body is an IPython line magic, so this cell runs only under
# IPython/Jupyter — presumably it writes the session history to that file;
# confirm against the IPython version in use.
if not os.path.exists(os.path.join(os.getcwd(), 'ChandMidi.ipynb')):
    %notebook -e ChandMidi.ipynb

def compress(file_names):
    """Bundle ``file_names`` into ``result.zip`` in the working directory.

    Each name is read relative to the current directory and stored in the
    archive under the same name using DEFLATE compression. An existing
    ``result.zip`` is overwritten. The list of names is echoed to stdout.
    """
    print("File Paths:")
    print(file_names)
    with zipfile.ZipFile("result.zip", mode="w") as archive:
        for member in file_names:
            archive.write(
                './' + member,
                arcname=member,
                compress_type=zipfile.ZIP_DEFLATED,
            )
            
# Files that go into result.zip: the exported notebook and the predictions.
file_names = ['ChandMidi.ipynb', 'submission.csv']
# Write the predictions again so the archived CSV matches `submission`.
submission.to_csv('submission.csv', index=False)
compress(file_names)

In [13]:
# Continue training for two more epochs. best_r2 / best_model carry over from
# the earlier training cell, so a checkpoint is only replaced when the
# validation R² actually improves on the previous best.
num_epochs = 2

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_r2 = evaluate(model, val_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val R2: {val_r2:.4f}")

    if val_r2 > best_r2:
        best_r2 = val_r2
        # state_dict() returns references to the live parameter tensors;
        # clone them so the snapshot is not overwritten by later epochs.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# Guard against there being no recorded checkpoint at all.
if best_model is not None:
    model.load_state_dict(best_model)

In [14]:
# Re-run test-set prediction with the further-trained model and overwrite
# submission.csv (single 'Rating' column, no index).
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # reshape(-1) keeps a list even when the final batch holds a single
        # review; squeeze().tolist() would return a bare float there and
        # extend() would raise TypeError.
        predictions.extend(outputs.reshape(-1).tolist())

submission = pd.DataFrame({'Rating': predictions})

submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

In [15]:
import zipfile
import joblib
import os

# Only export when ChandMidi.ipynb is not already present in the working
# directory. The body is an IPython line magic, so this cell runs only under
# IPython/Jupyter — presumably it writes the session history to that file;
# confirm against the IPython version in use.
if not os.path.exists(os.path.join(os.getcwd(), 'ChandMidi.ipynb')):
    %notebook -e ChandMidi.ipynb

def compress(file_names):
    """Bundle ``file_names`` into ``result.zip`` in the working directory.

    Each name is read relative to the current directory and stored in the
    archive under the same name using DEFLATE compression. An existing
    ``result.zip`` is overwritten. The list of names is echoed to stdout.
    """
    print("File Paths:")
    print(file_names)
    with zipfile.ZipFile("result.zip", mode="w") as archive:
        for member in file_names:
            archive.write(
                './' + member,
                arcname=member,
                compress_type=zipfile.ZIP_DEFLATED,
            )
            
# Files that go into result.zip: the exported notebook and the predictions.
file_names = ['ChandMidi.ipynb', 'submission.csv']
# Write the predictions again so the archived CSV matches `submission`.
submission.to_csv('submission.csv', index=False)
compress(file_names)

In [16]:
import zipfile
import joblib
import os

# Only export when ChandMidi.ipynb is not already present in the working
# directory. The body is an IPython line magic, so this cell runs only under
# IPython/Jupyter — presumably it writes the session history to that file;
# confirm against the IPython version in use.
if not os.path.exists(os.path.join(os.getcwd(), 'ChandMidi.ipynb')):
    %notebook -e ChandMidi.ipynb

def compress(file_names):
    """Bundle ``file_names`` into ``result.zip`` in the working directory.

    Each name is read relative to the current directory and stored in the
    archive under the same name using DEFLATE compression. An existing
    ``result.zip`` is overwritten. The list of names is echoed to stdout.
    """
    print("File Paths:")
    print(file_names)
    with zipfile.ZipFile("result.zip", mode="w") as archive:
        for member in file_names:
            archive.write(
                './' + member,
                arcname=member,
                compress_type=zipfile.ZIP_DEFLATED,
            )
            
# Files that go into result.zip: the exported notebook and the predictions.
file_names = ['ChandMidi.ipynb', 'submission.csv']
# Write the predictions again so the archived CSV matches `submission`.
submission.to_csv('submission.csv', index=False)
compress(file_names)