In [2]:
import regex as re
from langdetect import detect
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import torch
import random
from transformers import DistilBertModel, DistilBertTokenizer, BertModel, BertTokenizer
import joblib

# Reproducibility: one shared seed feeds every random number generator in use.
seed = 0
random.seed(seed)        # Python standard-library RNG
torch.manual_seed(seed)  # PyTorch RNG
np.random.seed(seed)     # NumPy global RNG

# English stop-word list from NLTK; clean_text drops tokens that appear in it.
stop = stopwords.words('english')

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [3]:
def clean_text(text):
    """Normalise an English comment before it is tokenised.

    Every character other than ASCII letters, digits, whitespace and
    ``. , ! ?`` is removed, the result is lower-cased, and whitespace-separated
    tokens found in the module-level ``stop`` list are dropped.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment, with the remaining tokens joined by single spaces.

    Raises:
        AssertionError: If ``detect`` does not report the language as ``'en'``.
    """
    # Explicit raise instead of an ``assert`` statement: asserts are compiled
    # away under ``python -O``, which would silently disable this check.
    # AssertionError is kept so existing callers see the same exception type.
    if detect(text) != 'en':
        raise AssertionError('Text is not in English')

    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text).lower()
    # Tokens keep any attached punctuation (e.g. "is,"), so only exact matches
    # against the stop list are removed.
    kept_words = [word for word in cleaned_text.split() if word not in stop]
    return ' '.join(kept_words)

In [8]:
def get_embeddings(sample_comments, model, tokenizer):
    """Embed comments as the model's hidden state at sequence position 0.

    Each comment is cleaned with ``clean_text``, encoded with ``tokenizer``,
    right-padded to the longest sequence in the batch and passed through
    ``model`` in a single forward pass with gradients disabled.

    Args:
        sample_comments: Iterable of raw comment strings.
        model: Callable taking ``(input_ids, attention_mask=...)`` whose first
            output is indexed as ``[batch, position, feature]``.
        tokenizer: Object whose ``encode`` method returns a list of token ids.
            Its ``pad_token_id`` is used for padding when available.

    Returns:
        A NumPy array with one row per comment, holding the features the model
        produced at position 0 (the ``[CLS]`` token for BERT-style tokenizers).

    Raises:
        ValueError: If ``sample_comments`` is empty.
        AssertionError: Propagated from ``clean_text`` for non-English text.
    """
    cleaned_comments = [clean_text(comment) for comment in sample_comments]
    if not cleaned_comments:
        # An empty batch would otherwise reach the model as a 1-D float array
        # and fail with an obscure indexing error.
        raise ValueError('sample_comments must contain at least one comment')

    # truncation=True caps each sequence at the tokenizer's configured maximum
    # length so an over-long comment cannot overflow the encoder.
    tokenized = [
        tokenizer.encode(comment, add_special_tokens=True, truncation=True)
        for comment in cleaned_comments
    ]
    max_len = max(len(token_ids) for token_ids in tokenized)

    # Fall back to 0 (the BERT/DistilBERT pad id) when the tokenizer does not
    # define one.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    # Build ids and mask from the sequence lengths, with an explicit int64
    # dtype, rather than inferring the mask from "id != 0".
    batch_size = len(tokenized)
    input_ids = torch.full((batch_size, max_len), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((batch_size, max_len), dtype=torch.long)  # 0 means ignore
    for row, token_ids in enumerate(tokenized):
        length = len(token_ids)
        input_ids[row, :length] = torch.tensor(token_ids, dtype=torch.long)
        attention_mask[row, :length] = 1

    # Keep the inputs on the same device as the model when it exposes one.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    embeddings = last_hidden_states[0][:, 0, :].cpu().numpy()
    return embeddings

In [9]:
def pipeline(comments, model_path):
    """Classify the sentiment of each comment.

    Comments are embedded with the pretrained ``distilbert-base-uncased``
    encoder, and the embeddings are passed to the estimator loaded from
    ``model_path``.

    Args:
        comments: Comments to classify.
        model_path: Path of the joblib-serialised estimator. Its ``predict``
            output must use the labels -1, 0 and 1.

    Returns:
        A DataFrame with a ``comment`` column and a ``sentiment`` column
        holding 'Negative', 'Neutral' or 'Positive'.
    """
    checkpoint = 'distilbert-base-uncased'
    label_names = {-1: 'Negative', 1: 'Positive', 0: 'Neutral'}

    # Text encoder: turns each comment into a fixed-size feature vector.
    bert_tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    bert_encoder = DistilBertModel.from_pretrained(checkpoint)
    features = get_embeddings(comments, bert_encoder, bert_tokenizer)

    # NOTE(review): joblib.load unpickles the file and can therefore execute
    # arbitrary code; only point model_path at files from a trusted source.
    classifier = joblib.load(model_path)
    sentiments = [label_names[label] for label in classifier.predict(features)]

    return pd.DataFrame({'comment': comments, 'sentiment': sentiments})

In [19]:
# Single example comment used to smoke-test the pipeline.
sample_comments = ["This movie is soso"]

In [20]:
# Classify the sample comments with the estimator stored in sentiment_analysis_model.pkl.
df = pipeline(sample_comments, model_path='sentiment_analysis_model.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [21]:
# Show the comment/sentiment table returned by the pipeline.
df

Unnamed: 0,comment,sentiment
0,This movie is soso,Positive
