<a href="https://colab.research.google.com/github/AndreassOlsson/HuggingFace/blob/main/movierating_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict movie ratings on a self-scraped dataset


# Scraping & saving the data


In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, element
import requests
import lxml
import re
import pickle

In [None]:
# Mount Google Drive at /content/drive so the scraped data can be persisted there.
# NOTE(review): the relative 'drive/MyDrive/...' paths used below presumably rely on
# the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def scrape(num_pages=1,
           output_path='drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/df.pkl',
           timeout=30):
  """Scrape review titles and star ratings from moviezine.se and pickle them.

  Args:
    num_pages: Number of listing pages to fetch; pages 0 .. num_pages - 1 are
      requested one after another.
    output_path: File the resulting DataFrame is pickled to. Defaults to the
      Drive location the rest of the notebook reads from.
    timeout: Seconds to wait for each HTTP response before requests raises,
      so a single stalled page cannot hang the whole run.

  Returns:
    None. A DataFrame with one row per article and the object-typed columns
    'title' and 'rating' is written to output_path. 'title' is None when the
    article does not contain exactly one 'h3 > span' element; 'rating' is None
    when the article has no 'div.rating > ul' element.
  """
  # Loop-invariant, so build it once instead of once per page.
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

  records = []
  for page in range(num_pages):
    url = "https://www.moviezine.se/recensioner?page=" + str(page)
    res = requests.get(url, headers=headers, timeout=timeout)
    if not res.ok:
      # An error page contains no articles; make the gap visible instead of
      # silently parsing it as "zero reviews on this page".
      print(f"Skipping page {page}: HTTP {res.status_code}")
      continue
    soup = BeautifulSoup(res.content, 'lxml')

    for article in soup.select('div.article_info'):
      title = article.select('h3 > span')
      title = title[0].text.strip() if len(title) == 1 else None

      ratingDiv = article.select('div.rating > ul')
      if len(ratingDiv) > 0:
        # The rating is the number of child tags whose last CSS class is
        # 'mz_star_on'. Tags without a (non-empty) class attribute are simply
        # not counted instead of raising KeyError/IndexError.
        rating = sum(
            1
            for el in ratingDiv[0]
            if isinstance(el, element.Tag)
            and (el.get('class') or [''])[-1] == 'mz_star_on'
        )
      else:
        rating = None

      records.append({'title': title, 'rating': rating})

  # dtype=object keeps ratings as Python ints / None (no NaN-float coercion),
  # so downstream column names derived from the rating values stay the same.
  df = pd.DataFrame(records, columns=['title', 'rating'], dtype=object)
  df.to_pickle(output_path)

# Fetch listing pages 0-499 (one HTTP request per page) and overwrite the pickled dataset.
scrape(num_pages=500)

# Preparing the data

## Loading dataframe

In [None]:
# Mount Google Drive so the pickled dataset can be read in a fresh session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Unpickle the scraped dataset (columns 'title' and 'rating') written by scrape().
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
import pandas as pd
md = pd.read_pickle('drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/df.pkl')

## One-hot encoding dataframe

In [None]:
def encode(row):
  """Return a 5-element one-hot list: 1.0 at the slot matching row['rating'] (1-5), else 0.0."""
  one_hot = []
  for star in range(1, 6):
    one_hot.append(1.0 if star == row['rating'] else 0.0)
  return one_hot

def ohe(row):
  """Return a cumulative 5-element encoding of row['rating'].

  Despite the name this is not a one-hot vector: every slot up to and
  including the rating is 1.0, e.g. a rating of 3 gives [1, 1, 1, 0, 0].
  """
  return [float(level <= row['rating']) for level in range(1, 6)]

# Exact one-hot target, stored as a Python list per row.
md['rating_encoded_asList'] = md.apply(encode, axis=1)

# One indicator column per distinct rating value, named r_<rating>.
y = pd.get_dummies(md.rating, prefix='r')
# Drop indicator columns left over from an earlier run of this cell first, so
# re-running it replaces them instead of appending duplicate r_* columns.
md = pd.concat([md.drop(columns=y.columns, errors='ignore'), y], axis=1)

def subtract(row):
  """Return row['rating'] as an int shifted down by one (a zero-based class index)."""
  one_based = int(row['rating'])
  return one_based - 1

# Integer label (rating - 1); this is the column later used as the training label.
md['rating_minus1'] = md.apply(subtract, axis=1)

# Cumulative encoding produced by ohe(); kept as an alternative label format.
md['rating_ohe'] = md.apply(ohe, axis=1)

md

Unnamed: 0,title,rating,rating_encoded_asList,r_1,r_2,r_3,r_4,r_5,rating_minus1,rating_ohe
0,Stiligt men ljummet svartsjukedrama,3,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,2,"[1.0, 1.0, 1.0, 0.0, 0.0]"
1,Vietnamkriget som collegekomedi,3,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,2,"[1.0, 1.0, 1.0, 0.0, 0.0]"
2,"Retrodoftande ""whodunnit""",3,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,2,"[1.0, 1.0, 1.0, 0.0, 0.0]"
3,Clooney och Roberts bråkar på Bali,2,"[0.0, 1.0, 0.0, 0.0, 0.0]",0,1,0,0,0,1,"[1.0, 1.0, 0.0, 0.0, 0.0]"
4,Makaber melankoli och mörk humor,4,"[0.0, 0.0, 0.0, 1.0, 0.0]",0,0,0,1,0,3,"[1.0, 1.0, 1.0, 1.0, 0.0]"
...,...,...,...,...,...,...,...,...,...,...
6356,Fnittrig sommarfilm,3,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,2,"[1.0, 1.0, 1.0, 0.0, 0.0]"
6357,Football Factory på skotska,3,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,2,"[1.0, 1.0, 1.0, 0.0, 0.0]"
6358,Kladdig tonårsrysare,2,"[0.0, 1.0, 0.0, 0.0, 0.0]",0,1,0,0,0,1,"[1.0, 1.0, 0.0, 0.0, 0.0]"
6359,Drama om själens vikt,3,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,2,"[1.0, 1.0, 1.0, 0.0, 0.0]"


In [None]:
# Class balance check: number of reviews per rating.
md.rating.value_counts()

3    2192
4    1972
2    1387
5     411
1     399
Name: rating, dtype: int64

## Splitting dataframe

In [None]:
# Fixed seed so the 70/30 split (and everything trained on it) is reproducible
# across kernel restarts.
train = md.sample(frac=0.7, random_state=42)
# Held-out rows: every review whose title was not sampled into train. Because the
# filter is by title, a row that shares its title with a training row is left
# out of both sets.
test = md[~md.title.isin(train.title)]

## Convert dataframe to dataset

In [None]:
!pip install datasets

In [None]:
# Convert the pandas splits to Hugging Face Dataset objects.
from datasets import Dataset

# The DataFrame index of the sampled rows ends up as an extra
# '__index_level_0__' column (it shows up in the features list printed later).
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

## Tokenize dataset

In [None]:
!pip install transformers

In [None]:
# Tokenize datasets
# Load the tokenizer belonging to the same checkpoint that is fine-tuned below.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased")

In [None]:
def tokenize_function(example):
  """Tokenize the 'title' field of an example (or batch of examples).

  Every sequence is padded to 512 tokens so all rows share one shape.
  Truncation is enabled so an over-long title is cut to 512 tokens as well
  instead of producing a longer row than the padded ones; titles that already
  fit are tokenized exactly as before.

  Args:
    example: Mapping with a 'title' entry (a string, or a list of strings when
      called through Dataset.map(..., batched=True)).

  Returns:
    The tokenizer output for the title(s).
  """
  return tok(example['title'], padding='max_length', max_length=512, truncation=True)

# batched=True hands tokenize_function whole batches, so example['title'] is a list of titles.
train_tok = train_dataset.map(tokenize_function, batched=True)
test_tok = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Persist the tokenized splits to Drive so the training section can reload them
# without repeating the tokenization.
train_tok.save_to_disk('drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/tr')
test_tok.save_to_disk('drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/ts')

# Transforming tokenized datasets to TF format, loading, compiling and training TF model

In [None]:
# Mount Google Drive so the tokenized datasets can be loaded and checkpoints written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets

In [None]:
# Load the tokenized train/test splits saved to Drive by the preparation section.
from datasets import load_from_disk

train_tok = load_from_disk('drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/tr')
test_tok = load_from_disk('drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/ts')


In [None]:
!pip install transformers

In [None]:
from transformers import DefaultDataCollator

# Collator that returns TensorFlow tensors; passed as collate_fn when the
# datasets are converted with to_tf_dataset.
data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
# Inspect the available columns and the row count of the training split.
train_tok

Dataset({
    features: ['title', 'rating', 'rating_encoded_asList', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5', 'rating_minus1', 'rating_ohe', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4453
})

In [None]:
def _as_tf_dataset(tokenized, shuffle):
  """Wrap a tokenized split as a batched tf.data dataset with integer labels."""
  return tokenized.to_tf_dataset(
      columns=["input_ids", "token_type_ids", "attention_mask"],
      # Use ['rating_ohe'] here instead for the list-valued label variant.
      label_cols=['rating_minus1'],
      shuffle=shuffle,
      collate_fn=data_collator,
      batch_size=8,
  )

# Only the training split is shuffled; the test split keeps its row order.
tf_train = _as_tf_dataset(train_tok, shuffle=True)
tf_test = _as_tf_dataset(test_tok, shuffle=False)

In [None]:
# Inspect the element spec (feature shapes/dtypes and label shape) of the training pipeline.
tf_train

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
# from transformers import TFBertForSequenceClassification

# model = TFBertForSequenceClassification.from_pretrained("KB/bert-base-swedish-cased", problem_type="multi_label_classification", num_labels=5)

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/658M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoConfig, TFAutoModelForSequenceClassification

# Load the pretrained checkpoint with a classification head sized for 5 labels
# (one per rating); the head is newly initialized and has to be trained.
model = TFAutoModelForSequenceClassification.from_pretrained("KB/bert-base-swedish-cased", num_labels=5)

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/658M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import tensorflow as tf

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-6),
  # Alternative loss/metric, presumably for the list-valued 'rating_ohe' labels; kept for reference:
  #  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
  #  metrics=tf.keras.metrics.BinaryCrossentropy(from_logits=False),
    # Labels are integer class ids ('rating_minus1'); from_logits=True because the
    # loss is applied to the model's raw logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [None]:
import tensorflow as tf

# Save the model's weights to Drive during training.
import os
checkpoint_path = "drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/single-label/"

# NOTE(review): checkpoint_dir is not used by any cell visible in this notebook.
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves only the model's weights (not the full model).
# The path contains no epoch placeholder, so each save presumably overwrites
# the previous one — confirm if per-epoch checkpoints are wanted.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

# Train for 5 epochs with the checkpoint callback; the test split doubles as validation data.
model.fit(tf_train, validation_data=tf_test, epochs=5, callbacks=[cp_callback])  # Pass callback to training

# Performing Inference

In [None]:
# Mount Google Drive so the saved model weights can be loaded for inference.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Rebuild the tokenizer and a 5-label model of the same architecture as in training.
# At this point the classification head is untrained; the fine-tuned weights
# are applied by the load_weights cell that follows.
tok = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased")
model = TFAutoModelForSequenceClassification.from_pretrained("KB/bert-base-swedish-cased", num_labels=5)

def tokenize_function(example):
  """Tokenize the 'title' field of an example for inference.

  Titles are padded to 512 tokens, the same fixed length used for the training
  data. Truncation is enabled so an over-long title is cut to 512 tokens too
  rather than yielding a longer sequence than the padded ones; titles that
  already fit are tokenized exactly as before.

  Args:
    example: Mapping with a 'title' entry (a string, or a list of strings when
      mapped in batched mode).

  Returns:
    The tokenizer output for the title(s).
  """
  return tok(example['title'], padding='max_length', max_length=512, truncation=True)

In [None]:
# The model instantiated above has the same architecture as the trained one.
# Load the fine-tuned weights saved on Google Drive during training and apply them to it.
import tensorflow as tf

checkpoint_path = "drive/MyDrive/Andreas Olsson/Huggingface/hf4_files-and-model-weights/single-label/"
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa4d9e67290>

In [None]:
import pandas as pd
import datasets
import numpy as np

def predict_rating(movieTitles_list):
  """Print the predicted rating (1-5) for each review title.

  Args:
    movieTitles_list: Iterable of review title strings.

  Returns:
    None. One line per title is printed, in input order. Nothing is printed
    (and the model is not called) when no titles are given.
  """
  titles = list(movieTitles_list)
  if not titles:
    # An empty frame would have no 'title' column and tokenization would fail.
    return

  df = pd.DataFrame({'title': titles})
  # Named `tf_inputs` rather than `input` to avoid shadowing the builtin.
  tf_inputs = datasets.Dataset.from_pandas(df).map(tokenize_function).to_tf_dataset(
      columns=["input_ids", "token_type_ids", "attention_mask"],
      batch_size=4,
      # Predictions are matched back to titles by position, so order must be kept.
      shuffle=False,
  )
  logits = model.predict(tf_inputs).logits
  for title, p in zip(titles, logits):
    # argmax gives the zero-based class index; + 1 maps it back to a 1-5 rating.
    print(f"\nFilm: {title}, Gissat betyg: {np.argmax(p) + 1}")

In [None]:
# Hand-written Swedish review blurbs used as a quick smoke test of the fine-tuned model.
movies = [
    'Starkt och fängslande drama om att växa upp som mobbad i Sverige',
    'En extremt rolig komedi som alltid får dig att skratta',
    'Tråkig komedi som inte alls håller måttet',
    'Kanske den sämsta filmen som någonsin har skapats',
    'En kritikerrosad roman om ett ungt par som blir kära',
    'En bra äventyrsfilm men som saknar det där lilla extra'
]
predict_rating(movies)

  0%|          | 0/6 [00:00<?, ?ex/s]


Film: Starkt och fängslande drama om att växa upp som mobbad i Sverige, Gissat betyg: 4

Film: En extremt rolig komedi som alltid får dig att skratta, Gissat betyg: 5

Film: Tråkig komedi som inte alls håller måttet, Gissat betyg: 2

Film: Kanske den sämsta filmen som någonsin har skapats, Gissat betyg: 1

Film: En kritikerrosad roman om ett ungt par som blir kära, Gissat betyg: 4

Film: En bra äventyrsfilm men som saknar det där lilla extra, Gissat betyg: 2
