In [1]:
!pip install opendatasets



In [2]:
import opendatasets as od
import pandas as pd
import time
from pathlib import Path
import numpy as np
from functools import reduce
from joblib import Parallel, delayed

In [3]:
dataset_path = Path('imdb-user-reviews', 'song_lyrics.csv')
if not dataset_path.is_file():
    od.download('https://www.kaggle.com/datasets/sadmadlad/imdb-user-reviews')

Skipping, found downloaded files in "./imdb-user-reviews" (use force=True to force download)


In [4]:
import json


n, mean, M2 = 0, 0.0, 0
for path in Path('imdb-user-reviews').glob('**/*'):
    if path.is_file() and path.suffix == '.json':
        with open(path, 'r') as f:
            info = json.load(f)
        score = float(info['movieIMDbRating'])
        n += 1
        delta = score - mean
        mean += delta / n
        M2 += delta * (score - mean)

print(mean, (M2 / n) ** (1/2))

8.03 1.051712888577486


In [5]:
result = []
for path in Path('imdb-user-reviews').glob('**/*'):
    if path.is_file() and path.suffix == '.json':
        with open(path, 'r') as f:
            info = json.load(f)
        score = float(info['movieIMDbRating'])
        result.append(score)
print(sum(result)/len(result))

8.030000000000001


In [6]:
def preprocessing(x):
    x = x.strip()
    if isinstance(x, str):
        if len(x)<=2:
            return int(x)
        else:
            return np.nan
    if isinstance(x, int) or isinstance(x, float):
        return x
    else:
        return np.nan

In [7]:
def chunkify(list_of_strings, number_of_chunks=30):
    step = len(list_of_strings) // number_of_chunks
    if step != 0:
        for i in range(0, len(list_of_strings), step):
            yield list_of_strings[i : i + step]
    else:
        yield list_of_strings

In [8]:
def chunks_mapper(data):
    return (len(data), np.sum(data))

def reducer(data_1, data_2):
    return (np.sum(data_1[0])+np.sum(data_2[0]), np.sum(data_1[1])+np.sum(data_2[1]))

In [9]:
%%time
result = []
for path in Path('imdb-user-reviews').glob('*/*.csv'):

     with open(path, 'r') as f:
          data = pd.read_csv(f)
          data = data["User's Rating out of 10"].apply(preprocessing)
          data = data[~data.isna()].values
          data_chunk = chunkify(data, 30)
          mapped = map(chunks_mapper, data_chunk)
          data = reduce(reducer, mapped)
          result.append(data)       
result_sum = np.sum([x[1] for x in result])
result_len_sum = np.sum([x[0] for x in result])
result = result_sum/result_len_sum 

CPU times: user 290 ms, sys: 50.1 ms, total: 340 ms
Wall time: 408 ms


In [10]:
print(f"Cредний балл фильмов:{result:.2f}")

Cредний балл фильмов:8.08


In [11]:
%%time
result = []
for path in Path('imdb-user-reviews').glob('*/*.csv'):

     with open(path, 'r') as f:
          data = pd.read_csv(f)
          data = data["User's Rating out of 10"].apply(preprocessing)
          data = data[~data.isna()].values
          data_chunk = chunkify(data, 30)
          mapped = Parallel(n_jobs=5)(delayed(chunks_mapper)(chunk) for chunk in data_chunk)
          #mapped = map(chunks_mapper, data_chunk)
          data = reduce(reducer, mapped)
          result.append(data)
result_sum = np.sum([x[1] for x in result])
result_len_sum = np.sum([x[0] for x in result])
result = result_sum/result_len_sum

CPU times: user 374 ms, sys: 90.8 ms, total: 465 ms
Wall time: 598 ms


In [12]:
print(f"Cредний балл фильмов:{result:.2f}")

Cредний балл фильмов:8.08


In [13]:
from multiprocessing import Pool

In [14]:
%%time
result = []
for path in Path('imdb-user-reviews').glob('*/*.csv'):

     with open(path, 'r') as f:
          data = pd.read_csv(f)
          data = data["User's Rating out of 10"].apply(preprocessing)
          data = data[~data.isna()].values
          data_chunk = chunkify(data, 30)
          with Pool(5) as p:
            mapped = p.map(chunks_mapper, data_chunk)
            data = reduce(reducer, mapped)
            result.append(data)
result_sum = np.sum([x[1] for x in result])
result_len_sum = np.sum([x[0] for x in result])
result = result_sum/result_len_sum

CPU times: user 364 ms, sys: 182 ms, total: 546 ms
Wall time: 569 ms


In [15]:
print(f"Cредний балл фильмов:{result:.2f}")

Cредний балл фильмов:8.08


# вывод:
* последователное выполнение кода показала лучший результат, \
это обусловлдено небольшим объемом данных который возможно поместить в операционную памать,\
что нивелирует использование быблиотек паральльной обработки данных.
