<a href="https://colab.research.google.com/github/DmitriyValetov/nlp_course_project/blob/master/rs_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RossiyaSegodnya Dataset for PyTorch
[github repository](https://github.com/RossiyaSegodnya/ria_news_dataset)

## Raw

Full dataset: 1,003,869 news articles

https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz

Sample: 20 news articles

https://raw.githubusercontent.com/RossiyaSegodnya/ria_news_dataset/master/ria_20.json

Sample: 1,000 news articles

https://raw.githubusercontent.com/RossiyaSegodnya/ria_news_dataset/master/ria_1k.json

## Processed
*   html parsing (BeautifulSoup)
*   split into sentences (nltk)
*   split into words (nltk)
*   str.isalnum() (python)

https://drive.google.com/open?id=1-UtATnzLE809Vi6RLgy3GRHX2TXRzhd6

*   html parsing (BeautifulSoup)
*   split into sentences (nltk)
*   split into words (nltk)
*   str.isalnum() (python)
*   stopwords (nltk)

https://drive.google.com/open?id=1bhsdkXYEe4qixPddK9DkaQ-7z0jAn5Bi

*   html parsing (BeautifulSoup)
*   split into sentences (nltk)
*   split into words (nltk)
*   str.isalnum() (python)
*   stopwords (nltk)
*   lemmatization (pymorphy2)

*   html parsing (BeautifulSoup)
*   split into sentences (nltk)
*   split into words (nltk)
*   str.isalnum() (python)
*   stopwords (nltk)
*   stemming (nltk snowball)


# Download

## Raw

In [0]:
import requests
import os

# Download one raw RIA news file into the working directory.
# Pick exactly one source URL; the alternatives are kept for convenience.
# url = "https://raw.githubusercontent.com/RossiyaSegodnya/ria_news_dataset/master/ria_20.json"
url = "https://raw.githubusercontent.com/RossiyaSegodnya/ria_news_dataset/master/ria_1k.json"
# url = "https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz"
fn, ext = os.path.splitext(os.path.basename(url))
print(f'downloading {fn + ext}')
# Stream to disk in chunks so the full archive is never held in memory at once.
with requests.get(url, stream=True) as r:
  # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
  r.raise_for_status()
  with open(fn + ext, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1 << 20):
      f.write(chunk)
# NOTE: requests only undoes HTTP Content-Encoding; a downloaded .gz *file* stays
# compressed on disk. Uncomment to unpack it next to the archive.
# if ext == '.gz':
#   print(f'decompressing from {fn + ext} to {fn}')
#   import gzip
#   import shutil
#   with gzip.open(fn + ext, 'rb') as gz_file:
#     with open(fn, 'wb') as json_file:
#       shutil.copyfileobj(gz_file, json_file)

downloading ria_1k.json


## Processed

In [0]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
# NOTE: `google.colab` is imported here, so this cell needs the Colab runtime.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# The resulting `drive` object is what the download cell uses via drive.CreateFile().
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Download a file based on its file ID.
# Requires the PyDrive client `drive` created in the authentication cell.
# Known IDs (see the "Processed" links at the top of the notebook):
# 1-UtATnzLE809Vi6RLgy3GRHX2TXRzhd6  # norm
# 1bhsdkXYEe4qixPddK9DkaQ-7z0jAn5Bi  # stop
file_id = '1-UtATnzLE809Vi6RLgy3GRHX2TXRzhd6'
downloaded = drive.CreateFile({'id': file_id})
# Metadata is fetched first so the file's Drive title can be used as the local name.
downloaded.FetchMetadata()
fn = downloaded.metadata['title']
print(f'downloading: {fn}')
# Saves the file content to the working directory under its Drive title.
downloaded.GetContentFile(fn)

# Process (skip this section if you downloaded an already processed dataset above)


In [0]:
# One-time setup of the NLP resources used by the processing cell below.
import nltk
nltk.download('punkt')  # for sentence tokenization
nltk.download('stopwords')  # stop-word lists, read via nltk.corpus.stopwords
# pymorphy2 provides the MorphAnalyzer used for lemmatization.
!pip install pymorphy2

In [0]:
import json
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from pymorphy2 import MorphAnalyzer
import gzip
import os

# Resumable pass over a JSON-lines dataset: for every sample, drop Russian stop
# words and lemmatize the remaining tokens of each 'text' / 'title' sentence,
# appending one JSON string per sample to the output file.

morph = MorphAnalyzer()  # lemmatizer
snow = SnowballStemmer('russian')  # Porter stemmer doesn't work with russian
# A set gives constant-time membership tests; the list was scanned once per token.
stop = set(stopwords.words('russian'))
# Files are not clear json files. They contain json strings line by line.
fn = 'norm_sents_ria.json.gz'  # input path
# NOTE: despite the '.gz' suffix this file is written as plain text below; the
# compression cell appends another '.gz' when it builds the real archive.
nfn = 'stop_lem_norm_sents_ria.json.gz'  # processed path
total_samples = 1003869  # size of the full dataset, used only for the progress bar


def lemmatize_sentences(sentences):
  """Return each sentence with stop words removed and remaining tokens lemmatized.

  Sentences are expected to be already normalized strings; tokens are obtained
  by whitespace splitting and joined back with single spaces.
  """
  return [
      ' '.join(morph.parse(w)[0].normal_form for w in s.split() if w not in stop)
      for s in sentences
  ]


# Other variants used for the published datasets (swap into lemmatize_sentences):
#   STOP:        ' '.join(w for w in s.split() if w not in stop)
#   STOP + SNOW: ' '.join(snow.stem(w) for w in s.split() if w not in stop)
#   From raw HTML: text = BeautifulSoup(n['text']).get_text(), then
#     sent_tokenize(text) / word_tokenize(s) filtered with w.isalnum()

# Count samples that were already written so an interrupted run can resume.
cnt = 0
last_line = None
if os.path.exists(nfn):
  with open(nfn, encoding='utf-8') as nf:
    for last_line in nf:
      cnt += 1
  # An existing but empty output file has no last sample to show.
  if last_line is not None:
    print(json.loads(last_line))  # print last sample
print(f'start from {cnt + 1} sample')
_, ext = os.path.splitext(fn)
f = gzip.open(fn, 'rb') if ext == '.gz' else open(fn, encoding='utf-8')
with f:
  for _ in range(cnt):  # skip already processed samples
    next(f)
  # Explicit UTF-8: ensure_ascii=False writes Cyrillic characters as-is.
  with open(nfn, 'a', encoding='utf-8') as nf:
    for line in tqdm(f, initial=cnt, total=total_samples):
      n = json.loads(line)
      norm_text_ss = lemmatize_sentences(n['text'])
      norm_title_ss = lemmatize_sentences(n['title'])
      json_str = json.dumps({'text': norm_text_ss, 'title': norm_title_ss}, ensure_ascii=False)
      nf.write(json_str + '\n')
with open(nfn, 'rb') as f:  # first check
  print(json.loads(next(f)))
with open(nfn, 'rb') as f:  # all check: every line must parse as JSON
  for line in tqdm(f):
    json.loads(line)

In [0]:
import gzip
import shutil

# Compress the processed JSON-lines file and verify the resulting archive.
# Relies on `nfn`, `json` and `tqdm` from the processing cell.
cfn = nfn + '.gz'
# compress normalized dataset
print(f'compressing {nfn} to {cfn}')
with gzip.open(cfn, 'wb') as gz_file:
  with open(nfn, 'rb') as json_file:
    shutil.copyfileobj(json_file, gz_file)

with gzip.open(cfn, 'rb') as f:  # first check
  print(json.loads(next(f)))
# Validate the archive that was just written (cfn); the previous version
# re-read the input file `fn`, so the compressed output was never checked.
with gzip.open(cfn, 'rb') as f:  # all check
  for line in tqdm(f):
    json.loads(line)

In [0]:
# Option 1 (disabled): upload the archive through the PyDrive client.
# uploaded = drive.CreateFile({'title': cfn})
# uploaded.SetContentFile(cfn)
# uploaded.Upload()

# Option 2: copy the archive into a mounted Google Drive folder.
# NOTE(review): the mount below is commented out, so the copy assumes Drive is
# already mounted at /content/drive. Uncommenting the import also rebinds the
# name `drive`, shadowing the PyDrive client created earlier.
# from google.colab import drive
# drive.mount('/content/drive')

from shutil import copyfile
# `cfn` is the compressed file name defined in the compression cell.
copyfile(cfn, '/content/drive/My Drive/' + cfn)