## Terminal Commands

In [108]:
#!pip install pandas
#!pip install pyspark
#!pip install nltk
!pip install tqdm
!pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0-py2.py3-none-any.whl
Collecting scikit-learn
  Using cached scikit_learn-0.24.2-cp36-cp36m-macosx_10_13_x86_64.whl (7.2 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn, sklearn
Successfully installed scikit-learn-0.24.2 sklearn-0.0 threadpoolctl-3.1.0


## Imports & Dependencies

In [110]:
import argparse
import pandas as pd
import os
from collections import defaultdict, Counter
import json
import numpy as np
import nltk
from dateutil.parser import parse
from joblib import Parallel, delayed
from sklearn.utils import class_weight
from tqdm import tqdm

import tensorflow as tf

## Global vars

In [20]:
CATEGORIES_TO_IGNORE = ['bolig', 'abonnement', 'xyz 42 testseksjon', 'header_1']
SITES_TO_IGNORE = ['kundeservice.adressa.no']
PAD_TOKEN = '<PAD>'
UNFREQ_TOKEN= '<UNF>'

## Contextual Vars

In [29]:
path_to_article_folder = "home/lemeiz/content_refine/"

## Loaders

In [71]:
with open(os.path.join(path_to_article_folder, os.listdir(path_to_article_folder)[0]), 'r', encoding='utf-8') as f:
    line = f.readline()
    content_raw = json.loads(line)
    dict_content = {}
    for key in content_raw:
        if key == 'fields':
            for json_field in content_raw['fields']:
                value = json_field['value']
                if json_field['field'] == 'body':
                    value = ' '.join(value)
                dict_content[json_field['field']] = value
        dict_content[key] = content_raw[key]
    
    print(dict_content['body'])


Saken oppdateres. Det er Trøndelag politidistrikt som klokken 15.50 melder om at to barn er observert over tunnelåpningen Frøyatunnelen på Frøya-siden. Ifølge Twitter-meldingen er det ingen sikring på stedet og fare for at barna kan falle ned i veien. Politipatruljen fant ingen barn da de kom til stedet, kun fotspor, opplyser politiet på Twitter klokken 16.00. Nå advarer politet mot å leke i dette området. - Dette er ingen lekeplass, skriver politiet.


In [86]:
def parse_json_to_dict(line:str) -> dict:
    content_raw = json.loads(line)
    dict_content = {}
    for key in content_raw:
        if key == 'fields':
            for json_field in content_raw['fields']:
                value = json_field['value']
                if json_field['field'] == 'body':
                    value = ' '.join(value)
                dict_content[json_field['field']] = value
        else:
            dict_content[key] = content_raw[key]
    
    return dict_content

In [98]:
def article_information_parse(line:str) -> dict:
    def unique_list_if_str(value):
        if type(value) == list:
            return value
        else:
            return [value]
        
        
    #content_raw = parse_json_to_dict(line)
    content_raw = defaultdict(str, parse_json_to_dict(line))
    
    publishtime = content_raw['publishtime'] if content_raw['publishtime'] != '' else content_raw['createtime'] 
    #Converting to unix timestamp in miliseconds
    publishtime_ts = int(parse(publishtime).timestamp()) * 1000
    
    author_1st = content_raw['author'][0] if type(content_raw['author']) == list else content_raw['author']
    
    if type(content_raw['heading']) == list:
        heading = set(content_raw['heading']) #Set to remove repeated phrases
    else:
        heading = [content_raw['heading']]
    
    textual_highlights = f"{content_raw['title']} | {content_raw['teaser']} | {'. '.join(heading)} | {content_raw['body']}".replace(u'\xad','').replace('"', '')
    
    new_content = {'id': content_raw['id'],
                   'url': content_raw['url'],
                   'site': unique_list_if_str(content_raw['og-site-name'])[0],
                   'adressa-access': content_raw['adressa-access'], #(free, subscriber)
                   'author_1st':  author_1st if author_1st != '' else '', #3777 unique                  
                   'publishtime': publishtime,
                   'created_at_ts': publishtime_ts,
                   'text_highlights': textual_highlights, 
                   #Extracted using NLP techniques (by Adressa)
                   'concepts': ','.join(unique_list_if_str(content_raw['kw-concept'])), #98895 unique
                   'entities': ','.join(unique_list_if_str(content_raw['kw-entity'])), #150214 unique
                   'locations': ','.join(unique_list_if_str(content_raw['kw-location'])), #5533 unique
                   'persons': ','.join(unique_list_if_str(content_raw['kw-person'])), #53535 unique
                   #Categories and keywords tagged by the journalists of Adresseavisen and may be of variable quality (label)
                   'category0': content_raw['category0'], #39 unique
                   'category1': content_raw['category1'] if 'category1' in content_raw else '', #126 unique
                   'category2': content_raw['category2'] if 'category2' in content_raw else '', #75 unique
                   'keywords': content_raw['keywords'], #6489 unique
                  }

        
    return new_content

with open(os.path.join(path_to_article_folder, os.listdir(path_to_article_folder)[0]), 'r', encoding='utf-8') as f:
    line = f.readline()
    x = article_information_parse(line)
    



In [122]:
def parse_content_file(fp:str):
    with open(fp, 'r', encoding='utf-8') as f:
        try:
            for line in f:
                if line.strip()=='null':
                    return None
                else:
                    content=article_information_parse(line)
                return content
        except Exception as e:
            print(e)
        
parse_content_file(os.path.join(path_to_article_folder, os.listdir(path_to_article_folder)[3]))

{'id': '13eb96b4cfbbc5954c54a75737afcac5ccc61779',
 'url': 'http://www.adressa.no/nyheter/trondheim/article586450.ece',
 'site': 'adressa.no',
 'adressa-access': 'free',
 'author_1st': 'elin fosshaug olsø',
 'publishtime': '2005-10-29T14:28:40.000Z',
 'created_at_ts': 1130596120000,
 'text_highlights': 'Bilister aggressive mot trafikkaksjon | En varebil holdt på å kjøre ned ei barnevogn og flere fotgjengere, da beboere på Tiller aksjonerte mot trafikkaos lørdag ettermiddag. | Brå u-sving. Bilister aggressive mot trafikkaksjon -adressa.no. – Kaotisk. – Ønsker bom | Saken oppdateres. Flere bilførere reagerte med aggressiv kjøring og forsøk på å trenge seg gjennom folkemengden med bil, da omkring femti beboere på Tiller sperret trafikken på Østre Rosten med en gåsakte-aksjon på lørdag. Trafikksituasjon var høyst kaotisk midt i den verste lørdagshandelen. Også ved Rostengrenda ble trafikken sperret ved at en mengde beboere på Tiller gikk fram og tilbake over fotgjengerfeltet. Køene ble fle

In [117]:
def load_files(rel_path, file_list=None) -> pd.DataFrame:
    if not file_list:
        file_list = os.listdir(rel_path)
    
    article_data = []
    
    for idx, filename in enumerate(tqdm(file_list)):
        fp = os.path.join(rel_path, filename)
        file_content = parse_content_file(fp)
        if file_content:
            article_data.append(file_content)
            
    print(f'# Files processed       : {len(file_list)}')
    print(f'# Files parsed          : {len(article_data)}')
    print(f'# Files rejected (empty): {len(file_list) - len(file_content)}')
    
    return article_data
    
    
#load_files(path_to_article_folder)

In [118]:
def load_from_folder(rel_path):
    articles_list = os.listdir(rel_path)
    articles = load_files(rel_path, articles_list)
    
    news_df = pd.DataFrame([art for art in articles])
    news_df = news_df[(~news_df['category0'].isin(CATEGORIES_TO_IGNORE)) & (~news_df['site'].astype(str).isin(SITES_TO_IGNORE))]
    

    news_df.drop_duplicates(subset='id', keep='first', inplace=True)
    return news_df

In [119]:
def preprocess(input_folder_path:str, output_path:str):
    print(f'Loading data from {input_folder_path}')
    news_df = load_from_folder(input_folder_path)
    print(f'Loaded {len(news_df)} articles.')
    
    print(f'Saving data to {output_path}')
    news_df.to_csv(output_path, index=False)
    print(f'Saved {len(news_df)} articles.')
    

In [121]:
preprocess(path_to_article_folder, 'data/raw_df/adressa_raw.csv')

Loading data from home/lemeiz/content_refine/


 75%|███████▍  | 55910/74886 [01:12<00:24, 777.00it/s] 

Expecting value: line 2 column 1 (char 92)


100%|██████████| 74886/74886 [01:36<00:00, 777.97it/s] 


# Files processed       : 74886
# Files parsed          : 74885
# Files rejected (empty): 74870
Loaded 73308 articles.
Saving data to data/raw_df/adressa_raw.csv
Saved 73308 articles.


In [116]:
len({'a':1, 's':'d'})

2