In [97]:
import pandas as pd
import numpy as np
import os
import sklearn

## Global VARS

In [66]:
PAD_TOKEN = '<PAD>'
UNFREQ_TOKEN= '<UNF>'

In [2]:
data_path = 'data/raw_df/adressa_raw.csv'

In [162]:
news_df = pd.read_csv(data_path)

In [63]:
news_df.head().T

Unnamed: 0,0,1,2,3,4
id,fcc01a7a1a7f7092a2da6b9c5186fdef421c8ab6,e1c14c3f599c9764a003740b9959c4e6f2fbc8e3,6a0612e60690288a776834811004ce133f326cee,13eb96b4cfbbc5954c54a75737afcac5ccc61779,b40a30877124510cf65683b6c9391d927e20f89d
url,http://www.adressa.no/nyheter/sortrondelag/201...,http://www.adressa.no/bil/veteran/article80867...,http://www.adressa.no/kultur/2015/11/06/Histor...,http://www.adressa.no/nyheter/trondheim/articl...,http://www.adressa.no/forbruker/hjem/article15...
site,adressa.no,adressa.no,adressa.no,adressa.no,adressa.no
adressa-access,free,free,free,free,free
author_1st,pål solberg,,annemona grann,elin fosshaug olsø,ann iren bævre
publishtime,2016-03-02T14:58:31.000Z,2007-02-21T07:19:00.000Z,2015-11-06T20:41:20.000Z,2005-10-29T14:28:40.000Z,2010-08-06T10:13:05.000Z
created_at_ts,1456930711000,1172042340000,1446842480000,1130596120000,1281089585000
text_highlights,- Dette er ingen lekeplass | To barn ble obser...,Trondheim fylles med veteranbiler | Midtsommer...,- Historiene er nesten for utrolige | - Jeg ta...,Bilister aggressive mot trafikkaksjon | En var...,Fyll på med småbord | Det skal være kos og far...
concepts,"politiet,barn","veteranbiler,trondheim,kjøre,bil,nobel årgang,...","valg,mennesker,unge mennesker,temaer,ekstreme ...","brå u-sving,bilister aggressive,trafikken,bil,...","småbord,bordet"
entities,"twitter,frøyatunnelen","royal garden hotel,per ramsøskar,granåsen,dats...","hitler,festningen,individets,jugoslavia,hipp,t...","obs,sandmoen,tiller vel ingebrigt storli,østre...","tray,offecct,penny og wishbone,established,wis..."


# Process features

In [166]:
def get_categ_encoder_from_values(values, include_pad_token=True, include_unfrequent_token=False):
    encoder_values = []
    if include_pad_token:
        encoder_values.append(PAD_TOKEN)
    if include_unfrequent_token:
        encoder_values.append(UNFREQ_TOKEN)
    encoder_values.extend(values)
    encoder_ids = list(range(len(encoder_values)))
    encoder_dict = dict(zip(encoder_values, encoder_ids))
    return encoder_dict

def encode_categ_feature(value, encoder_dict):
    if value in encoder_dict:
        return encoder_dict[value]
    else:
        return encoder_dict[UNFREQ_TOKEN]

def transform_categorical_column(series, encoder):
    return series.apply(lambda x: encode_categ_feature(x, encoder))

def get_encoder_from_freq_values(series, min_freq=100):
    freq_values_counts_df = get_freq_values(series, min_freq=min_freq)
    encoder = get_categ_encoder_from_values(freq_values_counts_df[freq_values_counts_df.columns[0]].unique(), include_unfrequent_token=True)    
    return encoder

def get_freq_values(series, min_freq=100):
    flatten_values_counts = series.groupby(series).size()
    return flatten_values_counts[flatten_values_counts >= min_freq].sort_values(ascending=False).reset_index(name='count')

def comma_sep_values_to_list(value):
    return list([y.strip() for y in value.split(',') if (y.strip() != '')])

### ID encoding

In [77]:
id_enc = get_categ_encoder_from_values(news_df['id'])
transform_categorical_column(news_df['id'], id_enc)

0            1
1            2
2            3
3            4
4            5
         ...  
73303    73304
73304    73305
73305    73306
73306    73307
73307    73308
Name: id, Length: 73308, dtype: int64

### Category encoder

In [101]:
category0_encoder = get_categ_encoder_from_values(news_df['category0'].unique())
print(f'Category0 - unique count {len(category0_encoder)}')
news_df['category0_encoded'] = transform_categorical_column(news_df['category0'], category0_encoder)

category0_class_weights = sklearn.utils.class_weight.compute_class_weight('balanced', classes=news_df['category0_encoded'].unique(), y=news_df['category0_encoded'])
print(f'Category0 weights: {category0_class_weights}')

Category0 - unique count 41
Category0 weights: [4.57717283e-02 3.07500000e+00 2.42068419e-01 7.26397146e-01
 4.08173719e+00 6.74034572e-01 2.45046129e-01 3.40018553e+00
 4.26209302e+01 2.34360614e+00 2.92529928e-01 2.54541667e+01
 2.00733844e+00 1.32612156e+00 5.11927374e+00 4.77265625e+00
 5.98921569e+00 1.30907143e+01 2.90904762e+01 1.36768657e+01
 1.66609091e+02 6.31965517e+01 7.41983806e+00 3.66540000e+02
 2.90904762e+01 7.33080000e+01 9.16350000e+01 2.65608696e+01
 2.61814286e+02 4.07266667e+01 1.22180000e+02 1.83270000e+03
 9.16350000e+02 9.16350000e+02 1.83270000e+03 1.83270000e+03
 6.10900000e+02 1.83270000e+03 9.16350000e+02 1.83270000e+03]


In [116]:
category1_encoder = get_categ_encoder_from_values(news_df['category1'].unique())
news_df['category1_encoded'] = transform_categorical_column(news_df['category1'], category1_encoder)
category1_class_weights = sklearn.utils.class_weight.compute_class_weight('balanced', classes=news_df['category1_encoded'].unique(), y=news_df['category1_encoded'])

Category1 - unique count 128
{'<PAD>': 0, 'nyheter|sortrondelag': 1, 'bil|veteran': 2, nan: 3, 'nyheter|trondheim': 4, 'forbruker|hjem': 5, 'nyheter|innenriks': 6, 'nyheter|utenriks': 7, 'forbruker|sexogsamliv': 8, 'pluss|okonomi': 9, 'meninger|ordetfritt': 10, 'nyheter|helse': 11, 'meninger|kommentarer': 12, 'nyheter|nordtrondelag': 13, 'pluss|magasin': 14, 'pluss|nyheter': 15, 'kultur|musikk': 16, 'vaeret|klima': 17, 'meninger|kronikker': 18, '100sport|fotball': 19, 'nyheter|moreromsdal': 20, 'nyheter|okonomi': 21, 'pluss|meninger': 22, 'forbruker|livsstil': 23, 'pluss|sport': 24, 'meninger|leder': 25, '100sport|vintersport': 26, 'nyheter|psykiatri': 27, '100sport|ballsport': 28, 'sport|sprek': 29, 'kultur|tv': 30, '100sport|sykkel': 31, 'pluss|kultur': 32, 'kultur|festivaler': 33, 'kultur|film': 34, '100sport|andreidretter': 35, 'tema|villeveier': 36, '100sport|sprek': 37, 'kultur|bok': 38, 'meninger|snakkut': 39, 'kultur|vin': 40, 'forbruker|fritid': 41, 'forbruker|personlig okonom

### Author Encoder (must remove infrequent authors due to the "meninger" section)

In [140]:
author_encoder = get_encoder_from_freq_values(news_df['author_1st'], min_freq=20)
news_df['author_encoded'] = transform_categorical_column(news_df['author_1st'], author_encoder)

In [167]:
 #Converting values separated by "," to lists
#news_df['keywords'] = news_df['keywords'].apply(comma_sep_values_to_list)
news_df['concepts'] = news_df['concepts'].apply(comma_sep_values_to_list)
news_df['entities'] = news_df['entities'].apply(comma_sep_values_to_list)
news_df['locations'] = news_df['locations'].apply(comma_sep_values_to_list)
news_df['persons'] = news_df['persons'].apply(comma_sep_values_to_list)

AttributeError: 'float' object has no attribute 'split'

In [159]:
news_df.loc[news_df['concepts'].isna()]

Unnamed: 0,id,url,site,adressa-access,author_1st,publishtime,created_at_ts,text_highlights,concepts,entities,locations,persons,category0,category1,category2,keywords,category0_encoded,category1_encoded,author_encoded
16,ec39b8188ab20ea4d2c54719d8f0b8fdd4b0d922,http://www.adressa.no/nyheter/utenriks/article...,adressa.no,free,ntb,2015-02-16T08:41:21.000Z,1424076081000,EU kunngjør ny sanksjonsliste | Brussel (NTB):...,,"arkadij bakhin,anatolij antonov","ukraina,europa",,nyheter,nyheter|utenriks,,"[utenriks, innenriks, trondheim, E6, midtbyen,...",1,7,2
20,1f9f2e659eb5488b54457b44773c38e894b62180,http://www.adressa.no/incoming/2017/03/04/Her-...,adressa.no,free,richard sagen,2017-03-04T07:16:26.000Z,1488611786000,Her er Adresseavisens beste bilder fra ski-vm ...,,adresseavisens,lahti,,incoming,,,"[nyheter, trondheim, ukeadressa, eadressa, ska...",8,3,1
104,31fe9e457e82b8fd257c27827eb9f0325319cff6,http://www.adressa.no/nyheter/trondheim/articl...,adressa.no,free,,2004-04-23T19:05:02.000Z,1082747102000,Dette er Revolt Technology | ReVolt Technology...,,"sinvent og viking venture,eiendom,næringsforen...","belgia,trondheim,oslo","erik hagen,trygve burchardt,nils kristian nakstad",nyheter,nyheter|trondheim,,"[utenriks, innenriks, trondheim, E6, midtbyen,...",1,4,1
114,4070f85cf4952e0e2d1b89d89c569a22ae07ecb5,http://www.adressa.no/pluss/article10774251.ece,adressa.no,subscriber,stein arne sæther,2015-03-21T07:03:13.000Z,1426921393000,Det meste gikk galt da Norge plutselig fikk kr...,,,"norge,tyskland",,pluss,,,"[pluss, eadressa, ukeadressa, digitalt, arkiv,...",7,3,80
173,a8d451cb414735f4ce4d82ea28e4c484ea25168d,http://www.adressa.no/nyheter/article98748.ece,adressa.no,free,tone almhjell,2002-08-26T13:40:35.000Z,1030369235000,Redningsmann | Publisert: 26.08.2002 14:40 Si...,,,,,nyheter,,,"[utenriks, innenriks, trondheim, E6, midtbyen,...",1,3,253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72996,25593a1d583001d42191a8d5c4afd924e86079bc,http://www.adressa.no/pluss/2017/01/23/Se-bild...,adressa.no,subscriber,rune petter ness,2017-01-23T05:15:22.000Z,1485148522000,Se bildeserien fra «råtne» Britannia | | Se b...,,britannia,,,pluss,,,"[pluss, eadressa, ukeadressa, digitalt, arkiv,...",7,3,235
73058,c54979a72545b88ef7b9d047d869e68d941d59ff,http://www.adressa.no/nyheter/kuriosa/article6...,adressa.no,free,yngve bergli,2012-12-08T12:17:48.000Z,1354969068000,Her er lesernes pepperkakehus | Se bildene. | ...,,,,,nyheter,nyheter|kuriosa,,"[utenriks, innenriks, trondheim, E6, midtbyen,...",1,45,61
73187,1bac4b262e513826ac3236c2831a640618863caf,http://www.adressa.no/video/article12008127.ece,adressa.no,free,,2016-01-09T00:14:53.000Z,1452298493000,Høydepunkter fra Nyttårskonserten | Se kulturm...,,"nyttårskonserten,olavshallen",,astrid s,video,,,"[nyheter, trondheim, ukeadressa, eadressa, ska...",15,3,1
73214,9936e8f376fe0a57d96050d575265518152b0105,http://www.adressa.no/meninger/2017/05/27/Sjef...,adressa.no,free,kjetil kroksæter,2017-05-27T14:33:55.000Z,1495895635000,"Kjenner du den brautende, høyrøstede typen som...",,,,,meninger,,,"[Kommentar, USA, Donald Trump, Kjetil Kroksæter]",6,3,91
