## Preprocessing steps

In [1]:
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter

#### Fetch stopwords from nltk + stemming

In [2]:
#nltk.download('stopwords')
#nltk.download('punkt')
#stopwords.fileids()
word_stemmer = SnowballStemmer("norwegian", ignore_stopwords=False)

#### Load data, 'NoReC: The Norwegian Review Corpus'

In [3]:
def loadOldData():
    reviews = pd.read_json('data/metadata.json', orient='index', encoding='utf-8')
    reviews.reset_index(drop=True, inplace=True) # Fix indexes
    reviews.drop(columns=['year', 'url', 'month', 'excerpt', 'title', 'day', 'authors', 'source-id', 'id'], inplace=True)
    return reviews

def loadNewData():
    reviews = pd.read_csv('data/data.csv', encoding='utf-8')
    return reviews

reviews = loadOldData()

In [4]:
print(len(reviews), 'records')
reviews.head()

43614 records


Unnamed: 0,category,cons,language,pros,rating,source,source-category,source-tags,split,tags
0,screen,,nb,,6,p3,tv,[],train,[tv]
1,screen,,nb,,6,p3,tv,[],train,[tv]
2,screen,,nb,,6,p3,tv,[],train,[tv]
3,screen,,nb,,5,p3,tv,[],train,[tv]
4,screen,,nb,,5,p3,film,[],train,[movie]


In [5]:
excludedStopWords = set(['ikkje', 'ikke', 'inkje'])
stopWords = set([word for word in set(stopwords.words('norwegian')) if word not in excludedStopWords])
print(stopWords)

{'vere', 'eit', 'sin', 'noen', 'de', 'har', 'for', 'mine', 'siden', 'å', 'dykkar', 'til', 'noko', 'ingen', 'at', 'hvilke', 'nokon', 'noka', 'oss', 'meg', 'man', 'nokor', 'no', 'eller', 'hadde', 'hjå', 'av', 'kunne', 'blei', 'du', 'kvi', 'nå', 'varte', 'slik', 'henne', 'sia', 'blitt', 'ja', 'opp', 'vart', 'korleis', 'mykje', 'mitt', 'dykk', 'dei', 'bli', 'hennes', 'fordi', 'disse', 'ble', 'verte', 'inn', 'hvilken', 'eitt', 'den', 'so', 'enn', 'hennar', 'da', 'upp', 'då', 'ved', 'deres', 'di', 'og', 'en', 'både', 'båe', 'kvifor', 'medan', 'ha', 'er', 'over', 'sitt', 'deira', 'vort', 'sidan', 'nokre', 'som', 'der', 'din', 'med', 'før', 'etter', 'elles', 'mellom', 'var', 'honom', 'et', 'ville', 'me', 'fra', 'bare', 'i', 'somme', 'hvis', 'hoss', 'mi', 'mange', 'vi', 'når', 'sjøl', 'kvar', 'ned', 'korso', 'også', 'sine', 'så', 'sånn', 'dere', 'skal', 'uten', 'mot', 'denne', 'hvor', 'eg', 'vore', 'ho', 'hva', 'jeg', 'ut', 'vil', 'han', 'dette', 'vår', 'begge', 'her', 'ingi', 'ditt', 'deim', '

In [6]:
print(set(reviews.category)) # Forskjellige kategorier

{'music', 'stage', 'products', 'restaurants', 'games', 'misc', 'sports', 'literature', 'screen'}


In [7]:
for k, v in Counter(list(reviews.source)).most_common():
    print(k, v, np.round(((v/43416)*100), 2))

vg 12861 29.62
sa 6996 16.11
dagbladet 6693 15.42
p3 5708 13.15
fvn 3348 7.71
dinside 3280 7.55
bt 2589 5.96
ap 2139 4.93


In [8]:
for k, v in Counter(list(reviews.category)).most_common():
    print(k, v, np.round(((v/43416)*100), 2))

screen 14297 32.93
music 13204 30.41
misc 4619 10.64
literature 4313 9.93
products 3470 7.99
games 1799 4.14
restaurants 915 2.11
stage 764 1.76
sports 233 0.54


In [9]:
print(set(reviews.loc[:,'source-category'])) # Kilde kategorier ?

{'meninger', 'magasin', 'data', 'tema', 'reise', 'osloby', 'tv', 'kampsport', 'spill', 'sport', 'bok', 'mobil', 'bil, båt og motor', 'film', 'håndball', 'fotball', 'teknologi', 'bolig', 'scene', 'kjendis', 'byliv', 'kommentar', 'restaurant', 'autofil', 'norge', 'økonomi', 'forbruker', 'innenriks', 'nyheter', 'bt magasinet', 'motor', 'kultur', 'lokalt', 'magasinet', 'teater', 'langrenn', 'sulten', 'motorsport', 'konsert', 'a-magasinet', 'mat og drikke', 'fritid', 'rampelys', 'utenriks', 'ishockey', 'musikk'}


In [10]:
# ratingen fordelt på alle anmeldelsene...
for k,v in Counter(list(reviews.rating)).most_common():
    print(k,v, np.round(((v/43416)*100), 2))

5 16046 36.96
4 14142 32.57
3 7477 17.22
2 2768 6.38
6 2722 6.27
1 459 1.06


In [11]:
#for k,v in Counter(list(reviews.tags)).most_common():
#    print(k,v, np.round(((v/43416)*100), 2))

In [12]:
len(reviews.loc[reviews.source == 'dinside']) # 3280 anmeldelser er fra dinside, som bruker cons + pros..

3280

In [13]:
srces = []
for src, pr, cns in zip(reviews['source'], reviews['pros'], reviews['cons']):
    if str(pr) == 'nan' and str(cns) == 'nan':
        continue
    srces.append(src)
set(srces) # These use pros + cons

{'dinside'}

#### Cleanup pros and cons, if possible

In [14]:
def cleanText(text):    
    text = text.replace('\n', ' ').strip().lower()
    text = re.sub(r'[^a-zæøåéäö ]+', '', text) # Remove any symbols
    text = re.sub(r'\s\s+', ' ', text) # Remove consecutive whitespace
    text = ' '.join([word_stemmer.stem(word) for word in text.split() if word not in stopWords])
    return text

In [15]:
reviews.pros = [cleanText(str(r)) if str(r) != 'nan' else '' for r in reviews.pros]
reviews.cons = [cleanText(str(r)) if str(r) != 'nan' else '' for r in reviews.cons]
print([r for r in reviews.cons if len(r) > 0][:2])

['treg autofokus dår lys svakt kitobjektiv', 'skill ikk mye konkurrent anonym']


#### Merge title, excerpt and content into a new column called content, create a new dataset with the features we care about

In [18]:
CONTENT = None

def parseFile(path, file):
    global CONTENT, word_stemmer
    with open(path, 'r', encoding='utf-8') as f:
        CONTENT[file] = cleanText(f.read())

def parseFolder(folder):
    for _, _, files in os.walk(folder, topdown=True):
        for i, file in enumerate(files):
            if i % 600 == 0:
                print("Processing",'{}/{}'.format(folder,file))            
            parseFile('{}/{}'.format(folder,file), file[:-4])
            
def fetchContent():
    """
    Parse all related text documents, merge this with the reviews pandas dataframe.
    """
    global CONTENT
    CONTENT = {}
    parseFolder("data/train")
    parseFolder("data/test")
    parseFolder("data/dev")
    print("\nFinished!\n")
    print(len(CONTENT), '==', len(reviews))

def mergeContent():
    """
    Merge content into reviews data frame, but first drop unnecessary cols...
    """
    global CONTENT, reviews    
    reviews['content'] = [CONTENT[key] for key in sorted([k for k,_ in CONTENT.items()])]
    reviews.to_csv('data/data.csv', index=False, encoding='utf-8')
        
#fetchContent()
#mergeContent()

In [19]:
reviews.head()

Unnamed: 0,category,cons,language,pros,rating,source,source-category,source-tags,split,tags,content
0,screen,,nb,,6,p3,tv,[],train,[tv],rom s topp inn tvdram akkurat andr sist sesong...
1,screen,,nb,,6,p3,tv,[],train,[tv],twin peaks definitiv gold box edition gull twi...
2,screen,,nb,,6,p3,tv,[],train,[tv],the wir sesong the wir gjør avheng god måt nes...
3,screen,,nb,,5,p3,tv,[],train,[tv],mad sesong stil underhold sofistiker tvseri ma...
4,screen,,nb,,5,p3,film,[],train,[movie],mad sesong tvunderholdning høyest kvalit først...
