# Language Analysis for WikiScraper

## lang_confidence_score 

In [None]:
import numpy as np

def lang_confidence_score(word_counts: dict, language_words_with_frequency: dict) -> float:
    """Score how well a text's word counts match a language's word frequencies.

    Args:
        word_counts: Mapping of word -> number of occurrences in the text.
        language_words_with_frequency: Mapping of word -> frequency of that
            word in the reference language. Words missing from this mapping
            are treated as having frequency 0.0.

    Returns:
        The logistic-squashed score as a plain ``float``. ``0.0`` is returned
        when ``word_counts`` is empty or its counts sum to zero.
    """
    counts = np.array(list(word_counts.values()), dtype=float)
    total = counts.sum()

    # Nothing to score: return a number (not a container) so that callers
    # can always compare the result with other scores.
    if total == 0:
        return 0.0

    normalized_word_count_freqs = counts / total

    # Language frequency of each word, in the same order as the counts.
    language_freqs = np.array(
        [language_words_with_frequency.get(word, 0.0) for word in word_counts],
        dtype=float,
    )

    # Renormalise over the words that occur in the text; skipped when none
    # of them is known to the language, which would otherwise be 0/0.
    language_total = language_freqs.sum()
    if language_total != 0:
        language_freqs /= language_total

    # NOTE(review): both factors lie in [0, 1], so 1/mean >= 1 and adding 1e6
    # pushes the sigmoid argument past 1e6, where it saturates at 1.0 for
    # every input. When the mean is 0 the division yields inf (NumPy emits a
    # RuntimeWarning) and the result is still 1.0. The `+ 1e6` may have been
    # meant as a small epsilon in the denominator - confirm the intended
    # formula before relying on this score to rank pages.
    score = 1/np.mean(normalized_word_count_freqs * language_freqs) + 1e6
    score = 1/(1+np.exp(-score))

    return float(score)

## Data Prep

In [3]:
import wordfreq
from wordfreq import top_n_list

# Language codes handed to wordfreq; a set, so iteration order is unspecified.
languages = {'en', 'it', 'da'}

# language code -> wordfreq's word -> frequency dictionary ('best' wordlist).
word_with_frequencies = {}
for lang in languages:
    word_with_frequencies.update(
        {lang: wordfreq.get_frequency_dict(lang, wordlist='best')}
    )

def get_k_best(word_with_frequencies, k):
    """Reduce each language's frequency table to its ``k`` top words.

    Args:
        word_with_frequencies: Mapping of language code -> (word -> frequency).
        k: Number of words requested from ``top_n_list`` for each language.

    Returns:
        A new mapping of language code -> (word -> frequency) holding only
        the words returned by ``top_n_list(lang, k)``. A word absent from
        the language's frequency table is stored with frequency 0.0.
    """
    return {
        code: {
            token: freq_table.get(token, 0.0)
            for token in top_n_list(code, k)
        }
        for code, freq_table in word_with_frequencies.items()
    }

# Quick sanity check: print the top 3 words (with frequencies) per language.
print(get_k_best(word_with_frequencies, 3))

{'en': {'the': 0.05370317963702527, 'to': 0.026915348039269153, 'and': 0.025703957827688632}, 'da': {'i': 0.030902954325135904, 'og': 0.028840315031266057, 'er': 0.028183829312644536}, 'it': {'di': 0.03890451449942807, 'e': 0.028840315031266057, 'che': 0.022908676527677734}}


In [None]:
from wikiscraper.controller import Controller
from wikiscraper.scraper import Scraper
from wikiscraper.page import Page
from pathlib import Path
from wikiscraper import config
import json

# Module-level controller shared by the helper functions defined below.
cl = Controller()
# Page used as the default starting point of get_worst_page.
# NOTE(review): _get_page is underscore-prefixed (non-public by convention);
# confirm whether Controller exposes a public way to obtain a page.
start_page = cl._get_page("Team Rocket")

def get_score_of_page(language_words_with_frequency: dict, page: Page):
    """Return the language confidence score of a single page.

    Calls ``cl.clear_json()``, then ``page.count_words()``, then reads the
    JSON file at ``config.WORD_COUNTS_JSON`` and passes its contents to
    ``lang_confidence_score``. An empty file is treated as an empty
    word-count mapping.

    Args:
        language_words_with_frequency: Mapping of word -> frequency for the
            language to score against.
        page: The page whose words are counted.

    Returns:
        Whatever ``lang_confidence_score`` returns for the page's counts.
    """
    # Order matters: reset first, then count, then read the file back.
    # Presumably count_words() writes into WORD_COUNTS_JSON - TODO confirm.
    cl.clear_json()
    page.count_words()

    counts_file = Path(config.WORD_COUNTS_JSON)
    text = counts_file.read_text(encoding="utf-8").strip()

    if text:
        word_counts = json.loads(text)
    else:
        word_counts = {}

    return lang_confidence_score(word_counts, language_words_with_frequency)

def get_worst_page(language_words_with_frequency: dict, num: int = 60, start_page: Page=start_page):
    curr_page = start_page
    best_phrase = start_page.phrase
    best_score = 0
    for i in range(1,60):
        score = get_score_of_page(language_words_with_frequency, curr_page)
        
        if score > best_phrase:
            best_phrase = best_score
            best_phrase = curr_page.phrase
        
        curr_page = cl.next_page(curr_page, wait=0.5)