In [80]:
import matplotlib.pyplot as plt
from IPython.display import HTML, display
from wordcloud import WordCloud
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pandas as pd
import random
    
def load_raw_data(name, sep = ",", key="text", raw_dir="../raw", label_key="label"):
    """Read one raw dataset file and return its texts and labels.

    Parameters
    ----------
    name : str
        File name of the delimited text file inside ``raw_dir``.
    sep : str, default ","
        Field delimiter handed to ``pandas.read_csv``.
    key : str, default "text"
        Name of the column that holds the text of each sample.
    raw_dir : str, default "../raw"
        Directory the file is read from.
    label_key : str, default "label"
        Name of the column that holds the class label.

    Returns
    -------
    X, y : tuple
        ``X`` is the ``.values`` of the text column with every entry
        converted to ``str`` (missing entries become ``""``); ``y`` is the
        ``.values`` of the label column exactly as pandas parsed it.

    Raises
    ------
    KeyError
        If ``key`` or ``label_key`` is not a column of the file.
    """
    data = pd.read_csv(f"{raw_dir}/{name}", sep=sep)
    # read_csv turns empty cells into NaN, and str(NaN) is the literal word
    # "nan": it would be measured as a 3-character text and counted as a real
    # token downstream. Represent a missing text as the empty string instead.
    data[key] = ["" if pd.isna(v) else str(v) for v in data[key]]
    X, y = data[key].values, data[label_key].values
    return X, y

def _table(data):
    return '<table><tr>{}</tr></table>'.format(
           '</tr><tr>'.join(
               '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in data)
           )
    
def show_table(data):
    """Render ``data`` (an iterable of rows) as an HTML table in the notebook output."""
    markup = _table(data)
    display(HTML(markup))

def _hist(name, X_len, y):
    """Save an overlaid text-length histogram to ``{name}_hist.png``.

    Three semi-transparent series are drawn on one axes: all samples
    ("Fake+True"), samples with ``y == 1`` ("Fake") and samples with
    ``y == 0`` ("True").

    Parameters
    ----------
    name : str
        Plot title and prefix of the output file name.
    X_len : numpy.ndarray
        Text length per sample; must support boolean-mask indexing.
    y : numpy.ndarray
        Label per sample, aligned with ``X_len`` (1 is plotted as "Fake",
        0 as "True").

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    fake_len = X_len[y == 1]
    true_len = X_len[y == 0]

    fig, ax = plt.subplots()
    try:
        # Combined distribution first. Its bin edges are reused for the
        # per-class series so that all bars share the same intervals and the
        # overlays can be compared bar by bar.
        n, bins, _ = ax.hist(X_len, bins=30, color='#3333ff',
                             alpha=0.5, rwidth=0.9, label='Fake+True')

        fake_series = (fake_len, '#ff3333', 'Fake')
        true_series = (true_len, '#33ff33', 'True')
        # Draw the larger class first so the smaller one is painted on top of
        # it; on a tie the "True" series goes first.
        if len(fake_len) > len(true_len):
            draw_order = (fake_series, true_series)
        else:
            draw_order = (true_series, fake_series)
        for lengths, colour, label in draw_order:
            ax.hist(lengths, bins=bins, color=colour,
                    alpha=0.5, rwidth=0.9, label=label)

        ax.grid(axis='y', alpha=0.75)
        ax.set_xlabel('Długość tekstu')
        ax.set_ylabel('Częstotliwość')
        ax.set_title(name)

        # Upper y limit: the tallest combined bar rounded up to a multiple of
        # 10, or 10 above it when it already is a multiple of 10.
        maxfreq = n.max()
        ax.set_ylim(top=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
        ax.legend(loc='upper right')
        fig.savefig(f'{name}_hist.png')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    
def _pie(name, y):
    """Save a two-wedge pie chart of ``y`` to ``{name}_piechart.png``.

    ``y`` is passed straight to ``plt.pie`` as the wedge sizes; the first
    wedge is labelled "Fake" and the second "True". The figure is closed
    after saving and nothing is returned.
    """
    figure = plt.figure()
    wedge_labels = ["Fake", "True"]
    # No separate legend is drawn; the wedge labels carry the class names.
    _wedges, _label_texts = plt.pie(
        y,
        labels=wedge_labels,
        startangle=90,
        textprops={'fontsize': 36},
    )

    output_path = f'{name}_piechart.png'
    plt.savefig(output_path)
    plt.close(figure)

def _wordcount(name,X):
    """Save a word cloud of the most frequent words to ``{name}_wordcloud.png``.

    Parameters
    ----------
    name : str
        Prefix of the output file name.
    X : iterable of str
        Texts to analyse.

    Steps: tokenize each text into runs of two or more ASCII letters, drop
    English stop words, keep the 500 most common tokens, build a synthetic
    text in which each token is repeated in proportion to its share of those
    counts (at least once), shuffle it and hand it to ``WordCloud``.

    NOTE(review): when no token survives, the synthetic text is empty and
    ``WordCloud.generate`` is expected to raise — confirm if empty datasets
    must be supported.
    """
    tokenizer = RegexpTokenizer(r'[A-Za-z]{2,}')
    stop_words = set(stopwords.words('english'))

    counter = Counter()
    for text in X:
        # NLTK's English stop-word list is lowercase, so compare the
        # lowercased token; otherwise capitalised forms such as "The" or
        # "And" slip through and occupy slots in the top 500. The token is
        # still counted in its original case.
        counter.update(
            token for token in tokenizer.tokenize(text)
            if token.lower() not in stop_words
        )

    df = pd.DataFrame(counter.most_common(500), columns=["word","n"])
    # Repetitions per word: its share of the retained counts scaled by the
    # number of retained words, rounded up so every word appears at least once.
    df["m"] = np.ceil(len(df)*df["n"]/np.sum(df["n"]))

    big_text = []
    for word, repetitions in zip(df["word"], df["m"].astype(int)):
        big_text.extend([word] * repetitions)

    # Shuffle so the copies of one word are not adjacent in the synthetic text.
    random.shuffle(big_text)

    fig=plt.figure()
    try:
        wordcloud = WordCloud(max_font_size=40, background_color="white").generate(" ".join(big_text))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig(f'{name}_wordcloud.png')
    finally:
        # Release the figure even when generation or saving fails.
        plt.close(fig)


def _stats(name,file="qprop.csv", sep="\t", maxlen=20000, key="text"):
    """Produce the plots and the summary row for one dataset.

    Loads the dataset, writes ``{name}_hist.png`` and ``{name}_wordcloud.png``
    through ``_hist`` and ``_wordcount``, and returns one table row of HTML
    fragments.

    Parameters
    ----------
    name : str
        Display name; also the prefix of the generated image files.
    file : str, default "qprop.csv"
        File name passed to ``load_raw_data``.
    sep : str, default "\\t"
        Field delimiter of the file.
    maxlen : int, default 20000
        Text lengths above this value are clipped to it before plotting.
    key : str, default "text"
        Name of the text column.

    Returns
    -------
    list of str
        ``[heading, histogram <img>, word-cloud <img>, statistics table]``.
        The statistics are N (samples), Fake (sum of the labels), True
        (N minus that sum) and R (``1:k``, where ``k`` is the larger class
        count divided by the smaller one, rounded up).
    """
    X,y = load_raw_data(file,sep,key=key)
    X_len = np.array([min(len(x), maxlen) for x in X])

    _hist(name,X_len,y)
    _wordcount(name,X)

    # np.sum runs in C; the builtin sum() would iterate the array element by
    # element in Python. The two counts are the same values the expressions
    # sum(y) and -sum(y-1) produce.
    n_fake = int(np.sum(y))
    n_true = len(y) - n_fake

    minority, majority = sorted((n_true, n_fake))
    # The ratio is undefined when one class is absent (or the file is empty);
    # report that instead of dividing by zero.
    if minority > 0:
        ratio = f"1:{int(np.ceil(majority/minority))}"
    else:
        ratio = "n/a"

    ds_stats = [
        ["<b>N</b>", f"{len(y)}"],
        ["<b>Fake</b>", f"{n_fake}"],
        ["<b>True</b>", f"{n_true}"],
        ["<b>R</b>", ratio]
    ]

    # The random query string makes each <img> URL unique so the browser does
    # not show a previously cached version of a regenerated image.
    return [ f"<h2 style='writing-mode: vertical-rl;'>{name}</h2>",
        f"<img style='width:500px' src='./{name}_hist.png?{random.randint(0,10000)}'/>",
        f"<img style='width:500px' src='./{name}_wordcloud.png?{random.randint(0,10000)}'/>",
        _table(ds_stats)]


# Ogólna charakterystyka zebranych zbiorów danych

In [94]:
# PubHealth: comma-separated file, default "text" column and default length cap.
show_table([
    _stats('PubHealth', file='pubhealth.csv', sep=","),
])

0,1,2,3
PubHealth,,,N10075Fake6306True3769R1:2

0,1
N,10075
Fake,6306
True,3769
R,1:2


In [93]:
# Q-Prop: tab-separated file, default "text" column and default length cap.
show_table([
    _stats('Q-Prop', file='qprop.csv', sep="\t"),
])

0,1,2,3
Q-Prop,,,N51270Fake5736True45534R1:8

0,1
N,51270
Fake,5736
True,45534
R,1:8


In [95]:
# MM-Covid (English): comma-separated file; text lengths are clipped at 5000.
show_table([
    _stats('MM-Covid_en', file='mmcovid_en.csv', sep=",", maxlen=5000),
])

0,1,2,3
MM-Covid_en,,,N7332Fake2028True5304R1:3

0,1
N,7332
Fake,2028
True,5304
R,1:3


In [92]:
# Covid-FN: tab-separated file; the text is read from the "title" column.
show_table([
    _stats('Covid-FN', file='covid_fake_news.csv', sep="\t", key="title"),
])

0,1,2,3
Covid-FN,,,N8972Fake8511True461R1:19

0,1
N,8972
Fake,8511
True,461
R,1:19


In [91]:
# GRAFN: comma-separated file, default "text" column and default length cap.
show_table([
    _stats('GRAFN', file='grafn_lite.csv', sep=","),
])

0,1,2,3
GRAFN,,,N63930Fake12999True50931R1:4

0,1
N,63930
Fake,12999
True,50931
R,1:4


In [96]:
# ISOT: comma-separated file, default "text" column and default length cap.
show_table([
    _stats('ISOT', file='isot.csv', sep=","),
])

0,1,2,3
ISOT,,,N44898Fake23481True21417R1:2

0,1
N,44898
Fake,23481
True,21417
R,1:2


In [82]:
# Nela-GT: comma-separated file, default "text" column and default length cap.
show_table([
    _stats('Nela-GT', file='nela-gt-2020.csv', sep=","),
])

0,1,2,3
Nela-GT,,,N757096Fake262288True494808R1:2

0,1
N,757096
Fake,262288
True,494808
R,1:2


In [84]:
# FakenewsNet: comma-separated file, default "text" column and default length cap.
show_table([
    _stats('FakenewsNet', file='fakenewsnet.csv', sep=","),
])

0,1,2,3
FakenewsNet,,,N18690Fake4571True14119R1:4

0,1
N,18690
Fake,4571
True,14119
R,1:4


In [86]:
# Liar: comma-separated file; text lengths are clipped at 500.
show_table([
    _stats('Liar', file='liar.csv', sep=",", maxlen=500),
])

0,1,2,3
Liar,,,N8061Fake3554True4507R1:2

0,1
N,8061
Fake,3554
True,4507
R,1:2


In [87]:
# Feverous: comma-separated file; text lengths are clipped at 500.
show_table([
    _stats('Feverous', file='feverous.csv', sep=",", maxlen=500),
])

0,1,2,3
Feverous,,,N76439Fake30696True45743R1:2

0,1
N,76439
Fake,30696
True,45743
R,1:2


In [89]:
# Politifact: comma-separated file; text lengths are clipped at 500.
show_table([
    _stats('Politifact', file='politifact.csv', sep=",", maxlen=500),
])

0,1,2,3
Politifact,,,N10463Fake2824True7639R1:3

0,1
N,10463
Fake,2824
True,7639
R,1:3
