<a href="https://colab.research.google.com/github/Martipetti/fake-news-detection/blob/main/DataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Liar Dataset Exploration

In [None]:
!pip install dash

In [None]:
!pip install -U datasets

In [None]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import dash
from dash import html, dcc
import plotly.graph_objs as go
from dash.dependencies import Input, Output

### Dataset import

In [None]:
# Load the dataset identified by the repository id "chengxuphd/liar2".
dataset = load_dataset("chengxuphd/liar2")
# Bare expression: as the last statement of the cell it displays the loaded object.
dataset

In [None]:
# Pull the statement texts and their labels out of each split.
# The validation labels are bound to their own name (`y_val`) so that they do
# not overwrite the training labels stored in `y_train`.
statement_train, y_train = dataset["train"]["statement"], dataset["train"]["label"]
statement_val, y_val = dataset["validation"]["statement"], dataset["validation"]["label"]
statement_test, y_test = dataset["test"]["statement"], dataset["test"]["label"]

In [None]:
# Convert every split to a pandas DataFrame; the per-split frames stay available.
train_df, val_df, test_df = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

# Stack the three splits into a single frame with a freshly numbered index.
complete_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Preview the first rows of the combined frame.
complete_df.head()

## Exploration

In [None]:
# Print a heading, then the summary of the combined frame produced by `info()`.
print("\nInformazioni sul dataframe:")
complete_df.info()

In [None]:
# Heading first, then the number of missing entries found in each column.
print("\nValori mancanti per colonna:")
missing_per_column = complete_df.isna().sum()
print(missing_per_column)

In [None]:
# Histogram of the `label` column, coloured by label; axis titles and the gap
# between bars are set in the same chained expression.
fig = px.histogram(
    data_frame=complete_df,
    x='label',
    color='label',
    title='Distribuzione delle etichette (Label)',
    labels={'label': 'Etichetta', 'count': 'Frequenza'},
).update_layout(
    xaxis_title='Etichetta',
    yaxis_title='Frequenza',
    bargap=0.2,
)

fig.show()

### Speaker Analysis

In [None]:
# Count the distinct values in the `speaker` column and report the total.
num_unique_speakers = complete_df['speaker'].nunique()
print(f"\nNumero di valori diversi nella colonna 'speaker': {num_unique_speakers}")

In [None]:
# Take the 30 most frequent speakers, name the two resulting columns, and
# order them by ascending frequency for plotting.
speaker_counts = (
    complete_df['speaker']
    .value_counts()
    .head(30)
    .reset_index()
    .set_axis(['speaker', 'frequenza'], axis=1)
    .sort_values(by='frequenza', ascending=True)
)

# Bubble chart: x position and marker size both encode the statement count.
fig = px.scatter(
    data_frame=speaker_counts,
    x='frequenza',
    y='speaker',
    size='frequenza',
    size_max=30,
    hover_name='speaker',
    height=700,
    title='Frequenza delle dichiarazioni per i primi 30 speaker',
    labels={'frequenza': 'Frequenza', 'speaker': 'Speaker'},
).update_layout(
    yaxis={'tickfont': {'size': 10}},
    coloraxis_showscale=False,
)

fig.show()

### Subject Analysis

In [None]:
# Count the distinct values in the `subject` column and report the total.
num_unique_subjects = complete_df['subject'].nunique()
print(f"\nNumero di valori diversi nella colonna 'subject': {num_unique_subjects}")

In [None]:
# Take the 30 most frequent subject values, name the two resulting columns,
# and order them by ascending frequency for plotting.
subject_counts = (
    complete_df['subject']
    .value_counts()
    .head(30)
    .reset_index()
    .set_axis(['subject', 'frequenza'], axis=1)
    .sort_values(by='frequenza', ascending=True)
)

# Bubble chart: x position and marker size both encode the statement count.
fig = px.scatter(
    data_frame=subject_counts,
    x='frequenza',
    y='subject',
    size='frequenza',
    size_max=30,
    hover_name='subject',
    height=700,
    title='Frequenza delle dichiarazioni per i primi 30 subject',
    labels={'frequenza': 'Frequenza', 'subject': 'Subject'},
).update_layout(
    yaxis={'tickfont': {'size': 10}},
    coloraxis_showscale=False,
)

fig.show()

### Temporal analysis

In [None]:
# Keep label and speaker, and add the calendar year taken from the parsed
# `date` column (the date itself is not carried over).
partial_df = complete_df[['label', 'speaker']].assign(
    year=pd.to_datetime(complete_df['date']).dt.year
)
# Retain only the rows whose year is 2007 or later.
partial_df = partial_df.loc[partial_df['year'] >= 2007]

In [None]:
# Work on an independent copy of the per-year data.
speaker_df = partial_df.copy()

app_speaker = dash.Dash(__name__)

# The years present in the data drive the slider range and its marks.
speaker_years = speaker_df['year']
speaker_first_year = speaker_years.min()
speaker_year_marks = {
    label: label for label in map(str, sorted(speaker_years.unique()))
}

# Page layout: a heading, the chart, and a year slider starting on the first year.
app_speaker.layout = html.Div(children=[
    html.H2(children="Frequenza Speaker per Anno"),
    dcc.Graph(id='graph-with-slider'),
    dcc.Slider(
        id='year-slider',
        min=speaker_first_year,
        max=speaker_years.max(),
        value=speaker_first_year,
        marks=speaker_year_marks,
        step=None,
    ),
])

@app_speaker.callback(
    Output('graph-with-slider', 'figure'),
    [Input('year-slider', 'value')]
)
def update_figure(selected_year):
    """Build the speaker-frequency bubble chart for the selected year.

    Args:
        selected_year: Value of the year slider; rows of `speaker_df` whose
            `year` equals it are counted.

    Returns:
        A Plotly figure showing at most 30 speakers with their statement
        counts for that year, ordered by ascending count.
    """
    # Restrict the data to the chosen year.
    rows_for_year = speaker_df.loc[speaker_df['year'] == selected_year]

    # Count statements per speaker and keep the 30 most frequent ones.
    top_speakers = (
        rows_for_year['speaker']
        .value_counts()
        .head(30)
        .reset_index()
        .set_axis(['speaker', 'frequenza'], axis=1)
        .sort_values(by='frequenza', ascending=True)
    )

    # Marker size and x position both encode the count.
    return px.scatter(
        data_frame=top_speakers,
        x='frequenza',
        y='speaker',
        size='frequenza',
        size_max=30,
        hover_name='speaker',
        height=700,
        title=f'Frequenza delle dichiarazioni per i primi 30 speaker ({selected_year})',
        labels={'frequenza': 'Frequenza', 'speaker': 'Speaker'},
    ).update_layout(
        yaxis={'tickfont': {'size': 10}},
        coloraxis_showscale=False,
    )

# Start the server for the per-year speaker app with Dash debug mode enabled.
app_speaker.run(debug=True)

In [None]:
app_liar = dash.Dash(__name__)

# The years present in the data drive the slider range and its marks.
liar_years = partial_df['year']
liar_first_year = liar_years.min()
liar_year_marks = {
    label: label for label in map(str, sorted(liar_years.unique()))
}

# Page layout: a heading, the chart, and a year slider starting on the first year.
app_liar.layout = html.Div(children=[
    html.H2(children="Top Speaker per Etichette 0 e 1"),
    dcc.Graph(id='graph-with-slider'),
    dcc.Slider(
        id='year-slider',
        min=liar_first_year,
        max=liar_years.max(),
        value=liar_first_year,
        marks=liar_year_marks,
        step=None,
    ),
])

@app_liar.callback(
    Output('graph-with-slider', 'figure'),
    [Input('year-slider', 'value')]
)
def update_figure(selected_year):
    """Build the bubble chart of speakers with label 0 or 1 for one year.

    Args:
        selected_year: Value of the year slider; only rows of `partial_df`
            with this `year` and a `label` of 0 or 1 are counted.

    Returns:
        A Plotly figure showing at most 30 speakers with their counts of
        such statements, ordered by ascending count.
    """
    # Rows must match the chosen year and carry label 0 or 1.
    in_selected_year = partial_df['year'] == selected_year
    has_label_0_or_1 = partial_df['label'].isin([0, 1])
    matching_rows = partial_df.loc[in_selected_year & has_label_0_or_1]

    # Count statements per speaker and keep the 30 most frequent ones.
    top_speakers = (
        matching_rows['speaker']
        .value_counts()
        .head(30)
        .reset_index()
        .set_axis(['speaker', 'frequenza'], axis=1)
        .sort_values(by='frequenza', ascending=True)
    )

    # Marker size and x position both encode the count.
    return px.scatter(
        data_frame=top_speakers,
        x='frequenza',
        y='speaker',
        size='frequenza',
        size_max=30,
        hover_name='speaker',
        height=700,
        title=f'Top 30 speaker con label 0 o 1 - Anno {selected_year}',
        labels={'frequenza': 'Frequenza (label 0 o 1)', 'speaker': 'Speaker'},
    ).update_layout(
        yaxis={'tickfont': {'size': 10}},
        coloraxis_showscale=False,
    )


# Serve this app on its own port. With no explicit port, both Dash apps in
# this notebook would use the same default port, and starting this server
# would replace the per-year speaker app started earlier. That app uses the
# same component ids, so its chart would then be answered by this app's callback.
app_liar.run(debug=True, port=8051)
