# Elisa's code snippets for report 1 in a Jupyter Notebook
###############################################################################################


## Link to datasets:
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AMCV2H

## Link to source for labels:
https://mediabiasfactcheck.com/


In [1]:
## Importing libraries
import numpy as np
import pandas as pd
from matplotlib.pylab import plt
import seaborn as sns
# from sqlalchemy import create_engine
# import sqlalchemy as 
import sqlite3

from sklearn import metrics
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk


## 1 - Dataset Analysis: "nela-gt-2022.db"


### Elisa's code for reading and inspecting the database "nela-gt-2022.db"

In [2]:
# Read the complete "newsdata" table into a pandas DataFrame called db.
# NOTE: the database path is relative to the notebook's working directory.
con = sqlite3.connect("nela-gt-2022_db/nela-gt-2022.db")
try:
    db = pd.read_sql_query("SELECT * from newsdata", con)
finally:
    # Always release the database file handle, even if the query fails.
    con.close()

OperationalError: unable to open database file

### first exploration of the document "nela-gt-2022.db"

In [None]:
# Peek at the first five rows (5 is the default of DataFrame.head).
print(db.head(n=5))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would add a stray "None").
db.info()

In [None]:
# Descriptive statistics of the table's columns.
summary_stats = db.describe()
print(summary_stats)

In [None]:
# Number of missing values in each column
# (isna is the same check as isnull; sum() adds up column-wise by default).
db.isna().sum()

### Analysis of the publishers / sources

In [None]:
# Distinct publishers, in order of first appearance in the table.
pd.unique(db["source"])

In [None]:
# Distinct publishers in alphabetical order.
publisher_names = list(db["source"].unique())
publisher_names.sort()
print(publisher_names)


In [None]:
# Count the rows that are exact duplicates of an earlier row.
n_duplicate_rows = db.duplicated().sum()
print(n_duplicate_rows)

In [None]:
# Horizontal bar chart: number of articles per source, largest source on top.
source_order = db["source"].value_counts().index

fig = plt.figure(figsize=(10, 40))
sns.countplot(y=db["source"], order=source_order)
plt.title("Number of articles per news source contained in the dataset in the year 2022")
plt.ylabel("Sources")
plt.xlabel("Frequency")
plt.show()

#### Idea for improvement: bin the sources by article count (e.g. up to 100 articles, 100-200 articles, etc.) and then plot the bins


### Analysis of time data

In [None]:
## create new variables
# Parse the "date" column only once; every feature below is derived from
# the same parsed Series instead of re-parsing the whole column each time.
parsed_dates = pd.to_datetime(db["date"])

db["year"] = parsed_dates.dt.year
db["month"] = parsed_dates.dt.month        # 1 = January ... 12 = December
db["day"] = parsed_dates.dt.dayofyear      # day of the year, not day of the month
db["weekday"] = parsed_dates.dt.weekday    # 0 = Monday ... 6 = Sunday
db["time"] = parsed_dates.dt.time

db.info()
db.head(5)

#### Articles per month - barplot

In [None]:
# get values
# db["month"].value_counts()

## plot the number of articles per month
month_labels = ["January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December"]

plt.figure(figsize=(15, 5))
# Fix the bar order to months 1..12. The tick labels below are attached by
# position, so without an explicit order a month that is missing from the
# data would shift every following label onto the wrong bar.
sns.countplot(x=db["month"], order=list(range(1, 13)), color="limegreen")
plt.xticks(np.arange(12), month_labels)
plt.ylabel("Number of articles")
plt.xlabel("Months")
plt.title("Number of articles per month in dataset");


#### Articles per day - lineplot

In [None]:
## lineplot articles per day whole year
# Local import: only this cell needs matplotlib's date locators/formatters.
import matplotlib.dates as mdates

## count the articles per calendar day and sort chronologically
# Parsing to real timestamps (and dropping any time-of-day part) gives the
# plot a true date axis instead of relying on positional day offsets.
lineplot_data = pd.to_datetime(db["date"]).dt.normalize().value_counts().sort_index()

## plot (markers joined by a line, as plot_date drew them; plot_date itself
## is deprecated in recent matplotlib releases)
fig, ax1 = plt.subplots(figsize=(10, 8))
ax1.plot(lineplot_data.index, lineplot_data.to_numpy(), marker="o", linestyle="-")

## nicer x-axis
# One tick at the first day of every month, labelled "Jan", "Feb", ...
# The positions come from the dates themselves, so they are correct for
# leap and non-leap years alike.
ax1.xaxis.set_major_locator(mdates.MonthLocator())
ax1.xaxis.set_major_formatter(mdates.DateFormatter("%b"))

plt.show()

# ToDo: figure out what the two low dots are

#### Articles per weekday - barplot 

In [None]:
# get values
# db["weekday"].value_counts()

## plot the number of articles per weekday
# barplot
weekday_labels = ["Monday", "Tuesday", "Wednesday", "Thursday",
                  "Friday", "Saturday", "Sunday"]

fig = plt.figure(figsize=(9, 6))
# Fix the bar order to 0..6 (pandas weekday: 0 = Monday ... 6 = Sunday).
# The tick labels are attached by position, so a weekday missing from the
# data would otherwise shift the following labels onto the wrong bars.
sns.countplot(x=db["weekday"], order=list(range(7)), color="lightblue")
plt.xticks(np.arange(7), weekday_labels)
plt.ylabel("Frequency")
plt.xlabel("Weekdays")
plt.show()




## 2 - Dataset Analysis: "labels.tab" (read below as "labels.csv")

In [None]:
# Load the outlet-level labels file into a DataFrame.
labels_path = "labels.csv"
labels = pd.read_csv(labels_path)

In [None]:
# Show the first ten rows of the labels table.
labels.iloc[:10]

In [None]:
# Print a concise summary of the labels DataFrame.
labels.info()

### Plot the labels - barplot

In [None]:
# Bar chart of how many outlets carry each veracity label.
veracity_tick_labels = ["0: Reliable", "1: Mixed", "2: Unreliable"]

sns.countplot(data=labels, x="label")
plt.xticks(np.arange(len(veracity_tick_labels)), veracity_tick_labels)
plt.xlabel("Outlet-level veracity label 'label' document")
plt.ylabel("Frequency")
plt.show()

## 3 - Dataset Analysis: "labels_all.tab"

In [None]:
# Load the extended labels file into a DataFrame.
labels_all_path = "labels_all.csv"
labels_all = pd.read_csv(labels_all_path)

In [None]:
# Show the first ten rows of the extended labels table.
labels_all.iloc[:10]

In [None]:
# Print a concise summary of the labels_all DataFrame.
# info() takes no row count: the former positional 10 was interpreted as
# the `verbose` flag rather than as a number of rows.
labels_all.info()


### Plot the labels - barplot

In [None]:
# Bar chart of how many outlets carry each veracity label (incl. unlabeled).
veracity_all_tick_labels = ["-1: Unlabeled", "0: Reliable", "1: Mixed", "2: Unreliable"]

sns.countplot(data=labels_all, x="label")
plt.xticks(np.arange(len(veracity_all_tick_labels)), veracity_all_tick_labels)
plt.xlabel("Outlet-level veracity label label_all document")
plt.ylabel("Frequency")
plt.show()

### Plot the countries - barplot

In [None]:
# Horizontal bar chart of outlets per country, most frequent country on top
# (same descending ordering as the per-source plot).
country_order = labels_all["country"].value_counts().index

sns.countplot(y="country", data=labels_all, order=country_order)
plt.ylabel("Country")
plt.xlabel("Frequency")
plt.show()

### Plot factuality scores

In [None]:
# How often does each factuality score occur?
labels_all["factuality"].value_counts()


In [None]:
# Bar chart of the outlet-level factuality scores.
factuality_tick_labels = ["0: Very Low", "1", "2", "3", "4", "5: Very High"]

sns.countplot(data=labels_all, x="factuality")
plt.xticks(np.arange(len(factuality_tick_labels)), factuality_tick_labels)
plt.xlabel("Outlet-level factuality label labels_all document")
plt.ylabel("Frequency")
plt.show()

# ToDo: need a title for this one :-)

In [None]:
## alternative but not so nice
# Histogram of the factuality scores with a kernel density estimate on top.
factuality_scores = labels_all["factuality"]
sns.displot(factuality_scores, bins=6, kde=True);
