## Inspecting and plotting the data for evaluation:
#### We'll use a simple heatmap and histogram to visualize how frequently each source is updated

In [None]:
#importing all necessary libraries
import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scipy.stats as stats 
from scipy.stats import chi2_contingency

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm 
import warnings 
warnings.filterwarnings('ignore')

In [None]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import pandas as pd

# Read MONGO_URI from the environment (a local .env file is loaded first).
load_dotenv()
mongo_uri = os.getenv("MONGO_URI")
# MongoClient(None) would silently fall back to its default host, so fail
# loudly when the variable is missing instead of querying the wrong server.
if not mongo_uri:
    raise RuntimeError(
        "MONGO_URI is not set; define it in the environment or in a .env file."
    )

client = MongoClient(mongo_uri)
# MongoClient connects lazily; a ping forces a round trip so the message
# below is only printed once the server has actually answered.
client.admin.command("ping")
# Handle to the "my_db" database.
db = client["my_db"]

print("Connected to MongoDB!")

# Collection handles used by the rest of the notebook.
article = db["articles"]
user = db["users"]
source = db["sources"]

# Load every article document into a DataFrame and preview the first rows.
docs = article.find()
df = pd.DataFrame(list(docs))
df.head()

In [None]:
# Report how many article rows were loaded into the frame.
print("There is ", df.shape[0], " articles in the DB")

In [None]:
# Articles whose source_id equals the mayadeen source id.
# NOTE(review): `mayadeen_id` is not assigned in any cell visible above —
# confirm it is defined before this cell runs on a fresh kernel.
mayadeen_df = df.loc[df['source_id'] == mayadeen_id]
print("There is ", mayadeen_df.shape[0], " articles from mayadeen")
# Apparently there is a major problem in the source IDs.

In [None]:
# Articles whose source_id equals the manar source id.
# NOTE(review): `manar_id` is not assigned in any cell visible above —
# confirm it is defined before this cell runs on a fresh kernel.
manar_df = df.loc[df['source_id'] == manar_id]
print("There is ", manar_df.shape[0], " articles from manar")

In [None]:
# Articles whose source_id equals the mtv source id.
# NOTE(review): `mtv_id` is not assigned in any cell visible above —
# confirm it is defined before this cell runs on a fresh kernel.
mtv_df = df.loc[df['source_id'] == mtv_id]
print("There is ", mtv_df.shape[0], " articles from mtv")

In [None]:
# Articles whose source_id equals the jadeed source id.
# NOTE(review): `jadeed_id` is not assigned in any cell visible above —
# confirm it is defined before this cell runs on a fresh kernel.
jadeed_df = df.loc[df['source_id'] == jadeed_id]
print("There is ", jadeed_df.shape[0], " articles from jadeed")

In [None]:
# Articles whose source_id equals the annahar source id.
# NOTE(review): `annahar_id` is not assigned in any cell visible above —
# confirm it is defined before this cell runs on a fresh kernel.
annahar_df = df.loc[df['source_id'] == annahar_id]
print("There is ", annahar_df.shape[0], " articles from annahar")

In [None]:
# Duplicate check: True only when no URL appears more than once.
df['url'].is_unique

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Preview two rows where the language is recorded as 'unknown'.
df.loc[df['language'] == 'unknown'].head(2)
# Notice that the content is empty here; these could be rapid news items or videos.

In [None]:
# Rows whose content is either a single space or the literal placeholder text.
blank_content = df['content'] == ' '
placeholder_content = df['content'] == 'No content to be displayed.'
df[blank_content | placeholder_content]

## We can plot which sources have the most empty content
### But 1606 articles out of 3300 is A LOT

### We first need to group them by day, i.e. count the number of articles uploaded on each day

In [None]:
import datetime

# Names of the news sources analysed in this notebook.
news_list_dict = [
    'mayadeen',
    'annahar',
    'mtv',
    'jadeed',
    'manar',
]
# Next step: establish the corresponding frequencies.

In [None]:
# mayadeen rows whose publish_date compares greater than today's date string.
mayadeen_df.loc[mayadeen_df['publish_date'] > str(datetime.date.today())]

In [None]:
# Keep the same "published after today's date string" selection under a name.
today = mayadeen_df.loc[
    lambda frame: frame['publish_date'] > str(datetime.date.today())
]
today

In [None]:
# Reduce a datetime-like value to its hour component.
def get_hours(date):
    """Return the ``hour`` attribute of ``date``.

    Parameters
    ----------
    date : any object exposing an ``hour`` attribute.

    Returns
    -------
    The value of ``date.hour``.
    """
    hour_of_day = date.hour
    return hour_of_day

def day_news(news_df):
    """Return the ``publish_date`` values of rows created after today's date.

    Rows are selected on ``created_at`` (compared against the string form of
    ``datetime.date.today()``), but the returned Series is ``publish_date``.
    NOTE(review): relies on pandas comparing ``created_at`` with a
    'YYYY-MM-DD' string — confirm the column's dtype supports that.
    """
    today_text = str(datetime.date.today())
    created_today = news_df['created_at'] > today_text
    return news_df.loc[created_today, 'publish_date']
def hour_news(news_df):
    """Return the publish hour of every article selected by ``day_news``.

    Parameters
    ----------
    news_df : DataFrame with ``created_at`` and ``publish_date`` columns.

    Returns
    -------
    A one-column DataFrame (column ``publish_date``) whose values are the
    results of ``get_hours`` applied to each publish date.
    """
    day_series = day_news(news_df)
    # Series.map is used instead of DataFrame.applymap: applymap is deprecated
    # since pandas 2.1 and the notebook silences the warning, so the breakage
    # would only show up once the method is gone. The result is the same frame.
    return day_series.map(get_hours).to_frame()
def _hourly_counts(news_df):
    """Count today's articles per publish hour, indexed by hour 0-23.

    value_counts() on its own orders the result by frequency, so a line plot
    of it draws "rank by frequency" on the x axis rather than the hour.
    Reindexing on range(24) puts the counts in hour order and fills hours
    without articles with 0.
    """
    hours = hour_news(news_df)['publish_date']
    return hours.value_counts().reindex(range(24), fill_value=0)


hour_mayadeen = _hourly_counts(mayadeen_df)
hour_mtv = _hourly_counts(mtv_df)
hour_manar = _hourly_counts(manar_df)
hour_jadeed = _hourly_counts(jadeed_df)
hour_annahar = _hourly_counts(annahar_df)

## Plotting Frequency

In [None]:
# Inspect the counts of today's mayadeen publish hours.
hour_mayadeen

In [None]:
# One line per source on a shared axis, x axis labelled in two-hour steps.
fig, ax = plt.subplots(figsize=(8, 5))
for source_name, hourly in [
    ('mayadeen', hour_mayadeen),
    ('mtv', hour_mtv),
    ('manar', hour_manar),
    ('jadeed', hour_jadeed),
    ('annahar', hour_annahar),
]:
    hourly.plot(label=source_name, ax=ax)

hour_ticks = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]
ax.set_xticks(hour_ticks)
ax.set_xticklabels(['0', '2', '4', '6', '8', '10', '12', '14', '16', '18', '20', '22', '24'])
ax.set_xlabel('hour')
ax.legend()
plt.show()

### Distribution of the number of characters

In [None]:
def get_count(content):
    """Return the length of ``content`` as given by ``len``.

    For a string this is its number of characters.
    """
    n_chars = len(content)
    return n_chars
# Character count of every article, one single-column ('content') frame per
# source. Series.map replaces DataFrame.applymap, which is deprecated since
# pandas 2.1 (the notebook silences the warning); the resulting frames are
# the same.
mayadeen_count_char = mayadeen_df['content'].map(get_count).to_frame()
annahar_count_char = annahar_df['content'].map(get_count).to_frame()
mtv_count_char = mtv_df['content'].map(get_count).to_frame()
jadeed_count_char = jadeed_df['content'].map(get_count).to_frame()
manar_count_char = manar_df['content'].map(get_count).to_frame()

In [None]:
# Build the [source, mean character count] rows used for the chart below.
char_counts_by_source = {
    'mayadeen': mayadeen_count_char,
    'annahar': annahar_count_char,
    'mtv': mtv_count_char,
    'jadeed': jadeed_count_char,
    'manar': manar_count_char,
}
# Take the mean of the single 'content' column directly: int() applied to the
# one-element Series returned by describe().T['mean'] is deprecated in recent
# pandas, and the warning is hidden by the notebook's warnings filter.
values = [
    [name, int(frame['content'].mean())]
    for name, frame in char_counts_by_source.items()
]
news = list(char_counts_by_source)
count = pd.DataFrame(values, columns=['news', 'count'])
count

In [None]:
# Chart the mean character count ('count') for each source ('news').
fig = px.histogram(
    count,
    x='news',
    y='count',
    facet_col_spacing=1,
    nbins=10,
)
fig.show()

### Number of words: (applying the same logic)

In [None]:
def get_count_wrd(content):
    """Return the number of whitespace-separated words in ``content``.

    Parameters
    ----------
    content : str
        Article text.

    Returns
    -------
    int
        Word count; 0 for an empty or whitespace-only string.
    """
    # split() without an argument splits on any run of whitespace and drops
    # empty pieces. split(' ') reported 1 word for '' and 2 words for ' ', and
    # counted every extra space as a word.
    return len(content.split())
# Word count of every article, one single-column ('content') frame per
# source. Series.map replaces DataFrame.applymap, which is deprecated since
# pandas 2.1 (the notebook silences the warning); the resulting frames are
# the same.
mayadeen_count_wrd = mayadeen_df['content'].map(get_count_wrd).to_frame()
annahar_count_wrd = annahar_df['content'].map(get_count_wrd).to_frame()
mtv_count_wrd = mtv_df['content'].map(get_count_wrd).to_frame()
jadeed_count_wrd = jadeed_df['content'].map(get_count_wrd).to_frame()
manar_count_wrd = manar_df['content'].map(get_count_wrd).to_frame()

In [None]:
# Peek at the first three word counts for mayadeen.
mayadeen_count_wrd.iloc[:3]

In [None]:
# Build the [source, mean word count] rows used for the chart below.
word_counts_by_source = {
    'mayadeen': mayadeen_count_wrd,
    'annahar': annahar_count_wrd,
    'mtv': mtv_count_wrd,
    'jadeed': jadeed_count_wrd,
    'manar': manar_count_wrd,
}
# Take the mean of the single 'content' column directly: int() applied to the
# one-element Series returned by describe().T['mean'] is deprecated in recent
# pandas, and the warning is hidden by the notebook's warnings filter.
values = [
    [name, int(frame['content'].mean())]
    for name, frame in word_counts_by_source.items()
]
news = list(word_counts_by_source)
count_wrd = pd.DataFrame(values, columns=['news', 'count'])
count_wrd

In [None]:
# Chart the mean word count ('count') for each source ('news').
fig = px.histogram(
    count_wrd,
    x='news',
    y='count',
    facet_col_spacing=1,
    nbins=10,
)
fig.show()