In [2]:
import os
import sys
import pandas as pd
import plotly as px
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [3]:
# Make the parent of the current working directory, and its `scripts`
# sub-directory, importable from this notebook.
current_directory = os.getcwd()
parent_dir = os.path.dirname(current_directory)
scripts_dir = os.path.join(parent_dir, 'scripts')

# Move both directories to the front of sys.path. An existing entry is removed
# before re-inserting, so re-running this cell does not accumulate duplicates.
# scripts_dir is inserted last and therefore ends up ahead of parent_dir,
# which is the same precedence the plain double insert produced.
for import_root in (parent_dir, scripts_dir):
    if import_root in sys.path:
        sys.path.remove(import_root)
    sys.path.insert(0, import_root)

# Project-local class; resolvable only once parent_dir is on sys.path.
from scripts.main import DataPreprocessing

In [4]:
# Label of the dataset to load; also used as the lookup key for the result.
dataset_name = "Row-Analysis-Ratings"

# Mapping of dataset label -> source URL handed to DataPreprocessing.
file_path = {
    dataset_name: "https://drive.google.com/uc?export=download&id=1MC0ePKh2oc3VqGtOMNTboyICpyuiSr2l"
}
preprocesor = DataPreprocessing(file_path)

# process_file() is a project helper; its result is looked up by label below,
# so it is presumably a dict of label -> DataFrame (confirm in scripts/main.py).
data_frames = preprocesor.process_file()

df = data_frames.get(dataset_name)
if df is None:
    # Fail here with a clear message instead of an AttributeError on df.head().
    raise RuntimeError(
        f"Dataset {dataset_name!r} was not returned by DataPreprocessing.process_file()"
    )
df.head()

Data for Row-Analysis-Ratings loaded successfully.


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


##### Basic statistics on headline length

In [11]:
# Measure the length of every entry in the 'headline' column, then summarise
# the distribution (count, mean, std, min, quartiles, max).
headline_text = df['headline']
headline_lengths = headline_text.str.len()
headline_lengths.describe()

count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline, dtype: float64

##### Number of articles per publisher

In [6]:
# Count how many rows each distinct 'publisher' value accounts for
# (value_counts sorts the result from most to least frequent).
publisher_column = df['publisher']
publisher_counts = publisher_column.value_counts()
display(publisher_counts)

publisher
Paul Quintaro                      228373
Lisa Levin                         186979
Benzinga Newsdesk                  150484
Charles Gross                       96732
Monica Gerson                       82380
                                    ...  
Shazir Mucklai - Imperium Group         1
Laura Jennings                          1
Eric Martin                             1
Jose Rodrigo                            1
Jeremie Capron                          1
Name: count, Length: 1034, dtype: int64

In [7]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple.
df.shape

(1407328, 6)

##### Publication Dates 

In [8]:
# Parse the 'date' column as ISO 8601 timestamps, overwriting it on df.
# NOTE(review): if the column mixes UTC offsets, to_datetime may not produce a
# datetime dtype and the `.dt` access below would fail; confirm on the full data.
df['date'] = pd.to_datetime(df['date'], format='ISO8601')

# Bucket rows by the calendar-date part of each timestamp and count them.
publication_day = df['date'].dt.date
daily_counts = (
    df.groupby(publication_day)
      .size()
      .reset_index(name='counts')
)

# Text shared by the figure constructor and the layout tweak below.
plot_title = 'News Frequency During Days'
axis_labels = {'date': 'Date', 'counts': 'Number of Articles Published'}

# Interactive line chart with a marker on each daily data point.
fig = px.line(
    daily_counts,
    x='date',
    y='counts',
    title=plot_title,
    labels=axis_labels,
    markers=True,
    template='plotly_dark',
)

# Centre the title near the top of the figure and set the axis titles.
fig.update_layout(
    xaxis_title=axis_labels['date'],
    yaxis_title=axis_labels['counts'],
    title=dict(
        text=plot_title,
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)

# Render the figure in the notebook output.
fig.show()