In [135]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re

In [None]:
# Raw analyst-ratings headlines, addressed relative to this notebook's folder.
file_path = "../data/news_data/raw_analyst_ratings.csv"
# Reading the first CSV column as the index keeps it from appearing as an
# "Unnamed: 0" column in the frame.
df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

Descriptive Statistics

In [None]:
# Print the frame's structure: row/index range, the columns with their
# non-null counts and dtypes, and the memory footprint.
df.info()

In [None]:
# Descriptive-statistics summary of the DataFrame, displayed as the cell's output.
df.describe()

In [None]:
# Show what kind of object the headline column is, followed by its first five values.
print(type(df['headline']), df['headline'].head(), sep='\n')

In [None]:
# Normalize every headline to a stripped string while keeping missing values
# missing. A bare astype(str) turns NaN into the literal text "nan", which a
# later dropna(subset=['headline']) can no longer detect, so the original NaN
# positions are restored with .where() after the cast.
# (The right-hand side is evaluated before the assignment, so notna() still
# sees the untouched column.)
df['headline'] = df['headline'].astype(str).str.strip().where(df['headline'].notna())
print(df['headline'].head())

In [None]:
# Remove every row that has no headline, then report the frame's new shape.
df.dropna(axis=0, how='any', subset=['headline'], inplace=True)
print(f"the updated headline rows after droping down missing values is : {df.shape}")

In [None]:
# Count rows that repeat an earlier row in full; when there are any, keep only
# the first occurrence of each and report how many rows are left.
duplicate_count = df.duplicated(keep='first').sum()
print(f"duplicates found: {duplicate_count}")
if duplicate_count > 0:
    df.drop_duplicates(keep='first', inplace=True)
    print(f"remaining rows after duplication removal is {len(df)}")

In [None]:
if 'date' not in df.columns:
    print("date column not found")
else:
    # Parse the ISO 8601 timestamps and convert them to timezone-aware UTC.
    df['date'] = pd.to_datetime(df['date'], utc=True, format="ISO8601")
    print(df['date'].head())
    print(f"data type is now: {df['date'].dtype}")

    # Use the parsed timestamps as the row index (the 'date' column is consumed).
    df.set_index('date', inplace=True)
    print(df.head())

In [None]:
# Headline length measured in characters, with its summary statistics.
df['headline_length'] = df['headline'].str.len()
print(df['headline_length'].describe())

# Headline length measured in whitespace-separated words, with its summary statistics.
df['headline_word_count'] = df['headline'].str.split().str.len()
print(df['headline_word_count'].describe())


In [None]:
# Number of articles per publisher, most active first.
publisher_counts = df['publisher'].value_counts()
top_publishers = publisher_counts.head(10)
print(top_publishers)  # top 10 most active publishers

# Bar chart of the same ten publishers.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=top_publishers.index,
    y=top_publishers.values,
    palette='magma',
    ax=ax,
)
plt.xticks(rotation=45)
ax.set_ylabel('Number of articles')
ax.set_xlabel('Publisher')
ax.set_title('Top 10 most active publishers')
plt.show()


In [None]:

# Number of headlines in each calendar-day bin of the datetime index.
daily_counts = df['headline'].resample('D').count()

# Line chart of the daily publication volume.
fig, ax = plt.subplots(figsize=(12, 5))
daily_counts.plot(ax=ax)
ax.set_title('Number of headlines per day')
ax.set_ylabel('Number of headlines')
ax.set_xlabel('Date')
plt.show()


In [None]:
# Day of week of every headline timestamp (Monday=0, Sunday=6).
df['weekday'] = df.index.weekday
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# value_counts() only returns weekdays that actually occur. Reindexing over
# 0..6 guarantees exactly seven counts in Mon..Sun order (0 for a weekday with
# no articles), so the counts always line up with the seven labels below.
weekday_counts = (
    df['weekday']
    .value_counts()
    .reindex(range(len(weekday_labels)), fill_value=0)
)

plt.figure(figsize=(6,4))
sns.barplot(x=weekday_labels,
            y=weekday_counts.values, palette='viridis')
plt.ylabel('Number of headlines')
plt.xlabel('Weekday')
plt.title('Number of articles by weekday')
plt.show()


In [None]:
# Number of headlines per calendar month (bins labelled by month end).
# The offset object is passed instead of the string alias 'M': that alias is
# deprecated in pandas 2.2+ in favour of 'ME', while 'ME' is unknown to older
# versions. MonthEnd() is the offset both aliases resolve to, so it gives the
# same bins on every version.
monthly_counts = df['headline'].resample(pd.offsets.MonthEnd()).count()
monthly_counts.plot(figsize=(12,5), title='Monthly news frequency')
plt.show()


Sentiment Analysis

Here we create a polarity column. The `apply` method is used on the headline column so that each row (headline) is processed in turn: the headline is wrapped in a `TextBlob`, and its `sentiment.polarity` value is stored as the polarity of that row.

In [None]:
# Score only the first 1000 headlines. The explicit .copy() makes `sample` an
# independent frame: adding columns to a bare df.head(...) slice raises
# SettingWithCopyWarning and leaves it ambiguous whether `df` is affected.
sample = df.head(1000).copy()
# TextBlob polarity of each headline, stored as a new column.
sample['polarity'] = sample['headline'].apply(lambda x: TextBlob(x).sentiment.polarity)
def get_sentiment_label(p):
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    Scores above 0.05 are positive and scores below -0.05 are negative.
    Anything else, including both boundary values and values that compare
    false either way (such as NaN), is neutral.
    """
    if p > 0.05:
        return 'positive'
    if p < -0.05:
        return 'negative'
    return 'neutral'

# Turn each polarity score into its category label, then show the headline
# alongside its score and label.
sample['sentiment_label'] = sample['polarity'].map(get_sentiment_label)
print(sample.loc[:, ['headline', 'polarity', 'sentiment_label']])


In [None]:
# Number of sampled headlines that fall into each sentiment category.
sentiment_counts = sample['sentiment_label'].value_counts()

# Bar chart of the category counts.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_ylabel('Number of headlines')
ax.set_xlabel('sentiment category')
plt.show()

Text Analysis (Topic Modeling):

In [None]:
# Download the NLTK "stopwords" corpus into the local nltk_data directory.
# This must run before the corpus is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words held in a set for fast membership tests; clean_text()
# reads this module-level name when filtering headline words.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\derej\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [137]:
def clean_text(text):
    """Return `text` reduced to its lowercase, non-stop-word tokens.

    The text is lower-cased, every character other than a-z or whitespace is
    deleted, the result is split on whitespace, words found in the
    module-level `stop_words` set are dropped, and the remaining words are
    joined with single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_words = (token for token in letters_only.split() if token not in stop_words)
    return ' '.join(kept_words)
# Clean every headline into a new column and preview the first five results.
df['clean_headline'] = df['headline'].map(clean_text)
print(df['clean_headline'].head())



date
2020-06-05 14:30:54+00:00                         stocks hit week highs friday
2020-06-03 14:45:20+00:00                      stocks hit week highs wednesday
2020-05-26 08:30:07+00:00                                biggest movers friday
2020-05-22 16:45:06+00:00                 stocks moving fridays midday session
2020-05-22 15:38:59+00:00    b securities maintains neutral agilent technol...
Name: clean_headline, dtype: object
