# Financial News Sentiment Analysis - EDA

## Task 1: Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Set plot style
# Apply seaborn's default theme first, then switch the colour cycle to 'husl'.
# Order matters: set_theme() installs its own default palette, so
# set_palette() has to run after it to take effect.
sns.set_theme()
sns.set_palette('husl')

# Tickers to load; each one has a CSV named <TICKER>_historical_data.csv
tickers = ['AAPL', 'AMZN', 'GOOG', 'NVDA', 'TSLA', 'META']

# Derive every path from its ticker so all files resolve to the same data/
# directory (the hand-written META entry had dropped the 'data/' segment).
file_paths = [
    f'financial-news-sentiment/data/{ticker}_historical_data.csv'
    for ticker in tickers
]

# Load each ticker's history, tag its rows with the ticker symbol, and stack
# everything into one DataFrame with a fresh 0..n-1 index.
dfs = [
    pd.read_csv(path).assign(ticker=ticker)
    for ticker, path in zip(tickers, file_paths)
]
df = pd.concat(dfs, ignore_index=True)

# Analyst ratings
# NOTE(review): `Ddf` looks like a typo, but it is kept so nothing that already
# refers to it breaks. Later cells read df['headline'] / df['publisher'], which
# presumably come from this file rather than the price histories - confirm
# which frame those cells are meant to use.
Ddf = pd.read_csv('financial-news-sentiment/data/raw_analyst_ratings.csv')

# Preview the combined price-history frame
df.head()

: 

In [None]:
# NOTE(review): stray filesystem path (apparently a local virtualenv directory)
# that is not valid Python; kept as a comment so the cell parses:
# C:\Users\hp\Downloads\Telegram Desktop\10Academy-Week1-FNSPID\.venv

In [None]:
# Column dtypes and non-null counts (info() prints its own report)
df.info()

# Only the last expression of a cell is displayed automatically, so the
# preview has to be printed explicitly or its result is silently discarded.
print(df.head())

# Number of rows per ticker (shown as the cell's result)
df['ticker'].value_counts()

: 

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# newer releases no longer accept the old names, so use whichever name this
# installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

In [None]:
# Apply seaborn's default theme (set_theme() called with no arguments)
sns.set_theme()

In [None]:
# NOTE(review): repeats the plot-style setup from the import cell; this cell
# looks redundant - confirm whether it can be removed.
# Set plot style
# set_palette() must follow set_theme(), which installs its own default palette.
sns.set_theme()
sns.set_palette('husl')

In [None]:
# NOTE(review): repeats the plot-style setup from the import cell; this cell
# looks redundant - confirm whether it can be removed.
# Set plot style
# set_palette() must follow set_theme(), which installs its own default palette.
sns.set_theme()
sns.set_palette('husl')

In [None]:
# NOTE(review): repeats the plot-style setup from the import cell; this cell
# looks redundant - confirm whether it can be removed.
# Set plot style
# set_palette() must follow set_theme(), which installs its own default palette.
sns.set_theme()
sns.set_palette('husl')

### 1. Descriptive Statistics

In [None]:
# Text length analysis
# NOTE(review): `df` is built from the *_historical_data.csv files; confirm it
# really carries a 'headline' column (the analyst ratings are loaded as `Ddf`).
# .str.len() yields NaN for a missing headline instead of raising the
# TypeError that len(NaN) would, matching the NaN handling in preprocess_text.
df['headline_length'] = df['headline'].str.len()

# Basic statistics (count/mean/std/min/quartiles/max); describe() skips NaN
headline_stats = df['headline_length'].describe()
headline_stats

In [None]:
# Publisher analysis: count articles per publisher and keep the ten largest
# (value_counts() already returns the counts in descending order).
publisher_counts = df['publisher'].value_counts()
top_publishers = publisher_counts.head(10)

# Bar chart of the ten most active publishers
fig, ax = plt.subplots(figsize=(12, 6))
top_publishers.plot.bar(ax=ax)
ax.set_title('Top 10 Most Active Publishers')
ax.set_xlabel('Publisher')
ax.set_ylabel('Number of Articles')
# Slant the publisher names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

### 2. Time Series Analysis

In [None]:
# Parse the 'date' column into pandas datetimes (overwrites the column in place)
df['date'] = pd.to_datetime(df['date'])

# Daily publication frequency: number of rows falling on each calendar day
publication_day = df['date'].dt.date
daily_frequency = df.groupby(publication_day).size()

# Line plot of the per-day counts
fig, ax = plt.subplots(figsize=(15, 6))
daily_frequency.plot(ax=ax)
ax.set_title('Daily Publication Frequency')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
fig.tight_layout()
plt.show()

### 3. Text Analysis

In [None]:
# Download required NLTK data.
# Recent NLTK releases load the tokenizer models used by word_tokenize() from
# 'punkt_tab'; 'punkt' is still fetched for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Text preprocessing
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Tokenize a headline into lowercase, alphanumeric, non-stop-word tokens.

    Args:
        text: The headline to process. Missing values (None/NaN) and any
            other non-string input produce an empty result.

    Returns:
        A list of tokens from ``word_tokenize(text.lower())``, keeping only
        tokens for which ``str.isalnum()`` is true and that are not in
        NLTK's English stop-word list.
    """
    # Guard clause: NaN/None (and anything else without .lower()) has no tokens.
    if not isinstance(text, str):
        return []

    # Fetch the cached set once per call; the original re-read the stop-word
    # list for every single token and scanned it as a list.
    stop_words = _english_stopwords()
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Tokenize every headline and flatten the per-headline token lists into one
all_tokens = [
    token
    for headline in df['headline']
    for token in preprocess_text(headline)
]

# Tally token frequencies and keep the 20 most frequent (word, count) pairs
word_counts = Counter(all_tokens)
common_words = word_counts.most_common(20)

# Split the pairs into parallel label / height sequences for the bar chart
top_words = [word for word, _ in common_words]
top_frequencies = [count for _, count in common_words]

plt.figure(figsize=(12, 6))
plt.bar(top_words, top_frequencies)
plt.title('Top 20 Most Common Words in Headlines')
plt.xlabel('Words')
plt.ylabel('Frequency')
# Slant the word labels so they stay readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()