# Apple Financial and Social Analysis Dataset
## Exploratory Data Analysis (EDA)

**Author:** Diyar Erol  
**Repository:** [DiyarErol/Apple_Financial_and_Social_Analysis_Dataset](https://github.com/DiyarErol/Apple_Financial_and_Social_Analysis_Dataset)  
**Date:** December 2025

### Overview
This notebook provides a comprehensive exploratory data analysis of the Apple Financial and Social Analysis Dataset. We will examine the structure, distributions, correlations, and key insights from the data to prepare for feature engineering and machine learning modeling.

### Dataset Description
- **Features:** 80+ technical indicators, financial metrics, and sentiment scores
- **Time Period:** 2015–2025 (AAPL stock data)
- **Sample Size:** 2,696 trading records
- **Target Variable:** AAPL closing price (the value to be predicted)

## 1. Library Imports and Environment Setup

In this section, we import all necessary libraries and configure the visualization environment.

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Plot defaults: matplotlib style sheet first, then the seaborn palette,
# then the figure-size / font-size overrides.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 10,
})

# Silence all warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Report which library versions this run is using
print("✅ Libraries imported successfully!")
library_versions = (
    ("Pandas", pd.__version__),
    ("NumPy", np.__version__),
    ("Matplotlib", plt.matplotlib.__version__),
    ("Seaborn", sns.__version__),
)
for library_name, library_version in library_versions:
    print(f"{library_name} version: {library_version}")

## 2. Load Dataset

Load the main dataset from the data directory. We use the enhanced feature dataset, which includes technical indicators and sentiment scores.

In [None]:
# Locate the data directory and load the enhanced feature dataset.
#
# The notebook is normally executed from the `notebooks/` directory, so
# `<parent of cwd>/data` is tried first.  When the kernel is started from the
# repository root instead, that path does not exist, so `<cwd>/data` is used
# as a fallback.  If neither location holds the file, the original path is
# kept and `pd.read_csv` raises FileNotFoundError exactly as before.
import os

data_filename = 'apple_feature_enhanced.csv'
working_dir = os.getcwd()
candidate_dirs = [
    os.path.join(os.path.dirname(working_dir), 'data'),
    os.path.join(working_dir, 'data'),
]
data_path = next(
    (
        candidate
        for candidate in candidate_dirs
        if os.path.isfile(os.path.join(candidate, data_filename))
    ),
    candidate_dirs[0],
)

# Load the enhanced feature dataset
df = pd.read_csv(os.path.join(data_path, data_filename))

print("✅ Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {len(df.columns)} features")
print("\nFirst few rows:")
# Left as the cell's final expression so the notebook renders the preview
df.head()

## 3. Dataset Overview

Examine the basic structure, dimensions, and statistical summary of the dataset.

In [None]:
# Summarise the dataset: dimensions, memory footprint, dtypes and
# descriptive statistics.
rule = "=" * 80
row_count, column_count = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(rule, "DATASET OVERVIEW", rule, sep="\n")
print(f"\nShape: {row_count} rows × {column_count} columns")
print(f"Memory usage: {memory_mb:.2f} MB")

print("\n" + rule, "DATA TYPES AND INFO", rule, sep="\n")
# df.info() prints its report itself
df.info()

print("\n" + rule, "STATISTICAL SUMMARY", rule, sep="\n")
# Left as the cell's final expression so the notebook renders the table
df.describe().round(4)

## 4. Missing Value and Data Type Inspection

Check for missing values and validate data types across all features.

In [None]:
# Missing-value report followed by a dtype breakdown of the columns
rule = "=" * 80
print(rule, "MISSING VALUES ANALYSIS", rule, sep="\n")

# Per-column count of missing entries and the share of rows they represent
missing_count = df.isna().sum()
missing_percent = missing_count / len(df) * 100
missing_df = pd.DataFrame(
    {'Missing Count': missing_count, 'Percentage': missing_percent}
).sort_values('Missing Count', ascending=False)

# Only columns that actually contain gaps are worth listing
missing_df_filtered = missing_df.loc[missing_df['Missing Count'] > 0]
if missing_df_filtered.empty:
    print("✅ No missing values detected in the dataset!")
else:
    print(missing_df_filtered)

print("\n" + rule, "DATA TYPE DISTRIBUTION", rule, sep="\n")
print(df.dtypes.value_counts())

print("\n" + rule, "NUMERIC COLUMNS SUMMARY", rule, sep="\n")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"Total numeric columns: {len(numeric_cols)}")
# Long column lists are truncated to the first ten names
if len(numeric_cols) > 10:
    print(f"Numeric columns: {list(numeric_cols[:10])}...")
else:
    print(f"Numeric columns: {list(numeric_cols)}")

## 5. Basic Statistical Analysis

Analyze key financial metrics and price trends in the dataset.

In [None]:
# Summary statistics for the price column and the first few return columns
rule = "=" * 80
print(rule, "KEY FINANCIAL METRICS ANALYSIS", rule, sep="\n")

# Columns whose name (case-insensitively) mentions "close" or "price"
price_cols = [
    col for col in df.columns
    if any(keyword in col.lower() for keyword in ('close', 'price'))
]
print(f"\nPrice/Close columns found: {price_cols}")

if price_cols:
    # The first match is used as the representative price series
    close_col = price_cols[0]
    close_series = df[close_col]
    print(f"\n### Price Statistics ({close_col}):")
    print(f"Mean: ${close_series.mean():.2f}")
    print(f"Median: ${close_series.median():.2f}")
    print(f"Std Dev: ${close_series.std():.2f}")
    print(f"Min: ${close_series.min():.2f}")
    print(f"Max: ${close_series.max():.2f}")
    print(f"Range: ${close_series.max() - close_series.min():.2f}")

# Columns whose name (case-insensitively) mentions "return"
return_cols = [col for col in df.columns if 'return' in col.lower()]
print(f"\nReturn columns found: {return_cols}")

# At most three return columns are summarised; an empty list prints nothing
for ret_col in return_cols[:3]:
    ret_series = df[ret_col]
    print(f"\n### Return Statistics ({ret_col}):")
    return_stats = (
        ("Mean", ret_series.mean()),
        ("Std Dev", ret_series.std()),
        ("Min", ret_series.min()),
        ("Max", ret_series.max()),
    )
    for stat_label, stat_value in return_stats:
        print(f"{stat_label}: {stat_value:.6f}")

## 6. Feature Correlation Heatmap

Visualize correlations between numeric features to identify relationships and potential multicollinearity.

In [None]:
# Pairwise correlations across every numeric column
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

# The heatmap is restricted to the 20 highest-variance features so that it
# stays legible.
top_features = numeric_df.var().nlargest(20).index
top_correlations = correlation_matrix.loc[top_features, top_features]

plt.figure(figsize=(16, 12))
sns.heatmap(
    top_correlations,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Heatmap (Top 20 Features by Variance)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("✅ Correlation heatmap generated!")
print(f"\nTop 10 most correlated feature pairs:")

# One record per unordered feature pair: only the upper triangle of the
# matrix is visited, which skips the diagonal and mirrored duplicates.
feature_names = correlation_matrix.columns
corr_pairs = [
    {
        'Feature 1': first_feature,
        'Feature 2': second_feature,
        'Correlation': correlation_matrix.iloc[row_pos, col_pos],
    }
    for row_pos, first_feature in enumerate(feature_names)
    for col_pos, second_feature in enumerate(feature_names[row_pos + 1:], start=row_pos + 1)
]

# Rank by absolute correlation so strong negative pairs surface as well
corr_pairs_df = pd.DataFrame(corr_pairs).sort_values('Correlation', ascending=False, key=abs)
print(corr_pairs_df.head(10))

## 7. Visual Insights

Explore distributions and trends through visualization using Matplotlib and Seaborn.

In [None]:
# Histograms of the six numeric features with the largest variance,
# laid out on a 2 x 3 grid.
top_numeric_features = numeric_df.var().nlargest(6).index

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

histogram_style = dict(bins=50, edgecolor='black', alpha=0.7, color='steelblue')
# Pair each feature with the next free panel of the grid
for panel, col in zip(axes, top_numeric_features):
    panel.hist(df[col], **histogram_style)
    panel.set_title(f'Distribution of {col}', fontweight='bold')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✅ Distribution histograms generated!")

In [None]:
# Line chart of the first detected price column against the row index
if price_cols:
    plotted_col = price_cols[0]
    line_style = dict(linewidth=2, color='navy', alpha=0.8)

    plt.figure(figsize=(16, 6))
    plt.plot(df.index, df[plotted_col], **line_style)
    plt.title(f'{plotted_col} Over Time', fontsize=14, fontweight='bold')
    plt.xlabel('Time Index')
    plt.ylabel('Price (USD)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print(f"✅ Time series plot of {plotted_col} generated!")

# Box plots for outlier detection: one panel for each of the (up to) three
# highest-variance features selected in `top_numeric_features`.
print("\n" + "=" * 80)
print("OUTLIER DETECTION")
print("=" * 80)

boxplot_features = top_numeric_features[:3]
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for idx, col in enumerate(boxplot_features):
    # Matplotlib's boxplot does not skip NaN values: a single NaN turns the
    # box statistics into NaN and the panel is drawn empty.  Drop missing
    # entries so the box reflects the observed data.
    axes[idx].boxplot(df[col].dropna())
    axes[idx].set_title(f'Box Plot: {col}', fontweight='bold')
    axes[idx].set_ylabel(col)
    axes[idx].grid(True, alpha=0.3)

# Hide panels left without data when fewer than three features are available
for unused_ax in axes[len(boxplot_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("✅ Box plots generated for outlier detection!")

## 8. Summary of Findings

Key insights and observations from the exploratory data analysis.