In [None]:
# ==============================
# DATA DESCRIPTIVE FULL REPORT
# ==============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

sns.set_style("whitegrid")

# ==============================
# Load Dataset
# ==============================

# Source file for the report. Per the pandas reader options used here,
# text after a '#' is treated as a comment (so lines starting with '#'
# are skipped), and low_memory=False parses the file in a single pass
# instead of in chunks, which avoids mixed-type column inference.
csv_path = "PS_2026.02.07_05.49.09.csv"
df = pd.read_csv(
    csv_path,
    comment="#",
    low_memory=False,
)

# Report (rows, columns), followed by blank spacer lines.
print("Dataset Shape:", df.shape)
print("\n")

# ==============================
# Basic Info
# ==============================

# Column names, non-null counts and dtypes; DataFrame.info() writes its
# own output, so it is not wrapped in print().
print("Dataset Info:\n")
df.info()

# Summary statistics from describe(), transposed so that each column of
# the dataset becomes one row of the printed table.
print("\nStatistical Summary:\n")
summary_stats = df.describe()
print(summary_stats.transpose())

# ==============================
# Data Type Distribution
# ==============================

# Number of columns per dtype, computed once and used for both the
# printed table and the bar chart.
print("\nData Type Distribution:\n")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

fig, ax = plt.subplots(figsize=(6, 4))
dtype_counts.plot(kind="bar", ax=ax)
ax.set_title("Data Type Distribution")
plt.show()

# ==============================
# Missing Value Analysis
# ==============================

# Boolean mask of missing cells, computed once and reused for the counts,
# the percentages and the heatmap below.
null_mask = df.isnull()

# Per-column count of missing cells, restricted to columns that have at
# least one, ordered from most to least missing.
missing = null_mask.sum()
missing = missing[missing > 0].sort_values(ascending=False)

print("\nTop Missing Columns:\n")
print(missing.head(20))

# Drawing a bar chart from an empty Series fails, so the chart is skipped
# when the dataset has no missing values at all.
if missing.empty:
    print("No columns with missing values; skipping bar chart.")
else:
    plt.figure(figsize=(12,6))
    missing.head(20).plot(kind='bar')
    plt.title("Top 20 Columns with Missing Values")
    plt.show()

# Same ranking expressed as a percentage of the number of rows.
missing_percent = (null_mask.sum() / len(df)) * 100
missing_percent = missing_percent[missing_percent > 0].sort_values(ascending=False)

print("\nMissing Percentage:\n")
print(missing_percent.head(20))

# Cell-level view of the mask: one heatmap row per dataset row, one
# heatmap column per dataset column.
plt.figure(figsize=(12,6))
sns.heatmap(null_mask, cbar=False)
plt.title("Missing Value Heatmap")
plt.show()

# ==============================
# Numerical Analysis
# ==============================

# Labels of every numeric column; the later sections of the report
# index the dataframe with this selection.
num_cols = df.select_dtypes(include=np.number).columns

print("\nSkewness & Kurtosis:\n")

# One row per numeric column with its skewness and kurtosis, ordered
# from the most positively skewed column downwards.
numeric_frame = df[num_cols]
shape_stats = {
    "Skewness": numeric_frame.skew(),
    "Kurtosis": numeric_frame.kurt(),
}
skew_kurt = pd.DataFrame(shape_stats)
skew_kurt = skew_kurt.sort_values(by="Skewness", ascending=False)

print(skew_kurt.head(15))

# ==============================
# Distribution Plots
# ==============================

# Histogram with a KDE overlay for each of the first three numeric
# columns, one figure per column.
for column_name in num_cols[:3]:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[column_name], kde=True, ax=ax)
    ax.set_title(f"Distribution of {column_name}")
    plt.show()

# ==============================
# Correlation Heatmap
# ==============================

# Pairwise correlations among the first 20 numeric columns only, drawn
# as a heatmap.
fig, ax = plt.subplots(figsize=(12, 8))
leading_corr = df[num_cols[:20]].corr()
sns.heatmap(leading_corr, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# ==============================
# Strong Correlation Pairs
# ==============================

# Absolute pairwise correlations between all numeric columns.
corr_matrix = df[num_cols].corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair of columns appears once and self-correlations are left out; every
# other cell becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Flatten to a Series indexed by (column_a, column_b). The masked cells
# and any undefined correlations are NaN; drop them explicitly rather than
# relying on stack() to do it, since the newer pandas stack implementation
# keeps NaN values that exist in the input.
strong_pairs = upper.stack().dropna().sort_values(ascending=False)

print("\nTop Strong Correlation Pairs:\n")
print(strong_pairs.head(15))

# ==============================
# Outlier Detection
# ==============================

# Side-by-side boxplots of the first five numeric columns on one shared
# axis; tick labels are rotated so the column names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
first_five = df[num_cols[:5]]
sns.boxplot(data=first_five, ax=ax)
plt.xticks(rotation=45)
ax.set_title("Boxplot for Outlier Detection")
plt.show()

# ==============================
# Variance Analysis
# ==============================

# Variance of every numeric column, ranked from largest to smallest.
column_variances = df[num_cols].var()
variance = column_variances.sort_values(ascending=False)

print("\nTop Variance Features:\n")
print(variance.head(15))

# ==============================
# Data Quality Score
# ==============================

# The score is the percentage of cells in the dataframe that are not
# missing: 100 minus the missing share expressed in percent.
total_cells = df.size  # rows * columns
missing_cells = df.isnull().sum().sum()

missing_share = missing_cells / total_cells
data_quality = 100 - (missing_share * 100)

print(f"\nOverall Data Quality Score: {data_quality:.2f}%")

# ==============================
# PCA Visualization
# ==============================

# Prepare the numeric columns for PCA.
# Columns with no observed values carry no information and have no
# median, so they are dropped first. Remaining gaps are filled with the
# column median rather than 0: a literal zero can lie far outside a
# column's real range and would distort the standardised data and the
# resulting projection.
num_data = df[num_cols].dropna(axis=1, how="all")
num_data = num_data.fillna(num_data.median())

if num_data.shape[0] < 2 or num_data.shape[1] < 2:
    # A 2-component PCA needs at least two samples and two features.
    print("\nSkipping PCA: need at least 2 rows and 2 numeric columns with data.")
else:
    # Standardise each column so that no single column dominates the
    # components purely because of its scale.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(num_data)

    # Project onto the first two principal components.
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)

    plt.figure(figsize=(8,6))
    plt.scatter(pca_result[:,0], pca_result[:,1], alpha=0.5)
    plt.title("PCA Projection (2D)")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.show()

print("\n===== DATA DESCRIPTIVE REPORT COMPLETED =====")