# Imports

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the input CSV file.
# TODO: set this to the dataset path before running the notebook.
DATA_PATH = ''

# Fail fast with an actionable message rather than letting pandas raise an
# opaque file-not-found error for an empty path.
if not DATA_PATH:
    raise ValueError("DATA_PATH is empty: set it to the path of the CSV file to analyse.")

# Load the raw dataset; every later cell reads from `df`.
df = pd.read_csv(DATA_PATH)

# Preview the first 10 rows as a sanity check on the parsing.
df.head(10)

### Información del DataSet

In [None]:
# Print pandas' built-in summary of `df` (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Structural profile of `df`: size, missing data, duplicated rows and cardinality.
# All figures are computed first, then reported in a fixed order.
null_counts = df.isnull().sum()
null_pct = df.isnull().mean() * 100  # share of missing values per column, in percent
duplicate_rows = df.duplicated().sum()
unique_counts = df.nunique()

print("1. Dimensiones (Shape):", df.shape, "\n")
print("\n2. Valores Nulos por Columna:\n", null_counts)
print("\nPorcentaje de nulos por columna:\n", null_pct)
print("\n3. Filas Duplicadas Totales:", duplicate_rows, "\n")
print("4. Valores Únicos por Columna:\n", unique_counts)

### Estadísticas Descriptivas



In [None]:
# Summary statistics for `df`: describe() plus mode, variance and range
# of the numeric columns.
summary_table = df.describe()
first_mode = df.mode(numeric_only=True).iloc[0]  # first row of the mode table only
variance = df.var(numeric_only=True)
value_range = df.max(numeric_only=True) - df.min(numeric_only=True)

print("Estadísticas Descriptivas:")
print(summary_table)

# Each extra statistic is printed as "<heading> <table>".
for heading, table in (
    ("\nModa:\n", first_mode),
    ("\nVarianza:\n", variance),
    ("\nRango:\n", value_range),
):
    print(heading, table)

### Percentiles y Cuartiles

In [None]:
# Quantile level -> label used when reporting it. Insertion order is the print order.
quantile_labels = {
    0.25: "Q1 (25th)",
    0.50: "Q2 (50th/Median)",
    0.75: "Q3 (75th)",
    0.90: "P90",
    0.95: "P95",
    0.99: "P99",
}

# One row per requested quantile, one column per numeric column of `df`.
percentiles = df.quantile(list(quantile_labels), numeric_only=True)

for col in percentiles.columns:
    print(f"\nPercentiles para: {col}")
    for level, label in quantile_labels.items():
        print(f"{label}: {percentiles.loc[level, col]:.2f}")

### Outliers con IQR

In [None]:
# IQR-based outlier detection on every numeric column of `df`.
# A value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3
# of its own column.
numeric_cols = df.select_dtypes(include='number').columns

q1 = df[numeric_cols].quantile(0.25)
q3 = df[numeric_cols].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Boolean frame: True where a cell is an outlier for its column.
# Comparisons against NaN are False, so missing values are never flagged.
outlier_mask = (df[numeric_cols] < lower_bound) | (df[numeric_cols] > upper_bound)

# `df_clean` keeps only the rows with no outlier in ANY numeric column.
condition = ~outlier_mask.any(axis=1)
df_clean = df[condition]

for col in numeric_cols:
    col_is_outlier = outlier_mask[col]
    outliers = df[col_is_outlier]

    # Exclude only this column's own outliers from its mean. Using `df_clean`
    # here would also drop rows flagged because of unrelated columns, which
    # distorts the statistic and yields NaN when every row has some outlier.
    mean_without_outliers = df.loc[~col_is_outlier, col].mean()

    print(f"\n--- Outlier Analysis: {col} ---")
    print(f"Outlier Count: {len(outliers)}")
    print(f"Outlier Percentage: {len(outliers) / len(df) * 100:.2f}%")
    print(f"Mean without outliers: {mean_without_outliers:.2f}")

### Correlation Matrix

In [None]:
# Pairwise correlation between the numeric columns of `df`,
# shown both as a table and as an annotated heatmap.
correlation_matrix = df.corr(numeric_only=True)

print("\nCorrelation Matrix:")
print(correlation_matrix)

# center=0 pins the diverging colormap's midpoint at zero correlation.
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

### Distribución de frecuencia

In [None]:
# For each numeric column: a 5-bin frequency table plus a 20-bin histogram.
# Note: this adds a `<col>_range` categorical column to `df` for every column processed.
for col in numeric_cols:
    range_col = f'{col}_range'
    df[range_col] = pd.cut(df[col], bins=5)

    # Count and mean of the column inside each bin; observed=False keeps empty bins.
    freq_dist = (
        df.groupby(range_col, observed=False)[col]
        .agg(frequency='count', avg_in_range='mean')
        .round(2)
    )
    total = freq_dist['frequency'].sum()
    freq_dist['percentage'] = freq_dist['frequency'].div(total).mul(100).round(2)

    print(f"\nFrequency Distribution para: {col}")
    print(freq_dist)

    hist_ax = df[col].hist(bins=20, figsize=(10, 6), color='skyblue', edgecolor='black')
    hist_ax.set_title(f'{col} Distribution')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')
    plt.show()

### Z-Score Analysis

In [None]:
# Z-score anomaly detection for each numeric column.
# Adds two columns to `df` per analysed column:
#   <col>_z_score      - standardised value (NaNs are ignored when computing mean/std)
#   <col>_anomaly_flag - 'Normal' (|z| <= 2), 'Moderate' (2 < |z| <= 3), 'Extreme' (|z| > 3)
for col in numeric_cols:
    z_col = f'{col}_z_score'
    flag_col = f'{col}_anomaly_flag'

    df[z_col] = stats.zscore(df[col], nan_policy='omit')
    abs_z = df[z_col].abs()

    # include_lowest=True closes the first bin on the left, so a value exactly
    # at the mean (|z| == 0) is labelled 'Normal' instead of getting a NaN flag.
    df[flag_col] = pd.cut(
        abs_z,
        bins=[0, 2, 3, float('inf')],
        labels=['Normal', 'Moderate', 'Extreme'],
        include_lowest=True,
    )

    # Rows beyond 2 standard deviations, most extreme (largest |z|) first.
    anomalies = df[abs_z > 2].sort_values(z_col, ascending=False, key=abs)

    print(f"\nAnomalies Detected en {col}: {len(anomalies)}")
    print(anomalies[[col, z_col, flag_col]].head(10))

    # Scatter of z-scores by row index, coloured by magnitude, with the
    # +/-2 and +/-3 standard-deviation thresholds drawn as reference lines.
    plt.figure(figsize=(12, 6))
    plt.scatter(df.index, df[z_col], c=abs_z, cmap='Reds', alpha=0.6)
    plt.axhline(y=2, color='orange', linestyle='--', label='±2 SD')
    plt.axhline(y=-2, color='orange', linestyle='--')
    plt.axhline(y=3, color='red', linestyle='--', label='±3 SD')
    plt.axhline(y=-3, color='red', linestyle='--')
    plt.title(f'Z-Score Distribution - Anomaly Detection ({col})')
    plt.ylabel(f'{col} Z-Score')
    plt.xlabel('Index')
    plt.legend()
    plt.show()