<a href="https://colab.research.google.com/github/BaronVonBussin/Stuff/blob/main/scatterplots_and_basic_stats_20241219.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NOTE THAT THIS DOES NOT RENDER THE CHARTS; RATHER, THEY'RE SAVED TO .PNG IN FOLDER.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

# Generate datasets
np.random.seed(42)
n = 200

# Dataset A: Base series with moderate noise
t = np.linspace(0, 20, n)
A = np.sin(t/2) + np.random.normal(0, 0.5, n)

# Dataset B: Strongly positively correlated with A
B = A * 1.2 + np.random.normal(0, 0.3, n)

# Dataset C: Weakly negatively correlated with A
C = -A * 0.3 + np.sin(t) + np.random.normal(0, 1, n)

# Dataset D: Uncorrelated with A
D = np.cos(t/1.5) + np.random.normal(0, 0.8, n)

# Create DataFrame
df = pd.DataFrame({
    'A': A,
    'B': B,
    'C': C,
    'D': D
})

# Calculate statistics
def calculate_statistics(x, y):
    correlation = np.corrcoef(x, y)[0,1]
    covariance = np.cov(x, y)[0,1]
    x_std = np.std(x)
    y_std = np.std(y)
    x_var = np.var(x)
    y_var = np.var(y)

    return {
        'Correlation': correlation,
        'Covariance': covariance,
        'X Std Dev': x_std,
        'Y Std Dev': y_std,
        'X Variance': x_var,
        'Y Variance': y_var
    }

# Calculate statistics for each pair
ab_stats = calculate_statistics(A, B)
ac_stats = calculate_statistics(A, C)
ad_stats = calculate_statistics(A, D)

# Print results
print("A vs B Statistics:")
print("==================")
for key, value in ab_stats.items():
    print(f"{key}: {value:.4f}")

print("\nA vs C Statistics:")
print("==================")
for key, value in ac_stats.items():
    print(f"{key}: {value:.4f}")

print("\nA vs D Statistics:")
print("==================")
for key, value in ad_stats.items():
    print(f"{key}: {value:.4f}")

# Calculate number of standard deviations
def count_std_deviations(series):
    mean = np.mean(series)
    std = np.std(series)
    deviations = abs(series - mean) / std
    return {
        '1 std': np.mean(deviations <= 1),
        '2 std': np.mean(deviations <= 2),
        '3 std': np.mean(deviations <= 3)
    }

print("\nStandard Deviation Distribution:")
print("================================")
for series_name in ['A', 'B', 'C', 'D']:
    std_counts = count_std_deviations(df[series_name])
    print(f"\n{series_name}:")
    for std, proportion in std_counts.items():
        print(f"Within {std}: {proportion:.2%}")

A vs B Statistics:
Correlation: 0.9564
Covariance: 0.7797
X Std Dev: 0.8020
Y Std Dev: 1.0113
X Variance: 0.6433
Y Variance: 1.0227

A vs C Statistics:
Correlation: -0.2732
Covariance: -0.2899
X Std Dev: 0.8020
Y Std Dev: 1.3163
X Variance: 0.6433
Y Variance: 1.7326

A vs D Statistics:
Correlation: -0.2476
Covariance: -0.2263
X Std Dev: 0.8020
Y Std Dev: 1.1340
X Variance: 0.6433
Y Variance: 1.2859

Standard Deviation Distribution:

A:
Within 1 std: 63.50%
Within 2 std: 95.50%
Within 3 std: 100.00%

B:
Within 1 std: 67.00%
Within 2 std: 95.00%
Within 3 std: 100.00%

C:
Within 1 std: 67.50%
Within 2 std: 96.00%
Within 3 std: 99.50%

D:
Within 1 std: 62.00%
Within 2 std: 97.00%
Within 3 std: 100.00%


In [2]:
import numpy as np
import pandas as pd
from scipy import stats

# Generate the same datasets as in the visualization
np.random.seed(42)
n = 200
t = np.linspace(0, 20, n)

# Generate datasets with distinct characteristics
A = np.sin(t) + np.random.normal(0, 0.5, n)
B = A * 1.2 + np.random.normal(0, 0.3, n)  # Strong positive correlation
C = -A * 0.3 + np.sin(t/2) + np.random.normal(0, 1, n)  # Weak negative correlation
D = np.cos(t/1.5) + np.random.normal(0, 0.8, n)  # Uncorrelated

df = pd.DataFrame({
    'A': A, 'B': B, 'C': C, 'D': D
})

def analyze_distribution(series):
    return {
        'mean': np.mean(series),
        'std': np.std(series),
        'skewness': stats.skew(series),
        'kurtosis': stats.kurtosis(series),
        'normality_test': stats.normaltest(series)
    }

def analyze_relationship(x, y):
    return {
        'correlation': np.corrcoef(x, y)[0,1],
        'covariance': np.cov(x, y)[0,1],
        'r_squared': np.corrcoef(x, y)[0,1]**2,
        'spearman_corr': stats.spearmanr(x, y).correlation
    }

# Individual distributions
print("Individual Distribution Analysis:")
print("===============================")
for col in df.columns:
    stats_dict = analyze_distribution(df[col])
    print(f"\nSeries {col}:")
    print(f"Mean: {stats_dict['mean']:.4f}")
    print(f"Std Dev: {stats_dict['std']:.4f}")
    print(f"Skewness: {stats_dict['skewness']:.4f}")
    print(f"Kurtosis: {stats_dict['kurtosis']:.4f}")
    k2, p_value = stats_dict['normality_test']
    print(f"Normality test p-value: {p_value:.4f}")

# Relationships with A
print("\nRelationships with Series A:")
print("===========================")
for col in ['B', 'C', 'D']:
    rel_stats = analyze_relationship(df['A'], df[col])
    print(f"\nA vs {col}:")
    print(f"Correlation: {rel_stats['correlation']:.4f}")
    print(f"Covariance: {rel_stats['covariance']:.4f}")
    print(f"R-squared: {rel_stats['r_squared']:.4f}")
    print(f"Spearman correlation: {rel_stats['spearman_corr']:.4f}")

Individual Distribution Analysis:

Series A:
Mean: 0.0113
Std Dev: 0.8010
Skewness: -0.0814
Kurtosis: -0.7443
Normality test p-value: 0.0043

Series B:
Mean: 0.0393
Std Dev: 1.0035
Skewness: 0.0110
Kurtosis: -0.4160
Normality test p-value: 0.4199

Series C:
Mean: 0.0925
Std Dev: 1.2343
Skewness: -0.0225
Kurtosis: -0.4357
Normality test p-value: 0.3672

Series D:
Mean: 0.0632
Std Dev: 1.1340
Skewness: -0.1477
Kurtosis: -0.6414
Normality test p-value: 0.0265

Relationships with Series A:

A vs B:
Correlation: 0.9557
Covariance: 0.7720
R-squared: 0.9134
Spearman correlation: 0.9565

A vs C:
Correlation: -0.2094
Covariance: -0.2080
R-squared: 0.0438
Spearman correlation: -0.2046

A vs D:
Correlation: 0.0679
Covariance: 0.0620
R-squared: 0.0046
Spearman correlation: 0.0755


In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats


# Generate the same datasets as in the visualization
np.random.seed(42)
n = 200
t = np.linspace(0, 20, n)

# Generate datasets with distinct characteristics
A = np.sin(t) + np.random.normal(0, 0.5, n)
B = A * 1.2 + np.random.normal(0, 0.3, n)  # Strong positive correlation
C = -A * 0.3 + np.sin(t/2) + np.random.normal(0, 1, n)  # Weak negative correlation
D = np.cos(t/1.5) + np.random.normal(0, 0.8, n)  # Uncorrelated

df = pd.DataFrame({
    'A': A, 'B': B, 'C': C, 'D': D
})

def analyze_distribution(series):
    return {
        'mean': np.mean(series),
        'std': np.std(series),
        'skewness': stats.skew(series),
        'kurtosis': stats.kurtosis(series),
        'normality_test': stats.normaltest(series)
    }

def analyze_relationship(x, y):
    return {
        'correlation': np.corrcoef(x, y)[0,1],
        'covariance': np.cov(x, y)[0,1],
        'r_squared': np.corrcoef(x, y)[0,1]**2,
        'spearman_corr': stats.spearmanr(x, y).correlation
    }

# Individual distributions
print("Individual Distribution Analysis:")
print("===============================")
for col in df.columns:
    stats_dict = analyze_distribution(df[col])
    print(f"\nSeries {col}:")
    print(f"Mean: {stats_dict['mean']:.4f}")
    print(f"Std Dev: {stats_dict['std']:.4f}")
    print(f"Skewness: {stats_dict['skewness']:.4f}")
    print(f"Kurtosis: {stats_dict['kurtosis']:.4f}")
    k2, p_value = stats_dict['normality_test']
    print(f"Normality test p-value: {p_value:.4f}")

# Relationships with A
print("\nRelationships with Series A:")
print("===========================")
for col in ['B', 'C', 'D']:
    rel_stats = analyze_relationship(df['A'], df[col])
    print(f"\nA vs {col}:")
    print(f"Correlation: {rel_stats['correlation']:.4f}")
    print(f"Covariance: {rel_stats['covariance']:.4f}")
    print(f"R-squared: {rel_stats['r_squared']:.4f}")
    print(f"Spearman correlation: {rel_stats['spearman_corr']:.4f}")

Individual Distribution Analysis:

Series A:
Mean: 0.0113
Std Dev: 0.8010
Skewness: -0.0814
Kurtosis: -0.7443
Normality test p-value: 0.0043

Series B:
Mean: 0.0393
Std Dev: 1.0035
Skewness: 0.0110
Kurtosis: -0.4160
Normality test p-value: 0.4199

Series C:
Mean: 0.0925
Std Dev: 1.2343
Skewness: -0.0225
Kurtosis: -0.4357
Normality test p-value: 0.3672

Series D:
Mean: 0.0632
Std Dev: 1.1340
Skewness: -0.1477
Kurtosis: -0.6414
Normality test p-value: 0.0265

Relationships with Series A:

A vs B:
Correlation: 0.9557
Covariance: 0.7720
R-squared: 0.9134
Spearman correlation: 0.9565

A vs C:
Correlation: -0.2094
Covariance: -0.2080
R-squared: 0.0438
Spearman correlation: -0.2046

A vs D:
Correlation: 0.0679
Covariance: 0.0620
R-squared: 0.0046
Spearman correlation: 0.0755


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set style
plt.style.use('ggplot')
#sns.set_palette("darkgrid")
sns.set_theme(style="darkgrid")

# Generate datasets
np.random.seed(42)
n = 200
t = np.linspace(0, 20, n)

# Generate datasets with distinct characteristics
A = np.sin(t) + np.random.normal(0, 0.5, n)
B = A * 1.2 + np.random.normal(0, 0.3, n)  # Strong positive correlation
C = -A * 0.3 + np.sin(t/2) + np.random.normal(0, 1, n)  # Weak negative correlation
D = np.cos(t/1.5) + np.random.normal(0, 0.8, n)  # Uncorrelated

df = pd.DataFrame({
    'A': A, 'B': B, 'C': C, 'D': D
})

# Create figure for histograms
plt.figure(figsize=(15, 10))
for i, col in enumerate(['A', 'B', 'C', 'D'], 1):
    plt.subplot(2, 2, i)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of Series {col}')
    plt.xlabel('Value')
    plt.ylabel('Count')
plt.tight_layout()
plt.savefig('distributions.png')
plt.close()

# Create scatter plots
plt.figure(figsize=(15, 10))
for i, col in enumerate(['B', 'C', 'D'], 1):
    plt.subplot(2, 2, i)
    plt.scatter(df['A'], df[col], alpha=0.5)
    plt.xlabel('Series A')
    plt.ylabel(f'Series {col}')

    # Add correlation coefficient
    corr = df['A'].corr(df[col])
    plt.title(f'A vs {col} (correlation: {corr:.3f})')

    # Add regression line
    z = np.polyfit(df['A'], df[col], 1)
    p = np.poly1d(z)
    plt.plot(df['A'], p(df['A']), "r--", alpha=0.8)

plt.tight_layout()
plt.savefig('correlations.png')
plt.close()

# Create correlation heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='RdBu', vmin=-1, vmax=1, center=0)
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# Create box plots
plt.figure(figsize=(10, 6))
df.boxplot()
plt.title('Distribution Comparison (Box Plots)')
plt.ylabel('Value')
plt.savefig('boxplots.png')
plt.close()

# Calculate and print statistics
print("\nDescriptive Statistics:")
print("=====================")
print(df.describe())

print("\nCorrelation Matrix:")
print("=================")
print(df.corr())

# Standard deviation analysis
print("\nStandard Deviation Analysis:")
print("=========================")
for col in df.columns:
    data = df[col]
    mean = data.mean()
    std = data.std()
    within_1_std = np.mean(abs(data - mean) <= std)
    within_2_std = np.mean(abs(data - mean) <= 2*std)
    within_3_std = np.mean(abs(data - mean) <= 3*std)

    print(f"\nSeries {col}:")
    print(f"Within 1 std: {within_1_std:.1%}")
    print(f"Within 2 std: {within_2_std:.1%}")
    print(f"Within 3 std: {within_3_std:.1%}")


Descriptive Statistics:
                A           B           C           D
count  200.000000  200.000000  200.000000  200.000000
mean     0.011320    0.039344    0.092538    0.063240
std      0.803014    1.005989    1.237439    1.136819
min     -1.957604   -2.414429   -3.171960   -3.155739
25%     -0.634151   -0.794815   -0.814597   -0.735946
50%      0.033985    0.070107    0.094832   -0.042581
75%      0.648075    0.788394    0.977188    1.001905
max      1.664908    2.659870    3.066161    2.547985

Correlation Matrix:
          A         B         C         D
A  1.000000  0.955704 -0.209359  0.067899
B  0.955704  1.000000 -0.215271  0.067664
C -0.209359 -0.215271  1.000000 -0.089791
D  0.067899  0.067664 -0.089791  1.000000

Standard Deviation Analysis:

Series A:
Within 1 std: 60.0%
Within 2 std: 96.5%
Within 3 std: 100.0%

Series B:
Within 1 std: 65.5%
Within 2 std: 96.5%
Within 3 std: 100.0%

Series C:
Within 1 std: 67.5%
Within 2 std: 97.0%
Within 3 std: 100.0%

Series D:
W