<a href="https://colab.research.google.com/github/Abraham-Nispel/Maths_and_Stats/blob/main/STATS/01_Statistics_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Statistics
STATISTICAL INFERENCE, PARAMETER
ESTIMATION, AND MODEL VERIFICATION
## References:
1. *Fundamentals of Probability and Statistics for Engineers* by T. T. Soong

In [None]:
#@title  Upload libraries

# Data science libraries
import math
import numpy as np
import pandas as pd

# Machine learning preprocessing
# Machine learning modelling
# Machine learning validation

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global random generator for reproducibility
np.random.seed(0)

### Outliers
Two methods are used to detect outliers:
* The interquartile range (IQR) method
* The z-score method

In [1]:
#@title Load libraries and functions
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Functions
def outliers(df, n, features, iqr_factor=1.5):
    """
    Find outliers in the given columns with the IQR method and the z-score method.

    For each feature, a value is an IQR outlier when it lies strictly below
    ``Q1 - iqr_factor * IQR`` or strictly above ``Q3 + iqr_factor * IQR``.
    It is a z-score outlier when ``|value - mean| / std`` is strictly greater
    than ``n``, where ``mean`` and ``std`` come from ``Series.mean()`` and
    ``Series.std()`` of that column.

    Args:
        df (pd.DataFrame): The dataframe containing the data.
        n (int or float): The number of standard deviations to use as the
            threshold for the z-score method.
        features (list): Column names of ``df`` to compute the outliers for.
        iqr_factor (float): Multiplier applied to the IQR to build the fences.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        tuple: Two dataframes ``(df_outliers_iqr, df_outliers_zscore)``, each
        with the columns ``'Index'`` (row label in ``df``), ``'Feature_Name'``
        and ``'Outlier_Value'``. A dataframe has no rows when the
        corresponding method flags nothing.
    """
    result_columns = ['Index', 'Feature_Name', 'Outlier_Value']

    # Initialize lists to store outlier records
    outliers_iqr_records = []
    outliers_zscore_records = []

    for col in features:
        values = df[col]

        # IQR method
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_factor * iqr
        upper_bound = q3 + iqr_factor * iqr
        iqr_mask = (values < lower_bound) | (values > upper_bound)
        # Read the flagged values straight from the column. Iterating whole
        # rows with iterrows() would coerce each row to a single dtype (e.g.
        # ints become floats next to a float column) and touch every column.
        outliers_iqr_records.extend(
            [index, col, value] for index, value in values[iqr_mask].items()
        )

        # Z-score method
        z_scores = (values - values.mean()) / values.std()
        z_mask = z_scores.abs() > n
        outliers_zscore_records.extend(
            [index, col, value] for index, value in values[z_mask].items()
        )

    # Convert the lists of records into DataFrames
    df_outliers_iqr = pd.DataFrame(outliers_iqr_records, columns=result_columns)
    df_outliers_zscore = pd.DataFrame(outliers_zscore_records, columns=result_columns)

    return df_outliers_iqr, df_outliers_zscore

In [None]:
#@title Example 1 Outliers (part 1)
# Sample data: seaborn's "tips" dataset (swap in your own DataFrame if desired)
in_data = sns.load_dataset("tips")

# Boxplot of the numerical bill amount (y) grouped by the categorical day (x)
ax = sns.boxplot(data=in_data, x="day", y="total_bill")
# Label the axes, add a title and render the figure
ax.set_xlabel("Day of the Week")
ax.set_ylabel("Total Bill ($)")
ax.set_title("Distribution of Total Bill Amounts by Day")
plt.show()

# Keep the day and the bill amount, plus a second copy of the bill column
new_df = in_data[['day', 'total_bill']].assign(total_bill_2=in_data['total_bill'])
new_df.head()

In [None]:
#@title Example 1 Outliers (part 2)
# Detect outliers in both bill columns of `new_df` with a z-score threshold of 2
df_outliers_iqr, df_outliers_zscore = outliers(new_df, 2, ['total_bill', 'total_bill_2'])
print("IQR Method Outliers:\n", df_outliers_iqr)
print("\nZ-Score Method Outliers:\n", df_outliers_zscore)

# Apply the interquartile range and z-score methods to a single feature with a
# stricter z-score threshold of 3. `outliers` already reports the row index of
# each outlier; the previously called `outliers_with_index` is not defined in
# this notebook and raised a NameError on a fresh kernel.
iqr_df, z_df = outliers(new_df, 3, ['total_bill'])
print(iqr_df)
print(z_df)