# Descriptive Statistics

This script does the following:

- Defines two functions: **descriptive_stats_numeric** for continuous data and **descriptive_stats_categorical** for categorical data.
- The descriptive_stats_numeric function calculates various statistics, including Tukey's fences for outlier detection.
- The descriptive_stats_categorical function calculates statistics relevant to categorical data, such as the mode, the number of unique values, and the frequencies of the most common categories.
- Provides an analyze_dataset function that applies the appropriate statistics function to each column based on its data type.
- In the example usage, creates a sample dataset with mixed data types (numeric and categorical).
- Applies the analyze_dataset function to that DataFrame and prints the results.

Key points:

- The script automatically detects whether a column is numeric or categorical using pd.api.types.is_numeric_dtype().
- For numeric columns, it calculates statistics like mean, median, standard deviation, and uses Tukey's fences for outlier detection.
- For categorical columns, it provides information like the number of unique values, mode, and counts of the most common categories.
- We use dropna() when calculating statistics to handle any potential missing values.

To use this with your own dataset:

- Load your data into a pandas DataFrame.
- Call the analyze_dataset(df) function with your DataFrame.
- The function will return a dictionary with statistics for each column.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

def descriptive_stats_numeric(data, tukey_factor=1.5):
    """Calculate descriptive statistics for a numeric pandas Series.

    Missing values are dropped before anything is computed, so the function
    gives meaningful results even when the caller has not called ``dropna()``.
    Outliers are counted with Tukey's fences: values below
    ``q1 - tukey_factor * iqr`` or above ``q3 + tukey_factor * iqr``.

    Args:
        data: Numeric pandas Series; its ``name`` is reported as ``column``.
        tukey_factor: Multiplier applied to the IQR to build the fences.

    Returns:
        dict with the keys ``column``, ``n``, ``min``, ``max``, ``mean``,
        ``std``, ``median``, ``q1``, ``q3``, ``iqr``, ``skewness``,
        ``kurtosis``, ``lower_fence``, ``upper_fence`` and ``n_outliers``.
        When no non-missing values are present, ``n`` and ``n_outliers`` are
        0 and every other statistic is NaN.
    """
    values = data.dropna()

    if values.empty:
        # stats.describe rejects empty input, so report an explicit
        # "nothing to summarise" row with the same keys instead of raising.
        nan = float("nan")
        return {
            "column": data.name,
            "n": 0,
            "min": nan,
            "max": nan,
            "mean": nan,
            "std": nan,
            "median": nan,
            "q1": nan,
            "q3": nan,
            "iqr": nan,
            "skewness": nan,
            "kurtosis": nan,
            "lower_fence": nan,
            "upper_fence": nan,
            "n_outliers": 0,
        }

    desc = stats.describe(values)
    q1, median, q3 = np.percentile(values, [25, 50, 75])
    iqr = q3 - q1
    lower_fence = q1 - tukey_factor * iqr
    upper_fence = q3 + tukey_factor * iqr
    is_outlier = (values < lower_fence) | (values > upper_fence)

    return {
        "column": data.name,
        "n": desc.nobs,
        "min": desc.minmax[0],
        "max": desc.minmax[1],
        "mean": desc.mean,
        # stats.describe reports the variance; std is its square root.
        "std": np.sqrt(desc.variance),
        "median": median,
        "q1": q1,
        "q3": q3,
        "iqr": iqr,
        "skewness": desc.skewness,
        "kurtosis": desc.kurtosis,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "n_outliers": int(is_outlier.sum()),
    }

def descriptive_stats_categorical(data):
    """Calculate descriptive statistics for a categorical pandas Series.

    Args:
        data: pandas Series of category-like values; its ``name`` is
            reported as ``column``.

    Returns:
        dict with the keys ``column``, ``n`` (length of ``data``),
        ``n_unique``, ``mode``, ``mode_count``, ``second_most_common`` and
        ``second_most_common_count``. Entries that do not exist (no values
        at all, or only one distinct value) are ``None``.
    """
    value_counts = data.value_counts()
    modes = data.mode()

    if modes.empty:
        # No non-missing values: there is no mode to report, so avoid
        # indexing into empty results.
        mode = mode_count = None
        second = second_count = None
    else:
        mode = modes.iloc[0]
        # value_counts is sorted by frequency, so its first entry carries the
        # highest count, which is the count of every mode.
        mode_count = value_counts.iloc[0]
        # When the top counts tie, mode() and value_counts() may order the
        # tied values differently; exclude the reported mode explicitly so it
        # is never listed again as the runner-up.
        runners_up = value_counts[value_counts.index != mode]
        if runners_up.empty:
            second = second_count = None
        else:
            second = runners_up.index[0]
            second_count = runners_up.iloc[0]

    return {
        "column": data.name,
        "n": len(data),
        "n_unique": data.nunique(),
        "mode": mode,
        "mode_count": mode_count,
        "second_most_common": second,
        "second_most_common_count": second_count,
    }

def analyze_dataset_numerical(df):
    """Summarise every numeric column of ``df``.

    Each numeric column has its missing values dropped and is passed to
    ``descriptive_stats_numeric``; the resulting dicts become the rows of
    the returned DataFrame, in column order.
    """
    numeric_columns = df.select_dtypes(include=np.number).columns
    summaries = [
        descriptive_stats_numeric(df[name].dropna())
        for name in numeric_columns
    ]
    return pd.DataFrame(summaries)

def analyze_dataset_categorical(df):
    """Summarise every non-numeric column of ``df``.

    Each non-numeric column has its missing values dropped and is passed to
    ``descriptive_stats_categorical``; the resulting dicts become the rows
    of the returned DataFrame, in column order.
    """
    categorical_columns = df.select_dtypes(exclude=np.number).columns
    summaries = [
        descriptive_stats_categorical(df[name].dropna())
        for name in categorical_columns
    ]
    return pd.DataFrame(summaries)



def analyze_dataset(df):
    """Return a dict mapping each column name to its statistics dict.

    Columns with a numeric dtype are summarised by
    ``descriptive_stats_numeric`` and all others by
    ``descriptive_stats_categorical``; missing values are dropped first.
    """
    summaries = {}

    for name in df.columns:
        # Pick the summariser from the column's dtype, then apply it.
        if pd.api.types.is_numeric_dtype(df[name]):
            summarise = descriptive_stats_numeric
        else:
            summarise = descriptive_stats_categorical
        summaries[name] = summarise(df[name].dropna())

    return summaries

def missing_data(df):
    """Summarise dtype, distinct values and missing data for each column.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``name of column``, ``types``, ``unique_data``, ``missing value``
        (number of nulls) and ``missing percentage`` (nulls as a percentage
        of the row count, rounded to two decimals; NaN when ``df`` has no
        rows). A frame without columns yields an empty result that still
        carries these five column labels.
    """
    overview_columns = [
        'name of column', 'types', 'unique_data', 'missing value',
        "missing percentage",
    ]
    # The row count is the same for every column, so compute it once.
    total_rows = len(df)

    overview_rows = []
    for name in df.columns:
        column = df[name]
        missing_count = column.isnull().sum()
        if total_rows:
            missing_percentage = round(missing_count / total_rows * 100, 2)
        else:
            # A percentage of zero rows is undefined.
            missing_percentage = float("nan")
        overview_rows.append(
            [name, column.dtypes, column.nunique(), missing_count,
             missing_percentage]
        )

    # Passing the labels at construction keeps this working when there are
    # no rows to infer the width from.
    return pd.DataFrame(overview_rows, columns=overview_columns)

def boxplot_df(df):
    """Draw one boxplot per column of ``df``, except the last column.

    The plots are laid out on a grid four columns wide. The grid has at
    least four rows (a 4x4 layout) and gains rows when more than 16 columns
    are plotted, so wide frames no longer run past the last subplot slot.

    NOTE(review): the last column is skipped, presumably because it holds a
    target/label -- confirm against the callers.
    """
    import math
    import matplotlib.pyplot as plt
    import seaborn as sns

    plotted_columns = df.columns[:-1]
    n_cols = 4
    n_rows = max(4, math.ceil(len(plotted_columns) / n_cols))

    # Five inches of height per row keeps the 20x20 figure of the 4x4 case.
    plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows))
    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.8)

    for position, column in enumerate(plotted_columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(df[column], color='aqua', flierprops=dict(markerfacecolor='0.10', markersize=5))

def plot_correlation_matrix(df):
    """Plot an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns are correlated. Non-numeric columns
    are left out so that mixed-type frames do not make ``DataFrame.corr``
    raise on pandas versions that no longer drop them automatically.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.figure(figsize=(15, 8), facecolor='lightblue')
    numeric_df = df.select_dtypes(include=["number", "bool"])
    sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f")
