Import standard libraries we need.

In [38]:
import sys
import math
import numpy  as np


  Calculate the mean of a list of numbers.

  Parameters:
  data (list): A list of numbers.

  Returns:
  float: The mean of the numbers in the list.

In [39]:
def mean(data):
  return np.sum(data) / len(data)

  Calculate the median of a list of numbers.

  Parameters:
  data (list): A list of numbers.

  Returns:
  float: The median value of the input data.

  Examples:
  ```python
  median([1, 2, 3, 4, 5])
  3
  median([1, 2, 3, 4, 5, 6])
  3.5
  ```

In [40]:
def median(data):
  sorted_data = np.sort(data)
  n = len(sorted_data)
  mid = n // 2
  if n % 2 == 0:
    return (sorted_data[mid - 1] + sorted_data[mid]) / 2
  else:
    return sorted_data[mid]

### Calculate the variance of a given dataset.

In [41]:
def variance(data):
  m = mean(data)
  return np.sum((x - m) ** 2 for x in data) / len(data)


### Calculate the standard deviation of a given dataset.
**Example**
```python
data = [1, 2, 3, 4, 5]
standard_deviation(data)
```
>>>1.4142135623730951

In [42]:
def std_dev(data):
  if len(data) == 0:
    raise ValueError("Dataset is empty.")

  return math.sqrt(variance(data))

  Calculate the quartiles and interquartile range (IQR) of a given dataset.

  Parameters:
  - data (list): A list of numerical values representing the dataset.

  Returns:
  - q1 (float): The first quartile (25th percentile) of the dataset.
  - q2 (float): The second quartile (50th percentile or median) of the dataset.
  - q3 (float): The third quartile (75th percentile) of the dataset.
  - iqr (float): The interquartile range (IQR) of the dataset, calculated as q3 - q1.

In [43]:
def quartiles(data):
  sorted_data = sorted(data)
  n = len(sorted_data)
  mid = n // 2
  if n % 2 == 0:
    lower_half = sorted_data[:mid]
    upper_half = sorted_data[mid:]
  else:
    lower_half = sorted_data[:mid]
    upper_half = sorted_data[mid + 1:]

  q1 = median(lower_half)
  q2 = median(sorted_data)
  q3 = median(upper_half)
  iqr = q3 - q1

  return q1, q2, q3, iqr

Detects outliers and extreme outliers in a given dataset.

Parameters:
data (list): A list of numerical values representing the dataset.

Returns:
tuple: A tuple containing two lists - outliers and extreme outliers.
    - outliers: A list of values that are considered outliers.
    - extreme_outliers: A list of values that are considered extreme outliers.

In [44]:
def detect_outliers(data):
    q1, q2, q3, iqr = quartiles(data)
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = [x for x in data if x < lower_bound or x > upper_bound]
    extreme_outliers = [x for x in data if x < lower_bound - 3 * iqr or x > upper_bound + 3 * iqr]

    return outliers, extreme_outliers

  Calculate the skewness of a given dataset.

  Parameters:
  data (list or array-like): The dataset for which skewness needs to be calculated.

  Returns:
  float: The skewness value of the dataset.

In [53]:
def skewness(data):
  m = mean(data)
  mod = median(data)
  s = std_dev(data)

  return (m - mod) / s


  Calculate the kurtosis of a given dataset.

  Parameters:
  data (list or array-like): The dataset for which kurtosis needs to be calculated.

  Returns:
  float: The kurtosis value of the dataset.

  Formula:
  The kurtosis is calculated using the fourth moment about the mean divided by the square of the variance.
  The formula used is:
  kurtosis = (sum((x - m) ** 4 for x in data) * n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3) * s ** 4) - 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))

  where:
  - m: Mean of the dataset
  - s: Standard deviation of the dataset
  - n: Number of elements in the dataset
  - x: Each element in the dataset

In [46]:
def kurtosis(data):
  m = mean(data)
  s = std_dev(data)
  n = len(data)

  return sum((x - m) ** 4 for x in data) * n * (n + 1) / ((n - 1) * (n - 2) * (n - 3) * s ** 4) - 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))

  Suggests transformations based on the skewness and kurtosis of a distribution.

  Parameters:
  skew (float): The skewness of the distribution.
  kurt (float): The kurtosis of the distribution.

  Returns:
  list: A list of suggested transformations based on the skewness and kurtosis.

In [47]:
def suggest_transformations(skew, kurt):
  transformations = []
  if abs(skew) > 1:
    transformations.append("Logarithmic Transformation")
  if abs(kurt) > 3:
    transformations.append("Square Root Transformation")
  if abs(skew) > 1 and abs(kurt) > 3:
    transformations.append("Box-Cox Transformation")

  return transformations

In [54]:
# Main function
def analyze_data_distribution(file_name):
    file_path = file_name
    data = np.loadtxt(file_path, delimiter=',')

    # Basic Statistical Measures
    mean_value = mean(data)
    median_value = median(data)
    variance_value = variance(data)
    std_dev_value = std_dev(data)

    # Quartile Values and IQR
    q1, q2, q3, iqr = quartiles(data)

    # Anomaly Detection
    outliers, extreme_outliers = detect_outliers(data)

    # Skewness and Kurtosis
    skew = skewness(data)
    kurt = kurtosis(data)

    # Suggested Data Transformations
    transformations = suggest_transformations(skew, kurt)

    # Print results
    print("Basic Statistical Measures:")
    print(f"Mean: {mean_value}")
    print(f"Median: {median_value}")
    print(f"Variance: {variance_value}")
    print(f"Standard Deviation: {std_dev_value}")

    print("\nQuartiles:")
    print(f"Q1: {q1}")
    print(f"Q2: {q2}")
    print(f"Q3: {q3}")
    print(f"IQR: {iqr}")

    print("\nAnomaly Detection:")
    print(f"Outliers: {outliers}")
    print(f"Extreme Outliers: {extreme_outliers}")

    print("\nSkewness and Kurtosis:")
    print(f"Skewness: {skew}")
    print(f"Kurtosis: {kurt}")


    print("\nSuggested Data Transformations:")
    if  transformations:
        print("\n".join( suggest_transformations))
    else:
        print("No suggested data transformations.")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <file_name>")
        sys.exit(1)

    file_name = "Data-1.txt"

    analyze_data_distribution(file_name)



  return np.sum((x - m) ** 2 for x in data) / len(data)


Basic Statistical Measures:
Mean: 100.03485078
Median: 99.8985
Variance: 399.61406751647326
Standard Deviation: 19.9903493595403

Quartiles:
Q1: 86.5805
Q2: 99.8985
Q3: 113.47
IQR: 26.889499999999998

Anomaly Detection:
Outliers: [159.76, 45.209, 158.44, 161.59, 44.641, 41.256, 158.91, 39.226, 45.647, 164.69, 40.761, 42.829, 44.124, 154.76, 154.53, 158.6, 159.01, 45.819, 29.215, 157.44, 169.93, 40.264, 177.99, 45.487, 159.23, 161.78, 31.487, 159.61, 167.08, 38.267, 39.343, 45.112, 155.16, 43.394, 154.23, 159.86, 157.12, 45.758, 156.77, 156.47, 155.36, 158.0, 175.36, 45.275, 156.4, 164.83, 45.794, 158.12, 158.24, 35.282, 42.358, 153.92, 156.08, 157.61, 44.77, 44.612, 159.1, 42.008, 37.906, 40.812, 42.246, 45.651, 175.66, 41.651, 41.5, 160.11, 157.95, 42.624, 157.37, 155.33, 36.921, 37.011, 45.449, 171.53, 161.76, 161.13, 160.17, 44.078, 166.76, 45.42, 45.283, 42.047, 159.99, 41.782, 42.748, 172.62, 157.86, 165.6, 160.43, 154.93, 42.593, 155.27, 163.01, 45.56, 41.949, 42.229, 41.803, 154