## **Descriptive Statistics**

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

***1. Measures of Central Tendency***

***i) Mean***

In [2]:
# Two score lists that are identical except for the final entry.
data1 = [80, 85, 90, 95, 100]  # baseline, no extreme value
data2 = [80, 85, 90, 95, 200]  # last score replaced by an outlier (200)

# The mean uses every value, so a single extreme score is enough to shift it.
mean1, mean2 = np.mean(data1), np.mean(data2)

print("Mean without outlier:", mean1)
print("Mean with outlier:", mean2)


Mean without outlier: 90.0
Mean with outlier: 110.0


***ii) Median***

In [3]:
# Medians of the two lists from the mean example: data1 has no outlier,
# data2 ends with the outlier 200.
median1, median2 = np.median(data1), np.median(data2)

print("Median without outlier:", median1)
print("Median with outlier:", median2)

Median without outlier: 90.0
Median with outlier: 90.0


***iii) Mode***

In [7]:
# Numerical dataset (7 appears three times, every other value once)
data = [4, 7, 8, 7, 6, 7, 9, 10]

# pandas returns the mode as a Series, so tied values would all be kept
numeric_series = pd.Series(data)
mode_value = numeric_series.mode()

# Show the result as a plain list
print("Mode:", mode_value.tolist())

Mode: [7]


In [8]:
# Categorical dataset: the mode also works for non-numeric labels
fruits = [
    "Apple", "Banana", "Apple", "Orange",
    "Apple", "Orange", "Banana", "Apple",
]

# Most frequent label(s), returned by pandas as a Series
fruit_series = pd.Series(fruits)
mode_fruit = fruit_series.mode()

# Show the result as a plain list
print("Most Frequent Fruit (Mode):", mode_fruit.tolist())

Most Frequent Fruit (Mode): ['Apple']


***iv) Weighted Mean***

In [9]:
# Football player market value example: three model predictions are blended,
# each contributing in proportion to its weight.
values = np.array([50, 55, 60])      # one prediction per model
weights = np.array([0.2, 0.3, 0.5])  # importance assigned to each model

# np.average applies the weights element-wise to the predictions
weighted_mean = np.average(values, weights=weights)

print(
    "Football Player Predicted Market Value (Weighted Mean):",
    round(weighted_mean, 2),
    "M",
)


Football Player Predicted Market Value (Weighted Mean): 56.5 M


In [10]:
# Student's grade example: each assessment counts with its own weightage.
scores = np.array([85, 78, 92])      # Assignments, Midterm, Final
weights = np.array([0.3, 0.3, 0.4])  # weightage of each assessment

# Final grade is the weighted mean of the three scores
final_grade = np.average(scores, weights=weights)

print(
    "Student's Final Grade (Weighted Mean):",
    round(final_grade, 2),
)

Student's Final Grade (Weighted Mean): 85.7


***v) Trimmed Mean***

In [11]:
# Example dataset (Athlete Speeds); 50 and 100 sit far above the other values
data = [24, 25, 27, 28, 30, 50, 29, 26, 100, 31]

# Calculate 10% trimmed mean.
# proportiontocut is applied to EACH end of the sorted data, so 0.1 removes
# the lowest 10% and the highest 10% (one value per end for these 10 values)
# before the remaining values are averaged.
trimmed_mean = stats.trim_mean(data, proportiontocut=0.1)

print("Trimmed Mean:", round(trimmed_mean, 2))


Trimmed Mean: 30.75


In [12]:
# Example dataset (Employee Salaries in $1000s)
salaries = [30, 32, 35, 40, 100, 120, 150, 200, 250, 300]

# 20% trimmed mean: the lowest 20% and the highest 20% of the values are
# dropped before averaging what is left.
trimmed_mean_salary = stats.trim_mean(salaries, proportiontocut=0.2)

print(
    "Trimmed Mean Salary:",
    round(trimmed_mean_salary, 2),
)

Trimmed Mean Salary: 107.5


***2. Measures of Dispersion***

In [13]:
# Two datasets with the same centre; B's values are twice as far apart as A's
dataset_A = np.array([-5, 0, 5])
dataset_B = np.array([-10, 0, 10])

# Mean of each dataset
mean_A, mean_B = dataset_A.mean(), dataset_B.mean()

# Range = largest value minus smallest value
range_A = dataset_A.max() - dataset_A.min()
range_B = dataset_B.max() - dataset_B.min()

# Population variance and standard deviation (ddof=0 -> divide by N)
variance_A, std_A = dataset_A.var(ddof=0), dataset_A.std(ddof=0)
variance_B, std_B = dataset_B.var(ddof=0), dataset_B.std(ddof=0)

# Report all four measures side by side for each dataset
print(f"Dataset A - Mean: {mean_A}, Range: {range_A}, Variance: {variance_A}, Std Dev: {std_A}")
print(f"Dataset B - Mean: {mean_B}, Range: {range_B}, Variance: {variance_B}, Std Dev: {std_B}")


Dataset A - Mean: 0.0, Range: 10, Variance: 16.666666666666668, Std Dev: 4.08248290463863
Dataset B - Mean: 0.0, Range: 20, Variance: 66.66666666666667, Std Dev: 8.16496580927726


***i) Range***

In [14]:
# Example datasets: identical except that the last value becomes an outlier
data_without_outlier = np.array([5, 10, 15, 20, 25])
data_with_outlier = np.array([5, 10, 15, 20, 100])

# np.ptp ("peak to peak") returns Max - Min for each array
range_without_outlier, range_with_outlier = map(
    np.ptp, (data_without_outlier, data_with_outlier)
)

print(f"Range without outlier: {range_without_outlier}")
print(f"Range with outlier: {range_with_outlier}")


Range without outlier: 20
Range with outlier: 95


***ii) Variance***

In [15]:
# Example dataset (the last value, 100, is far from the others)
data = np.array([5, 10, 15, 20, 100])

# ddof selects the divisor:
#   ddof=0 -> divide by N       (population variance)
#   ddof=1 -> divide by (n - 1) (sample variance)
pop_variance = data.var(ddof=0)
sample_variance = data.var(ddof=1)

print(f"Population Variance: {pop_variance}")
print(f"Sample Variance: {sample_variance}")


Population Variance: 1250.0
Sample Variance: 1562.5


***iii) Standard Deviation***

In [16]:
# Example dataset (the last value, 100, is far from the others)
data = np.array([5, 10, 15, 20, 100])

# ddof selects the divisor used before the square root is taken:
#   ddof=0 -> divide by N       (population standard deviation)
#   ddof=1 -> divide by (n - 1) (sample standard deviation)
pop_std_dev = data.std(ddof=0)
sample_std_dev = data.std(ddof=1)

print(f"Population Standard Deviation: {pop_std_dev}")
print(f"Sample Standard Deviation: {sample_std_dev}")


Population Standard Deviation: 35.35533905932738
Sample Standard Deviation: 39.528470752104745


***iv) Coefficient of Variation (CV)***

In [17]:
# Example datasets measured in different units
heights = np.array([160, 165, 170, 175, 180])  # Heights in cm
salaries = np.array([40000, 45000, 50000, 55000, 60000])  # Salaries in $

# Function to calculate Coefficient of Variation
def coefficient_of_variation(data, ddof=1):
    """Return the coefficient of variation of ``data`` as a percentage.

    The coefficient of variation is the standard deviation divided by the
    mean, which makes the spread of datasets with different units or scales
    comparable.

    Parameters
    ----------
    data : array-like
        Numeric observations.
    ddof : int, default 1
        Delta degrees of freedom passed to ``np.std``: 1 gives the sample
        standard deviation (divide by n - 1), 0 the population standard
        deviation (divide by N).

    Returns
    -------
    float
        ``std / mean * 100``.

    Raises
    ------
    ValueError
        If the mean of ``data`` is 0, because the ratio is undefined there.
    """
    mean = np.mean(data)
    if mean == 0:
        # Dividing by a zero mean would only yield inf/nan plus a warning.
        raise ValueError("Coefficient of variation is undefined when the mean is 0")
    std_dev = np.std(data, ddof=ddof)
    return (std_dev / mean) * 100  # Convert to percentage

# Calculate CV for each dataset
cv_heights = coefficient_of_variation(heights)
cv_salaries = coefficient_of_variation(salaries)

print(f"Coefficient of Variation for Heights: {cv_heights:.2f}%")
print(f"Coefficient of Variation for Salaries: {cv_salaries:.2f}%")

Coefficient of Variation for Heights: 4.65%
Coefficient of Variation for Salaries: 15.81%


***v) Interquartile Range (IQR)***

In [18]:
# Sample dataset
data = np.array([5, 7, 9, 10, 15, 21, 25, 30, 40])

# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = np.percentile(data, 25)
Q3 = np.percentile(data, 75)

# Calculate IQR: the width of the middle 50% of the data
IQR = Q3 - Q1

# Find outlier thresholds using the conventional 1.5 x IQR rule:
# anything further than 1.5 IQRs outside the quartiles is flagged
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find outliers: values strictly below the lower or above the upper bound
outliers = data[(data < lower_bound) | (data > upper_bound)]

print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
print(f"Outliers: {outliers if outliers.size > 0 else 'No Outliers'}")

Q1: 9.0, Q3: 25.0, IQR: 16.0
Lower Bound: -15.0, Upper Bound: 49.0
Outliers: No Outliers
