# IE6400 Foundations for Data Analytics Engineering
# Fall 2023
### Module 2: Probability Distribution
#### - STUDENT VERSION -

#### Exercise 1 Probability Distribution of the Sum of Two Fair Six-Sided Dice Rolls

In [None]:
import numpy as np

# Faces of a single fair six-sided die
faces = np.arange(1, 7)

# Sums of all 36 equally likely (first die, second die) outcomes
outcome_sums = np.add.outer(faces, faces).ravel()

# Define the sample space: every value the sum of two dice can take
sample_space = np.arange(2, 13)

# Initialize a dictionary to store probabilities
probabilities = {}

# Calculate probabilities for each sum as
#   (number of dice outcomes producing that sum) / (total number of outcomes).
# The count has to be taken over the 36 dice outcomes; counting inside
# sample_space itself would always give 1, because each sum is listed once.
for sum_value in sample_space:
    count = np.sum(outcome_sums == sum_value)
    # Store plain Python int/float so the dictionary displays cleanly
    probabilities[int(sum_value)] = float(count / outcome_sums.size)

probabilities

In [None]:
import matplotlib.pyplot as plt

# Split the distribution into bar positions (sums) and bar heights (probabilities)
sums = list(probabilities)
probs = [probabilities[key] for key in sums]

# Draw one green bar per possible sum, labelling each tick with its sum
plt.bar(x=sums, height=probs, color='green', tick_label=sums)
plt.title('Probability Distribution of the Sum of Two Dice Rolls')
plt.xlabel('Sum of Two Dice Rolls')
plt.ylabel('Probability')
plt.show()


#### Exercise 2 Generating and Analyzing a Binomial Distribution with SciPy

In [None]:
import numpy as np
from scipy.stats import binom
import matplotlib.pyplot as plt


In [None]:
# Binomial parameters: number of trials and per-trial probability of success
n, p = 10, 0.3

In [None]:
# Freeze a binomial distribution with the parameters chosen above
binomial_dist = binom(n=n, p=p)

In [None]:
# Every possible number of successes: 0, 1, ..., n
x_values = np.arange(n + 1)

In [None]:
# Evaluate the probability mass function at every success count in one call
probabilities = binomial_dist.pmf(k=x_values)

In [None]:
# Bar chart of P(X = k) against the number of successes k
plt.bar(x=x_values, height=probabilities)
plt.title('Binomial Distribution')
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.show()


#### Exercise 3 Calculating and Visualizing Binomial Probability Mass Function in Python

In [None]:
from scipy.stats import binom
import matplotlib.pyplot as plt

In [None]:
# Binomial parameters: number of trials and per-trial probability of success
n, p = 10, 0.3

# Possible numbers of successes: 0 through n inclusive
k_range = range(n + 1)

In [None]:
# Freeze a binomial distribution with the parameters chosen above
binomial_dist = binom(n=n, p=p)

In [None]:
# P(X = k) for each k in k_range, evaluated one value at a time
probabilities = list(map(binomial_dist.pmf, k_range))

In [None]:
# Bar chart of the PMF over all possible success counts
plt.bar(x=k_range, height=probabilities)
plt.title('Binomial Probability Mass Function (PMF)')
plt.xlabel('Number of Successes (k)')
plt.ylabel('Probability')
plt.show()


#### Exercise 4 Understanding the Negative Binomial Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Target number of successes and per-trial probability of a success
n, p = 5, 0.5
# How many random variates to draw
size = 1000

# Each variate is the number of failures seen before the n-th success
samples = np.random.negative_binomial(n=n, p=p, size=size)

In [None]:
# The samples are integer counts, so draw one unit-width bar centred on each
# integer (discrete=True). A fixed number of equal-width bins would not line
# up with the integers and would leave artificial gaps in the histogram.
sns.histplot(samples, discrete=True, kde=True)
plt.title('Negative Binomial Distribution')
plt.xlabel('Number of Failures before 5 Successes')
plt.ylabel('Frequency')
plt.show()

#### Exercise 5 Applying the Negative Binomial Distribution to a Dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import nbinom

In [None]:
# Ground-truth parameters used to synthesise the dataset
n_actual, p_actual = 7, 0.4  # number of successes, probability of a success
size = 5000  # number of samples

# Synthetic dataset of negative-binomial counts
dataset = np.random.negative_binomial(n=n_actual, p=p_actual, size=size)

In [None]:
# Sample moments of the dataset (population variance, i.e. ddof=0)
mean = dataset.mean()
variance = dataset.var()

# Method of moments: for a negative binomial counting failures,
#   mean = n(1 - p) / p   and   variance = n(1 - p) / p**2,
# so the ratio mean / variance recovers p ...
p_estimated = mean / variance

# ... and solving the mean equation for n gives n = mean * p / (1 - p)
n_estimated = (mean * p_estimated) / (1 - p_estimated)

In [None]:
# Plotting the actual dataset histogram.
# discrete=True gives one unit-width bin per integer count. That matters here:
# the overlay below is (probability of a single integer) * (number of samples),
# which is only on the same scale as the histogram when each bin covers
# exactly one integer value.
sns.histplot(dataset, discrete=True, kde=False, label='Actual Data', color='blue', alpha=0.5)

# Plotting the estimated PMF, scaled to expected counts.
# The scale factor is taken from the dataset itself rather than from the
# global `size`, which other cells of this notebook reassign.
x = np.arange(0, dataset.max() + 1)
expected_counts = nbinom.pmf(x, n_estimated, p_estimated) * dataset.size
plt.plot(x, expected_counts, 'o-', label='Estimated PMF', color='red')

plt.title('Actual vs. Estimated Negative Binomial Distribution')
plt.xlabel('Number of Failures before Successes')
plt.ylabel('Frequency')
plt.legend()
plt.show()


#### Exercise 6 Understanding the Poisson Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Rate parameter (expected events per interval) and number of draws
lambda_val, size = 5, 1000

# Draw Poisson-distributed event counts
samples = np.random.poisson(lam=lambda_val, size=size)


In [None]:
# Event counts are integers, so draw one unit-width bar per integer
# (discrete=True). A fixed number of equal-width bins would not line up with
# the integers and would leave artificial gaps in the histogram.
sns.histplot(samples, discrete=True, kde=True)
plt.title('Poisson Distribution')
plt.xlabel('Number of Events')
plt.ylabel('Frequency')
plt.show()


#### Exercise 7 Understanding the Hypergeometric Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Population composition: successes and failures
NGood, NBad = 10, 20
# Items drawn (without replacement) per experiment, and number of experiments
nsample, size = 5, 1000

# Each variate is the number of successes among the nsample drawn items
samples = np.random.hypergeometric(ngood=NGood, nbad=NBad, nsample=nsample, size=size)


In [None]:
# The samples take only a handful of integer values (0..nsample), so draw one
# unit-width bar per integer (discrete=True) instead of spreading them over
# 30 narrow bins, most of which would be empty.
sns.histplot(samples, discrete=True, kde=True)
plt.title('Hypergeometric Distribution')
plt.xlabel('Number of Successes in Sample')
plt.ylabel('Frequency')
plt.show()


#### Exercise 8 Understanding the Multivariate Hypergeometric Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import hypergeom


In [None]:
colors = [13, 13, 13, 13]  # 13 cards of each suit in a deck: Hearts, Diamonds, Clubs, Spades
nsample = 10  # number of draws
size = 1000  # number of samples

M = sum(colors)  # total number of cards
N = nsample  # number of draws

# Generate the suit counts jointly: each row is one hand of `nsample` cards
# drawn without replacement, with one column per suit, so every row sums to
# `nsample`. Sampling each suit separately from its own univariate
# hypergeometric distribution would reproduce the per-suit marginals but not
# this dependence between suits.
rng = np.random.default_rng()
samples = rng.multivariate_hypergeometric(colors, nsample, size=size)

In [None]:
# One overlaid histogram per suit; column i of `samples` belongs to suits[i]
suits = ['Hearts', 'Diamonds', 'Clubs', 'Spades']

for suit, suit_counts in zip(suits, samples.T):
    # Unit-width bins centred on the integers 0..nsample
    sns.histplot(suit_counts, bins=np.arange(-0.5, nsample + 1.5), kde=False, label=suit)

plt.title('Multivariate Hypergeometric Distribution')
plt.xlabel('Number of Cards Drawn')
plt.ylabel('Frequency')
plt.legend()
plt.show()


#### Exercise 9 Understanding the Uniform Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Bounds of the processing-time interval, in hours
low, high = 2, 10
# Number of samples
size = 1000

# Draw processing times uniformly from [low, high)
samples = np.random.uniform(low=low, high=high, size=size)


In [None]:
# Histogram of the sampled processing times with a KDE overlay
sns.histplot(data=samples, kde=True, bins=30)
plt.xlabel('Processing Time (hours)')
plt.ylabel('Frequency')
plt.title('Uniform Distribution of Processing Times')
plt.show()


#### Exercise 10 Understanding the Normal Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Average score and standard deviation of the score distribution
mean, std = 50, 10
# Number of samples
size = 1000

# Draw normally distributed examination scores
samples = np.random.normal(loc=mean, scale=std, size=size)


In [None]:
# Histogram of the sampled scores with a KDE overlay
sns.histplot(data=samples, kde=True, bins=30)
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.title('Normal Distribution of Examination Scores')
plt.show()


#### Exercise 11 Understanding the Log-Normal Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mean and standard deviation of the logarithm of the variable
mean, sigma = 0, 0.5
# Number of samples
size = 1000

# Draw log-normally distributed values
samples = np.random.lognormal(mean=mean, sigma=sigma, size=size)


In [None]:
# Histogram of the sampled values with a KDE overlay
sns.histplot(data=samples, kde=True, bins=50)
plt.xlabel('Income')
plt.ylabel('Frequency')
plt.title('Log-Normal Distribution of Incomes')
plt.show()


#### Exercise 12 Understanding the Gamma Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Gamma parameters: shape (number of events) and scale (average interval between events)
shape, scale = 2, 1
# Number of samples
size = 1000

# Draw gamma-distributed waiting times
samples = np.random.gamma(shape=shape, scale=scale, size=size)


In [None]:
# Histogram of the sampled waiting times with a KDE overlay
sns.histplot(data=samples, kde=True, bins=50)
plt.xlabel('Waiting Time (hours)')
plt.ylabel('Frequency')
plt.title('Gamma Distribution of Waiting Times')
plt.show()


#### Exercise 13 Understanding the Exponential Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Average time (in hours) between breakdowns, and number of samples
scale, size = 5, 1000

# Draw exponentially distributed times between breakdowns
samples = np.random.exponential(scale=scale, size=size)


In [None]:
# Histogram of the sampled times with a KDE overlay
sns.histplot(data=samples, kde=True, bins=50)
plt.xlabel('Time (hours)')
plt.ylabel('Frequency')
plt.title('Exponential Distribution of Time Between Breakdowns')
plt.show()


-----

#### Revised Date: October 28, 2023