# IE6400 Foundations for Data Analytics Engineering
# Fall 2023
### Module 2: Data and Sampling Distributions

#### Exercise 1 Random sample without replacement

In [None]:
import random

In [None]:
aList = [20, 40, 80, 100, 120]

In [None]:
sampled_list = random.sample(aList, 3)

In [None]:
print(sampled_list)

In [None]:
exampleList = [20, 40, 20, 20, 40, 60, 70]

In [None]:
sampled_list2 = random.sample(exampleList, 4) # choosing 4 random items from a list

In [None]:
print(sampled_list2)

#### Exercise 2 Random sample with replacement

In [None]:
names = ["Roger", "Nadal", "Novac", "Andre", "Sarena", "Mariya", "Martina"]

In [None]:
sample_list3 = random.choices(names, k=3)

In [None]:
print(sample_list3)

#### Exercise 3 Generate the sampled list of random integers

In [None]:
num_list = random.sample(range(100), 5) 

In [None]:
print(num_list)

In [None]:
random.shuffle(num_list)

In [None]:
print(num_list)

#### Exercise 4 Random sample from the Python set

In [None]:
aSet = {"Jhon", "kelly", "Scoot", "Emma", "Eric"}

In [None]:
aList = list(aSet)  

In [None]:
sampled_elements = random.sample(aList, 3)

In [None]:
print(sampled_elements)

#### Exercise 5 Random Sample from Python dictionary

In [None]:
marks_dict = {
    "Kelly": 55,
    "jhon": 70,
    "Donald": 60,
    "Lennin": 50
}

In [None]:
# Sample two distinct (name, mark) pairs. random.choices draws with
# replacement, so it could return the same pair twice; dict() would then
# collapse it to a single key and the positional lookups further down
# would raise IndexError.
sampled_items = random.sample(list(marks_dict.items()), k=2)

In [None]:
sampled_dict = dict(sampled_items)

In [None]:
print(sampled_dict)

In [None]:
print(sampled_dict[list(sampled_dict.keys())[0]], sampled_dict[list(sampled_dict.keys())[1]])

In [None]:
print(sampled_dict[list(sampled_dict.keys())[1]], sampled_dict[list(sampled_dict.keys())[0]])

#### Exercise 6 Random seed to get the same sample list every time

In [None]:
alist = [20.5, 40.5, 30.5, 50.5, 70.5]

In [None]:
# Re-seeding immediately before sampling makes the draw reproducible:
# every run of this cell prints the same three items.
for i in range(1):
    # use 4 as a seed value
    random.seed(4)
    # get a sample list of three items
    sample_list = random.sample(alist, 3)
    print(sample_list)

In [None]:
# Same experiment with another seed: the output is again repeatable across
# runs, but it may differ from the sample obtained with seed 4.
for i in range(1):
    # use 3 as a seed value
    random.seed(3)
    # get a sample list of three items
    sample_list = random.sample(alist, 3)
    print(sample_list)

#### Exercise 7 Get a sample array from a multidimensional array

In [None]:
!pip install numpy

In [None]:
import numpy as np

In [None]:
array = np.array([[2, 4, 6], [5, 10, 15], [6, 12, 18], [7, 14, 21], [8, 16, 24]])

In [None]:
# Draw two row positions (with replacement) from however many rows the array
# actually has, instead of hard-coding the row count.
randomRows = np.random.randint(array.shape[0], size=2)

In [None]:
for i in randomRows:
    print(array[i, :])

#### Exercise 8 Biased Sampling Using Random module

In [None]:
items = ["A", "B", "C", "D"]

In [None]:
weights = [0.2, 0.3, 0.4, 0.1]

In [None]:
sample_size = 3

In [None]:
sampled_items = random.choices(items, weights=weights, k=sample_size)

In [None]:
print("Sampled items:", sampled_items)

#### Exercise 9 Biased Sampling using NumPy module

In [None]:
def custom_prob_distribution(x):
    """Return the unnormalised sampling weight for ``x``.

    Even values are given weight 0.6 and odd values weight 0.4.
    """
    return 0.6 if x % 2 == 0 else 0.4

In [None]:
items = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
probabilities = np.array([custom_prob_distribution(x) for x in items])

In [None]:
print(probabilities)

In [None]:
probabilities /= probabilities.sum()

In [None]:
sample_size = 5

In [None]:
sampled_indices = np.random.choice(len(items), size=sample_size, replace=False, p=probabilities)

In [None]:
sampled_items = items[sampled_indices]

In [None]:
print("Sampled items:", sampled_items)

#### Exercise 10 Biased Sampling using Roulette Wheel Selection

In [None]:
individuals = [("A", 0.2), ("B", 0.3), ("C", 0.4), ("D", 0.1)]

In [None]:
from itertools import accumulate

# Running total of the fitness values, computed in a single pass rather than
# re-summing a growing slice of the list for every position.
cumulative_fitness = list(accumulate(fitness for _, fitness in individuals))

In [None]:
print(cumulative_fitness)

In [None]:
sample_size = 3

In [None]:
sampled_individuals = []

In [None]:
# Roulette wheel selection: each spin picks the first individual whose
# cumulative fitness reaches the spin value.
total_fitness = cumulative_fitness[-1]

for _ in range(sample_size):
    # Spin within [0, total_fitness] rather than [0, 1) so that a slot is
    # always found, even when the fitness values do not sum to exactly 1.
    rand_value = random.uniform(0, total_fitness)

    # Index of the first individual whose cumulative fitness is >= rand_value
    selected_index = next(
        i for i, fitness in enumerate(cumulative_fitness) if rand_value <= fitness
    )

    # Record the name of the selected individual
    sampled_individuals.append(individuals[selected_index][0])

In [None]:
print("Sampled individuals:", sampled_individuals)

#### Exercise 11 Stratified sampling

In [None]:
!pip install pandas

In [None]:
import pandas as pd

In [None]:
# Ten students with their IDs, letter grades, and an integer category.
students = {
    'Name': [
        'Lisa', 'Kate', 'Ben', 'Kim', 'Josh',
        'Alex', 'Evan', 'Greg', 'Sam', 'Ella',
    ],
    'ID': [
        '001', '002', '003', '004', '005',
        '006', '007', '008', '009', '010',
    ],
    'Grade': ['A', 'A', 'C', 'B', 'B', 'B', 'C', 'A', 'A', 'A'],
    'Category': [2, 3, 1, 3, 2, 3, 3, 1, 2, 1],
}

In [None]:
df = pd.DataFrame(students)

In [None]:
df

In [None]:
# Stratified sample: draw 2 students from every grade group. GroupBy.sample
# does this directly, without routing each group through apply().
df.groupby('Grade').sample(n=2)

In [None]:
# Proportional stratified sample: draw 60% of the students in every grade
# group. GroupBy.sample does this directly, without apply().
df.groupby('Grade').sample(frac=0.6)

#### Exercise 12 Systematic Sampling

In [None]:
number_of_students = 15

In [None]:
data = {'Id': np.arange(1, number_of_students+1).tolist(),
        'height': [159, 171, 158, 162, 162, 177, 160, 175,
                   168, 171, 178, 178, 173, 177, 164]}

In [None]:
df = pd.DataFrame(data)

In [None]:
display(df)

In [None]:
def systematic_sampling(df, step, start=0):
    """Return every ``step``-th row of ``df``, beginning at position ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from.
    step : int
        Sampling interval; must be at least 1.
    start : int, optional
        Zero-based position of the first sampled row (default 0, which
        reproduces the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The selected rows, in their original order. Empty when ``start`` is
        at or beyond the end of ``df``.

    Raises
    ------
    ValueError
        If ``step`` is less than 1 or ``start`` is negative.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    if start < 0:
        raise ValueError("start must be non-negative")

    # Positional indexes start, start + step, start + 2*step, ...
    indexes = np.arange(start, len(df), step=step)
    return df.iloc[indexes]

In [None]:
systematic_sample = systematic_sampling(df, 3)

In [None]:
systematic_sample

#### Exercise 13 Setting step and start for systematic sampling

In [None]:
sample_size = 5

In [None]:
step = int(round(len(df)/sample_size,0))

In [None]:
# iloc positions are zero-based, so a valid random start lies in
# [0, step - 1]. randint(1, step) could never pick the first row and, when it
# returned `step` itself, skipped a whole interval and gave a short sample.
start_index = random.randrange(step)

In [None]:
sampled_df = df.iloc[start_index::step]

In [None]:
sampled_df

#### Exercise 14 Cluster Sampling

In [None]:
df = pd.DataFrame({'tour': np.repeat(np.arange(1,11), 20),
                   'experience': np.random.normal(loc=7, scale=1, size=200)})

In [None]:
df.head(5)

In [None]:
clusters = np.random.choice(np.arange(1,11), size=4, replace=False)

In [None]:
cluster_sample = df[df['tour'].isin(clusters)]

In [None]:
cluster_sample.head(6)

In [None]:
cluster_sample['tour'].value_counts()

#### Exercise 15 Convenience Sampling

In [None]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Age': [25, 30, 22, 35, 28, 19, 42, 29, 31, 40],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Male']
}

In [None]:
df = pd.DataFrame(data)

In [None]:
convenience_sample = df.head(5)

In [None]:
print(convenience_sample)

#### Exercise 16 Purposive Sampling

In [None]:
purposive_sample = df[df['Age'] > 30]

In [None]:
print(purposive_sample)

#### Exercise 17 Snowball Sampling

In [None]:
initial_participant = ['Alice']

In [None]:
while len(initial_participant) < 5:  # Select 5 participants
    # Only people not already recruited are eligible; otherwise the same name
    # could be added twice and the final sample would contain fewer than
    # five distinct rows.
    candidates = df[~df['Name'].isin(initial_participant)]
    if candidates.empty:
        break  # nobody left to recruit

    # Randomly select a new participant from the remaining candidates
    selected_participant = candidates.sample(1)['Name'].values[0]

    # Add the selected participant to the list
    initial_participant.append(selected_participant)

In [None]:
snowball_sample = df[df['Name'].isin(initial_participant)]

In [None]:
print(snowball_sample)

#### Exercise 18 Quota Sampling

In [None]:
data = {
    'Gender': ['Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male'],
    'Age': [25, 30, 22, 35, 28, 19, 42, 29, 31, 40],
    'Income': [50000, 60000, 45000, 75000, 55000, 40000, 90000, 62000, 71000, 80000]
}

In [None]:
df = pd.DataFrame(data)

In [None]:
gender_quota = {'Male': 2, 'Female': 2}

In [None]:
age_groups = {'Age 18-24': (18, 24), 'Age 25-34': (25, 34), 'Age 35-44': (35, 44)}

In [None]:
quota_sample = pd.DataFrame(columns=df.columns)

In [None]:
# Collect the picks for every (gender, age group) stratum and combine them
# once at the end. Any rows quota_sample already holds are kept, so the cell
# still adds to the existing sample as before.
selected_frames = [] if quota_sample.empty else [quota_sample]

for gender, gender_quota_count in gender_quota.items():
    for age_group, (start, end) in age_groups.items():
        # Select participants that meet the gender and age criteria
        eligible_participants = df[(df['Gender'] == gender) & (df['Age'].between(start, end))]

        # Only strata with enough eligible participants contribute to the sample
        if len(eligible_participants) >= gender_quota_count:
            selected_frames.append(
                eligible_participants.sample(gender_quota_count, random_state=42)
            )

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
if selected_frames:
    quota_sample = pd.concat(selected_frames)

In [None]:
print(quota_sample)

In [None]:
import math

In [None]:
import statistics

In [None]:
!pip install scipy

In [None]:
import scipy.stats

#### Exercise 19 Calculating Mean

In [None]:
d = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack',
   'Lee','David','Gasper','Betina','Andres']),
   'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])
}

df = pd.DataFrame(d)
print(df)

In [None]:
age_sum = df['Age'].sum()
age_sum

In [None]:
age_mean = age_sum/len(df['Age'])
age_mean

In [None]:
age_mean = df['Age'].mean()
age_mean

#### Exercise 20 Harmonic mean

In [None]:
harmonic_mean = scipy.stats.hmean(df['Age'])
harmonic_mean

In [None]:
n = len(df['Age'])
sum_reciprocals = sum(1 / x for x in df['Age'])
harmonic_mean = n / sum_reciprocals

In [None]:
harmonic_mean

#### Exercise 21 Trimmed mean

In [None]:
trim_percent = 0.1

In [None]:
n = len(df['Age'])
k = int(n * trim_percent)

In [None]:
df.sort_values(by=['Age'])

In [None]:
# Drop the k smallest and k largest ages before averaging. The values must be
# sorted here: df.sort_values() returns a new frame and leaves df unsorted.
trimmed_mean = np.mean(df['Age'].sort_values().iloc[k:n - k])

In [None]:
trimmed_mean

#### Exercise 22 Median

In [None]:
age_median = df['Age'].median()
age_median

#### Exercise 23 Mode

In [None]:
age_mode = df['Age'].mode()
age_mode

#### Exercise 24 Variance

In [None]:
age_var = df['Age'].var(ddof=1)
age_var

#### Exercise 25 Standard Deviation

In [None]:
age_std = df['Age'].std()
age_std

#### Exercise 26 Range

In [None]:
age_range = max(df['Age']) - min(df['Age'])
age_range

#### Exercise 27 IQR

In [None]:
q1 = np.percentile(df['Age'], 25)

In [None]:
q3 = np.percentile(df['Age'], 75)

In [None]:
iqr = q3 - q1
iqr

#### Exercise 28 Summarizing Data

In [None]:
df.describe()

In [None]:
df.describe(include=['object'])

In [None]:
df.describe(include='all')

#### Exercise 29 Stem and Leaf Plot

In [None]:
!pip install stemgraphic

In [None]:
import stemgraphic

In [None]:
data = [16, 25, 47, 56, 23, 45, 19, 55, 44, 27]

In [None]:
stemgraphic.stem_graphic(data, scale = 10)

In [None]:
!pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

In [None]:
data = [16, 25, 47, 56, 23, 45, 19, 55, 44, 27]

In [None]:
# Derive each stem (the tens digit) from its own data point so the two lists
# line up element by element; the data list is not sorted, so a hand-written
# ascending list of stems pairs values with the wrong stem.
stems = [value // 10 for value in data]

In [None]:
plt.ylabel('Data') # for label at y-axis

plt.xlabel('stems') # for label at x-axis

plt.xlim(0, 10) # limit of the values at x axis

plt.stem(stems, data) # required plot


#### Exercise 30 Histogram

In [None]:
d = {'Name':pd.Series(['Tom','James','Tom','Vin','James','Smith','Betina',
   'Lee','David','Tom','Betina','Andres']),
   'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
}

df = pd.DataFrame(d)
print(df)

In [None]:
df['Age'].plot(kind='hist')

#### Exercise 31 Dot plot

In [None]:
plt.plot(df['Name'], df['Age'], linestyle='None', marker='o')
plt.show()

#### Exercise 32 Box Plot

In [None]:
np.random.seed(10)
data = np.random.normal(100, 20, 200)

In [None]:
fig = plt.figure(figsize =(10, 7))
plt.boxplot(data)
plt.show()

#### Exercise 33 Parallel Box Plot

In [None]:
np.random.seed(10)

data_1 = np.random.normal(100, 10, 200)
data_2 = np.random.normal(90, 20, 200)
data_3 = np.random.normal(80, 30, 200)
data_4 = np.random.normal(70, 40, 200)
data = [data_1, data_2, data_3, data_4]

In [None]:
fig = plt.figure(figsize =(10, 7))

# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
bp = ax.boxplot(data)

# show plot
plt.show()


#### Exercise 34 Customized Box Plot

In [None]:
np.random.seed(10)
data_1 = np.random.normal(100, 10, 200)
data_2 = np.random.normal(90, 20, 200)
data_3 = np.random.normal(80, 30, 200)
data_4 = np.random.normal(70, 40, 200)
data = [data_1, data_2, data_3, data_4]

In [None]:
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)

# Draw horizontal, notched boxes; patch_artist=True makes the boxes fillable.
# notch and vert are boolean options, so real booleans are passed.
bp = ax.boxplot(data, patch_artist=True, notch=True, vert=False)

colors = ['#0000FF', '#00FF00', '#FFFF00', '#FF00FF']

# Fill each box with its own colour
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

# changing color, linewidth and line style of whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#8B008B', linewidth=1.5, linestyle=":")

# changing color and linewidth of caps
for cap in bp['caps']:
    cap.set(color='#8B008B', linewidth=2)

# changing color and linewidth of medians
for median in bp['medians']:
    median.set(color='red', linewidth=3)

# changing style of fliers
for flier in bp['fliers']:
    flier.set(marker='D', color='#e7298a', alpha=0.5)

# The boxes are horizontal, so the dataset names label the y-axis
ax.set_yticklabels(['data_1', 'data_2', 'data_3', 'data_4'])

# Adding title
plt.title("Customized box plot")

# Keep ticks on the bottom and left axes only
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

# show plot
plt.show()


---

Revised Date: September 30, 2023