# IE6400 Foundations for Data Analytics Engineering
# Fall 2023
### Module 2: Nonparametric Methods
#### - STUDENT VERSION -

#### Exercise 1 Ranking data

In [1]:
import pandas as pd

In [2]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Score': [85, 72, 92, 72]}

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Score
0,Alice,85
1,Bob,72
2,Charlie,92
3,David,72


In [4]:
df['Rank'] = df['Score'].rank(method='average', ascending=False)

print(df)

      Name  Score  Rank
0    Alice     85   2.0
1      Bob     72   3.5
2  Charlie     92   1.0
3    David     72   3.5


#### Exercise 2  Ranking Using the scipy.stats.rankdata() Function:

In [5]:
import numpy as np
from scipy.stats import rankdata

In [6]:
scores = np.array([85, 72, 92, 72])
ranks = rankdata(-scores, method='average')

In [7]:
print('Data:', scores)
print('Ranks:', ranks)

Data: [85 72 92 72]
Ranks: [2.  3.5 1.  3.5]


#### Exercise 3 Ranking Using the pandas.Series.rank() Method:

In [8]:
import pandas as pd

In [9]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Score': [85, 72, 92, 72]}

In [10]:
df = pd.DataFrame(data)
df['Rank'] = df['Score'].rank(ascending=False, method='average')

print(df)

      Name  Score  Rank
0    Alice     85   2.0
1      Bob     72   3.5
2  Charlie     92   1.0
3    David     72   3.5


#### Exercise 4 Ranking Using the argsort() Function:

In [11]:
import numpy as np

In [12]:
scores = np.array([85, 72, 92, 72])
ranks = np.argsort(-scores) + 1

print('Data:', scores)
print('Ranks:', ranks)

Data: [85 72 92 72]
Ranks: [3 1 2 4]


#### Exercise 5  

In [13]:
from numpy.random import rand
from numpy.random import seed
from scipy.stats import rankdata

In [14]:
# seed random number generator
seed(1)

In [15]:
# generate dataset
data = rand(1000)

In [16]:
# review first 10 samples
print(data[:10])

[4.17022005e-01 7.20324493e-01 1.14374817e-04 3.02332573e-01
 1.46755891e-01 9.23385948e-02 1.86260211e-01 3.45560727e-01
 3.96767474e-01 5.38816734e-01]


In [17]:
# rank data
ranked = rankdata(data)
# review first 10 ranked samples
print(ranked[:10])

[408. 721.   1. 300. 151.  93. 186. 342. 385. 535.]


#### Exercise 6 Kendall Tau Coefficient

In [18]:
import numpy as np
from scipy.stats import kendalltau
import matplotlib.pyplot as plt

In [19]:
# Sample data
rankings_A = np.array([1, 2, 3, 4, 5])
rankings_B = np.array([3, 2, 4, 1, 5])

In [20]:
tau, p_value = kendalltau(rankings_A, rankings_B)
print(f'Kendall Tau Coefficient: {tau:.2f}')
print(f'p-value: {p_value:.2f}')

Kendall Tau Coefficient: 0.20
p-value: 0.82


In [21]:
# Interpret the correlation
alpha = 0.05
if p_value < alpha:
    print("There is a statistically significant correlation.")
else:
    print("There is no statistically significant correlation.")

There is no statistically significant correlation.


#### Exercise 7 Case Study - Analyzing Exam Scores and Study Hours

In [22]:
import pandas as pd

In [23]:
df = pd.read_csv('example.csv')

In [24]:
df

Unnamed: 0,Hours_Study,Exam_Score
0,10,85
1,5,60
2,8,75
3,3,50
4,12,92
5,15,98
6,9,80
7,7,70
8,2,45
9,11,88


In [25]:
from scipy import stats

In [26]:
# Calculate Kendall's Tau and p-value
tau, p_value = stats.kendalltau(df['Hours_Study'], df['Exam_Score'])

# Set significance level
alpha = 0.05

In [27]:
# Compare p-value to alpha
if p_value < alpha:
    result = "Reject the null hypothesis. There is a significant correlation."
else:
    result = "Fail to reject the null hypothesis. There is no significant correlation."

In [28]:
print(f"Kendall's Tau (τ) = {tau:.2f}")
print(f"P-value = {p_value:.4f}")
print(result)

Kendall's Tau (τ) = 0.97
P-value = 0.0000
Reject the null hypothesis. There is a significant correlation.


#### Exercise 8 Wilcoxon test

In [29]:
import numpy as np
from scipy.stats import wilcoxon
import matplotlib.pyplot as plt

In [30]:
# Sample paired data
before_treatment = np.array([20, 22, 24, 19, 18, 21, 25, 23, 22, 20])
after_treatment = np.array([19, 24, 22, 20, 19, 20, 26, 22, 21, 19])

In [31]:
w, p_value = wilcoxon(before_treatment, after_treatment)
print(f'Wilcoxon Test Statistic: {w}')
print(f'p-value: {p_value:.4f}')

Wilcoxon Test Statistic: 23.0
p-value: 0.6953


In [32]:
# Interpret the results
alpha = 0.05  # significance level
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference between the two groups.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the two groups.")

Fail to reject the null hypothesis: There is no significant difference between the two groups.


#### Exercise 9 Kruskal-Wallis H Test

In [33]:
import numpy as np
from scipy.stats import kruskal

In [34]:
group1 = [22, 25, 28, 30, 32]
group2 = [18, 21, 23, 26, 29]
group3 = [15, 16, 19, 20, 24]

In [35]:
statistic, p_value = kruskal(group1, group2, group3)

In [36]:
print("Kruskal-Wallis H Test:")
print(f"Kruskal-Wallis H Statistic: {statistic:.2f}")
print(f"P-value: {p_value:.4f}")

Kruskal-Wallis H Test:
Kruskal-Wallis H Statistic: 6.86
P-value: 0.0324


In [37]:
if p_value < 0.05:
    print("There is a significant difference among the groups.")
else:
    print("There is no significant difference among the groups.")

There is a significant difference among the groups.


#### Exercise 10 Friedman Test

In [None]:
import numpy as np
from scipy.stats import friedmanchisquare

In [None]:
group1 = [10, 12, 15, 8, 11]
group2 = [8, 10, 13, 6, 12]
group3 = [9, 11, 14, 7, 10]

In [None]:
statistic, p_value = friedmanchisquare(group1, group2, group3)

In [None]:
print("Friedman Test:")
print(f"Friedman Test Statistic: {statistic:.2f}")
print(f"P-value: {p_value:.4f}")

In [None]:
if p_value < 0.05:
    print("There is a significant difference among the groups.")
else:
    print("There is no significant difference among the groups.")

#### Exercise 11 Mann-Whitney U Test

In [None]:
import numpy as np
from scipy.stats import mannwhitneyu

In [None]:
group1 = [20, 24, 22, 26, 21]
group2 = [30, 32, 31, 34, 35]

In [None]:
statistic, p_value = mannwhitneyu(group1, group2)

In [None]:
print("Mann-Whitney U Test:")
print(f"Mann-Whitney U Statistic: {statistic:.2f}")
print(f"P-value: {p_value:.4f}")

In [None]:
if p_value < 0.05:
    print("There is a significant difference between the two groups.")
else:
    print("There is no significant difference between the two groups.")

#### Exercise 12 Pearson’s Chi-Squared Test

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Sample data
data = {'Gender': ['Male', 'Male', 'Female', 'Female'],
        'Preference': ['Like', 'Dislike', 'Like', 'Dislike'],
        'Count': [50, 10, 20, 40]}
df = pd.DataFrame(data)
df

In [None]:
# Create a contingency table
contingency_table = df.pivot(index='Gender', columns='Preference', values='Count').fillna(0)

# Perform the test
chi2, p_value, _, expected = chi2_contingency(contingency_table)

print(f'Chi-Squared Value: {chi2:.2f}')
print(f'p-value: {p_value:.4f}')
print('Expected Frequencies:')
print(expected)

In [None]:
# Interpretation of the Chi-Squared Test results

alpha = 0.05  # significance level

print(f'Chi-Squared Value: {chi2:.2f}')
print(f'p-value: {p_value:.4f}')

if p_value <= alpha:
    print("The results are statistically significant. There is an association between the categorical variables.")
else:
    print("The results are not statistically significant. There is no evidence of an association between the categorical variables.")

# Visualization of Expected vs. Observed Frequencies
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Observed Frequencies
sns.heatmap(contingency_table, annot=True, cmap='coolwarm', fmt='g', ax=ax[0])
ax[0].set_title('Observed Frequencies')

# Expected Frequencies
sns.heatmap(expected, annot=True, cmap='coolwarm', fmt='.1f', ax=ax[1])
ax[1].set_title('Expected Frequencies')

plt.tight_layout()
plt.show()


#### Exercise 13 Spearman’s Rank Correlation

In [None]:
# Import necessary libraries
from numpy.random import rand
from numpy.random import seed
from scipy.stats import spearmanr

In [None]:
# Seed the random number generator
seed(1)

In [None]:
# Prepare data
data1 = rand(1000) * 20
data2 = data1 + (rand(1000) * 10)

In [None]:
# Calculate Spearman's correlation coefficient and p-value
coef, p = spearmanr(data1, data2)

In [None]:
# Print Spearman's correlation coefficient
print('Spearmans correlation coefficient: %.3f' % coef)

In [None]:
# Interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)

#### Exercise 14 Bootstrap Resampling

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
data = np.array([23, 45, 56, 78, 89, 12, 67, 49, 55, 77, 88, 90, 34, 56, 71])

In [None]:
n_iterations = 10000
bootstrap_means = []

for _ in range(n_iterations):
    bootstrap_sample = np.random.choice(data, size=len(data), replace=True)
    bootstrap_means.append(np.mean(bootstrap_sample))


In [None]:
confidence_level = 0.95
lower_percentile = (1 - confidence_level) / 2 * 100
upper_percentile = (1 + confidence_level) / 2 * 100

confidence_interval = (np.percentile(bootstrap_means, lower_percentile),
                       np.percentile(bootstrap_means, upper_percentile))

print(f'95% Confidence Interval for the Mean: {confidence_interval}')


In [None]:
plt.hist(bootstrap_means, bins=50, color='skyblue', edgecolor='black')
plt.axvline(confidence_interval[0], color='red', linestyle='dashed')
plt.axvline(confidence_interval[1], color='red', linestyle='dashed')
plt.title('Bootstrap Distribution of the Mean')
plt.xlabel('Mean')
plt.ylabel('Frequency')
plt.show()


#### Exercise 15 Normality Assumption

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import shapiro, probplot

In [None]:
data = np.random.randn(100)

In [None]:
plt.hist(data, bins=30, color='skyblue', edgecolor='black')
plt.title('Histogram of the Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

In [None]:
probplot(data, plot=plt)
plt.title('Q-Q Plot')
plt.show()


In [None]:
stat, p = shapiro(data)
print(f'Statistic: {stat}, p-value: {p}')


#### Excercise 16 Make Data Gaussian and Gaussian-Like

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox, yeojohnson, shapiro, probplot
from sklearn.preprocessing import QuantileTransformer

In [None]:
data = np.random.exponential(scale=2, size=1000)
plt.hist(data, bins=30, color='skyblue', edgecolor='black')
plt.title('Original Skewed Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()


In [None]:
data_log = np.log(data)
plt.hist(data_log, bins=30, color='lightgreen', edgecolor='black')
plt.title('Log Transformed Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()


In [None]:
data_boxcox, _ = boxcox(data)
plt.hist(data_boxcox, bins=30, color='lightcoral', edgecolor='black')
plt.title('Box-Cox Transformed Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()


In [None]:
data_yj, _ = yeojohnson(data)
plt.hist(data_yj, bins=30, color='lightpink', edgecolor='black')
plt.title('Yeo-Johnson Transformed Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()


In [None]:
transformer = QuantileTransformer(output_distribution='normal')
data_quantile = transformer.fit_transform(data.reshape(-1, 1)).flatten()
plt.hist(data_quantile, bins=30, color='lightsalmon', edgecolor='black')
plt.title('Quantile Transformed Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()


#### Excercise 17 Normality Assumption Exercise using the Iris Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro


In [None]:
iris = sns.load_dataset('iris')
iris.head()


In [None]:
iris.hist(figsize=(12, 10), bins=30, color='skyblue', edgecolor='black')
plt.suptitle('Histograms of Iris Dataset Features')
plt.show()


In [None]:
features = iris.columns[:-1]  # Exclude the 'species' column
for feature in features:
    stat, p = shapiro(iris[feature])
    print(f'{feature} - Statistic: {stat:.4f}, p-value: {p:.4f}')


In [None]:
from scipy.stats import probplot

for feature in features:
    plt.figure(figsize=(8, 6))
    probplot(iris[feature], plot=plt)
    plt.title(f'Q-Q Plot for {feature}')
    plt.show()


---

Revised Date: October 7, 2023