In [26]:
#Import Packages
import pandas as pd
from scipy import stats
import numpy as np

# Load File
df = pd.read_excel('Salaries.xlsx', engine='openpyxl')

# Extract data
salary = df['Salary']
experience = df['Experience']

# Define sample statistics
n = len(salary)
mean_salary = np.mean(salary)
std_salary = np.std(salary, ddof=1)

In [28]:
#---------------------------------------------
# Question 4 Part A - 95% Confidence Interval (Two-Tailed)
#---------------------------------------------

# Define variables
confidence = 0.95
alpha = 1 - confidence
t_crit = stats.t.ppf(1 - alpha/2, n - 1)

# Calculate statistics
margin = t_crit * (std_salary / np.sqrt(n))
lower = mean_salary - margin
upper = mean_salary + margin

# Output statement
print(f"With a mean Salary of ${mean_salary:,.2f}, ")
print(f"The 95% Confidence Interval for the average Salary for employees in the company is $({lower:,.2f} to ${upper:,.2f})")
print()
print(
    f"An interpretation of these results is that we are 95% confident that the mean salary "
    f"of an employee, ${mean_salary:,.2f}, lies between ${lower:,.2f} and ${upper:,.2f}."
)

With a mean Salary of $45,141.51, 
The 95% Confidence Interval for the average Salary for employees in the company is $(42,583.80 to $47,699.21)

An interpretation of these results is that we are 95% confident that the mean salary of an employee, $45,141.51, lies between $42,583.80 and $47,699.21.


In [58]:
#---------------------------------------------
# Question 4, Part B - Can it confidently be concluded that the average experience in the companhy is less than 6 yrs?
#---------------------------------------------

# Define Variables
mu_0 = 6
alpha = 0.05

# Calculate statistics
t_stat, p_value_two_tailed = stats.ttest_1samp(experience, mu_0)

# Convert to one-tailed (since mean < 6)
if t_stat < 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - (p_value_two_tailed / 2)

# Output statement
print("Key Values:")
print(f"{'-'*60}")
print(f"Sample Mean Experience: {experience.mean():.2f} years")
print(f"Hypothesized Mean (μ₀): {mu_0:.2f} years")
print(f"T-Statistic: {t_stat:.3f}")
print(f"One-tailed P-Value: {p_value_one_tailed:.4f}")
print(f"Significance Level (α): {alpha:.2f}")

print(f"{'-'*60}\n")

# Decision statement
print("Test: ")
if p_value_one_tailed < alpha:
    print("Reject the null hypothesis (H0).")
    print("There is significant evidence that the average experience is less than 6 years.")
else:
    print("Fail to reject the null hypothesis (H1).")
    print("There is insufficient evidence that the average experience is less than 6 years.")

Key Values:
------------------------------------------------------------
Sample Mean Experience: 5.75 years
Hypothesized Mean (μ₀): 6.00 years
T-Statistic: -0.659
One-tailed P-Value: 0.2560
Significance Level (α): 0.05
------------------------------------------------------------

Test: 
Fail to reject the null hypothesis (H1).
There is insufficient evidence that the average experience is less than 6 years.


In [70]:
#---------------------------------------------
# Question 5 - Does the average salary of 
#              employees in Level C exceed that 
#              of the average salary of employees 
#              in Level A by more than $10k?
#---------------------------------------------

# Hypothesis
# muC = mean salary of Level C employees
# muA = mean salary of Level A employees
# Test = whether Salaries of Level C exceed Level A by > $10,000
# H0: uC - uA <= 10000 (average salary difference <= $10,000)
# H1: uC - uA >  10000 (average salary difference >  $10,000)

# Define variables
level_A = df[df['Level'] == 'A']['Salary']
level_C = df[df['Level'] == 'C']['Salary']

# Compute statistics
mean_A = level_A.mean()
mean_C = level_C.mean()
n_A = level_A.count()
n_C = level_C.count()

# Step 1: Shift mean difference by $10,000
t_stat, p_value_two_tailed = stats.ttest_ind(level_C, level_A, equal_var = False)
mean_diff = mean_C - mean_A
std_err = np.sqrt(level_C.var(ddof=1)/n_C + level_A.var(ddof=1)/n_A)
t_stat_adjusted = (mean_diff - 10000) / std_err

# Step 2: Convert two-tailed p-value to one-tailed
p_value_one_tailed = 1 - stats.t.cdf(t_stat_adjusted, df=min(n_A, n_C) - 1)

# Output statement
print("Key Values")
print(f"{'-'*60}")
print(f"Mean Salary (Level A): ${mean_A:,.2f}")
print(f"Mean Salary (Level C): ${mean_C:,.2f}")
print(f"Observed Mean Difference: ${mean_diff:,.2f}")
print(f"T-Statistic (Adjusted for $10,000 difference): {t_stat_adjusted:.3f}")
print(f"One-tailed P-Value: {p_value_one_tailed:.4f}")
print(f"Significance Level (alpha): 0.05")
print(f"{'-'*60}\n")

# Decision statement
print("Test:")
if p_value_one_tailed < 0.05:
    print("Reject the null hypothesis (H₀).")
    print("There is significant evidence that Level C salaries exceed Level A by more than $10,000.")
else:
    print("Fail to reject the null hypothesis (H₀).")
    print("There is not enough evidence that Level C salaries exceed Level A by more than $10,000.")




Key Values
------------------------------------------------------------
Mean Salary (Level A): $41,010.87
Mean Salary (Level C): $53,926.89
Observed Mean Difference: $12,916.02
T-Statistic (Adjusted for $10,000 difference): 1.327
One-tailed P-Value: 0.1106
Significance Level (alpha): 0.05
------------------------------------------------------------

Test:
Fail to reject the null hypothesis (H₀).
There is not enough evidence that Level C salaries exceed Level A by more than $10,000.
