# Before you start:
- Read the README.md file
- Comment as much as you can and use the resources (README.md file)
- Happy learning!

In [6]:
# import numpy and pandas
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, mode, skew, gaussian_kde, pearsonr, spearmanr, beta
from statsmodels.stats.weightstats import ztest as ztest

from scipy.stats import ttest_ind, norm, t
from scipy.stats import f_oneway
from scipy.stats import sem

# Challenge 1 - Exploring the Data

In this challenge, we will examine all salaries of employees of the City of Chicago. We will start by loading the dataset and examining its contents.

In [2]:
# Load the City of Chicago employee dataset from the CSV file (path is relative).
salaries = pd.read_csv(
    filepath_or_buffer='../data/Current_Employee_Names__Salaries__and_Position_Titles.csv',
)

Examine the `salaries` dataset using the `head` function below.

In [3]:
# Preview the first five rows of the dataset.
salaries.head(n=5)

Unnamed: 0,Name,Job Titles,Department,Full or Part-Time,Salary or Hourly,Typical Hours,Annual Salary,Hourly Rate
0,"AARON, JEFFERY M",SERGEANT,POLICE,F,Salary,,101442.0,
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,F,Salary,,94122.0,
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,F,Salary,,101592.0,
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,F,Salary,,110064.0,
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,P,Hourly,20.0,,19.86


In [5]:
# Overview of the columns: dtypes and non-null counts.
salaries.info()

# How many rows have no value in 'Hourly Rate'?
hourly_rate = salaries['Hourly Rate']
nan_count = hourly_rate.isnull().sum()
print(f"Number of NaN values in 'Hourly Rate': {nan_count}")

# Keep only the rows that carry an hourly rate.
salaries_cleaned = salaries.dropna(subset=['Hourly Rate'], how='any')

# Sanity check: number of rows left that have an hourly rate.
non_nan_count = salaries_cleaned['Hourly Rate'].count()
print(f"Number of rows with a value in 'Hourly Rate': {non_nan_count}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33183 entries, 0 to 33182
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               33183 non-null  object 
 1   Job Titles         33183 non-null  object 
 2   Department         33183 non-null  object 
 3   Full or Part-Time  33183 non-null  object 
 4   Salary or Hourly   33183 non-null  object 
 5   Typical Hours      8022 non-null   float64
 6   Annual Salary      25161 non-null  float64
 7   Hourly Rate        8022 non-null   float64
dtypes: float64(3), object(5)
memory usage: 2.0+ MB
Number of NaN values in 'Hourly Rate': 25161
Number of rows with a value in 'Hourly Rate': 8022


In [13]:
# Summary statistics for the numeric columns of the rows that have an hourly rate.
# In the recorded output 'Annual Salary' has count 0, i.e. it is NaN for every
# remaining row, so its other statistics are NaN as well.
salaries_cleaned.describe()

Unnamed: 0,Typical Hours,Annual Salary,Hourly Rate
count,8022.0,0.0,8022.0
mean,34.507604,,32.788558
std,9.252077,,12.112573
min,10.0,,2.65
25%,20.0,,21.2
50%,40.0,,35.6
75%,40.0,,40.2
max,40.0,,109.0


In [7]:
# Display the subset of rows that have an hourly rate; the rendered table is
# truncated to the first and last rows. The original row index is preserved.
salaries_cleaned

Unnamed: 0,Name,Job Titles,Department,Full or Part-Time,Salary or Hourly,Typical Hours,Annual Salary,Hourly Rate
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,P,Hourly,20.0,,19.86
6,"ABBATACOLA, ROBERT J",ELECTRICAL MECHANIC,AVIATION,F,Hourly,40.0,,46.10
7,"ABBATE, JOSEPH L",POOL MOTOR TRUCK DRIVER,STREETS & SAN,F,Hourly,40.0,,35.60
10,"ABBOTT, BETTY L",FOSTER GRANDPARENT,FAMILY & SUPPORT,P,Hourly,20.0,,2.65
18,"ABDULLAH, LAKENYA N",CROSSING GUARD,OEMC,P,Hourly,20.0,,17.68
...,...,...,...,...,...,...,...,...
33164,"ZUREK, FRANCIS",ELECTRICAL MECHANIC,OEMC,F,Hourly,40.0,,46.10
33168,"ZWARYCZ MANN, IRENE A",CROSSING GUARD,OEMC,P,Hourly,20.0,,17.68
33169,"ZWARYCZ, THOMAS J",POOL MOTOR TRUCK DRIVER,WATER MGMNT,F,Hourly,40.0,,35.60
33174,"ZYGADLO, JOHN P",MACHINIST (AUTOMOTIVE),GENERAL SERVICES,F,Hourly,40.0,,46.35


# Challenge 2
This is a placeholder so that the AI corrector can find the correct exercise for feedback.

# Challenge 3 - Constructing Confidence Intervals

We will test whether the hourly wage of all hourly workers is significantly different from $30/hr.

In the cell below, we will construct a 95% confidence interval for the mean hourly wage of all hourly workers. Is $30/hr within that interval?

The confidence interval is computed in SciPy using the `t.interval` function. You can read more about this function [here](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.t.html).

To compute the confidence interval of the hourly wage, use 0.95 for the confidence level, the number of rows - 1 for the degrees of freedom, the sample mean for the location parameter, and the standard error for the scale. The standard error can be computed using [this](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.sem.html) function in SciPy.

In [17]:
# Your code here
# 95% confidence interval for the mean hourly rate of all hourly workers.
hourly_rate = salaries_cleaned['Hourly Rate']

# Sample mean (`mean_hourly_rate` is kept as an alias of `sample_mean`).
sample_mean = mean_hourly_rate = hourly_rate.mean()
print(f"Mean Hourly Rate: {mean_hourly_rate}")

# Sample standard deviation; ddof=1 gives the unbiased (n - 1) estimate.
sample_std = hourly_rate.std(ddof=1)
print(f"Sample Standard Deviation: {sample_std}")

# Sample size, standard error of the mean and degrees of freedom.
n = len(hourly_rate)
se = sem(hourly_rate.to_numpy())  # sem defaults to ddof=1, i.e. sample_std / sqrt(n)
df = n - 1

# 95% confidence interval from the t distribution.
ci_lower, ci_upper = t.interval(0.95, df, loc=sample_mean, scale=se)

# Check if $30 is in the interval.
is_30_in_interval = ci_lower <= 30 <= ci_upper

print(f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})")
print(f"Is $30 within the interval? {'Yes' if is_30_in_interval else 'No'}")

Mean Hourly Rate: 32.78855771628024
Sample Standard Deviation: 12.11257268427681
95% Confidence Interval: (32.52, 33.05)
Is $30 within the interval? No


This is fine if we have data on thousands of workers. But what if we only have data on 100 workers?

Sample 100 workers and re-construct the 95% confidence interval. Is the interval wider or narrower? And why?
Does the interval contain the $30/hr mark in this case?

In [23]:
# Your code here

# Draw 100 hourly rates, without replacement, from the rows that have an hourly rate.
np.random.seed(42)  # For reproducibility
sample_100 = np.random.choice(salaries_cleaned['Hourly Rate'], size=100, replace=False)

# Statistics of the 100-worker sample only.
sample_mean = np.mean(sample_100)
sample_std_100 = sample_100.std(ddof=1)  # ddof=1 for unbiased estimate
n = len(sample_100)

# The standard error must use this sample's own standard deviation, not the
# standard deviation of the full data computed in an earlier cell.
se = sample_std_100 / np.sqrt(n)
df = n - 1

# 95% confidence interval. The standard error shrinks with sqrt(n), so a sample
# of 100 typically gives a wider interval than the full data does.
ci_lower, ci_upper = t.interval(0.95, df, loc=sample_mean, scale=se)

print(f"95% CI with 100 workers: ({ci_lower:.2f}, {ci_upper:.2f})")
print("$30/hr is in the interval?" , "Yes" if ci_lower <= 30 <= ci_upper else "No")



95% CI with 100 workers: (30.83, 36.08)
$30/hr is in the interval? No
