In [None]:
# Upload the required dataset (employee_data.csv) into the runtime's working directory.
# google.colab is only importable on Colab; on any other kernel the upload step is
# skipped and the CSV must already sit next to the notebook.
try:
    from google.colab import files
except ImportError:
    files = None

# Keep the return value in a variable: files.upload() returns {file name: file bytes},
# and leaving the call as the cell's last expression would echo the raw file contents.
uploaded = files.upload() if files is not None else {}
sorted(uploaded)  # display only the names of the uploaded files

In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import warnings
# Silence every Python warning for the rest of the session so library warnings
# do not clutter the cell outputs.
# NOTE(review): this also hides warnings that may matter (deprecations,
# numerical issues) — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [None]:
# Load the raw employee dataset from the working directory.
employee_data = pd.read_csv('employee_data.csv')

# Only the last expression of a cell is rendered automatically, so each preview
# is displayed explicitly; otherwise head() and describe() are silently discarded.
display(employee_data.head())
display(employee_data.describe())
employee_data.info()  # prints its report directly and returns None

In [None]:
# Normalise the column headings: trim surrounding whitespace, lower-case them,
# and collapse every run of whitespace/hyphens into a single underscore.
trimmed_names = employee_data.columns.str.strip()
lowercase_names = trimmed_names.str.lower()
employee_data.columns = lowercase_names.str.replace(r'[\s\-]+', '_', regex=True)
employee_data.head()



In [None]:
# Parse hire_date as a datetime, then store the listed columns as pandas categoricals.
employee_data['hire_date'] = pd.to_datetime(employee_data['hire_date'])

for column_name in ('gender', 'job_title', 'department', 'education_level'):
    employee_data[column_name] = employee_data[column_name].astype('category')


In [None]:
# Count the missing values in every column of the dataset
employee_data.isna().sum()

In [None]:
# Median split on training_hours: values at or below the median are labelled
# 'Low Training', everything else 'High Training'.
# NOTE(review): a missing training_hours value fails the comparison and would
# therefore be labelled 'High Training' — confirm the column has no nulls.
training_hours = employee_data['training_hours']
median_training = training_hours.median()
is_low_training = training_hours.le(median_training)
employee_data['training_group'] = np.where(is_low_training, 'Low Training', 'High Training')
employee_data['training_group'].value_counts()

In [None]:
# Summary statistics of performance_score within each training group
scores_by_group = employee_data.groupby('training_group')['performance_score']
descriptive_statistics = scores_by_group.describe()
descriptive_statistics

In [None]:
# Mean performance score of each training group (reused by the bar chart below)
performance_by_group = employee_data.groupby('training_group')['performance_score']
mean_performance = performance_by_group.mean()
mean_performance

In [None]:
# Bar chart of the mean performance score per training group (figure 4.1),
# drawn on an explicit Axes rather than through the pyplot state machine.
fig, ax = plt.subplots()
mean_performance.plot(kind='bar', ax=ax)
ax.set_title('figure 4.1:Average Performance Score by Training Group')
ax.set_xlabel('Training Group')
ax.set_ylabel('Average Performance Score')
# Keep the group names horizontal (pandas rotates bar labels by default)
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()


In [None]:
# Box plot of the performance-score distribution for each training group (figure 4.2)
box_ax = sns.boxplot(data=employee_data, x='training_group', y='performance_score')
box_ax.set_title('figure 4.2 Performance Score by Training Group')
plt.show()

**Hypothesis Formulation**

**Null Hypothesis:** Employee training hours do not have a statistically significant impact on employee productivity, as measured by performance scores; that is, the mean performance score is the same for the low-training and high-training groups.

**Alternate Hypothesis:** Employee training hours have a statistically significant impact on employee productivity, as measured by performance scores; that is, the mean performance score differs between the low-training and high-training groups.

In [None]:
# Performance scores of the two training groups
group_labels = employee_data['training_group']
performance_scores = employee_data['performance_score']
low = performance_scores[group_labels == 'Low Training']
high = performance_scores[group_labels == 'High Training']

# Welch's t-test (equal_var=False) comparing the high-training group against the low-training group
welch_result = ttest_ind(high, low, equal_var=False)
t_stat, p_value = welch_result.statistic, welch_result.pvalue
t_stat, p_value

**Hypothesis testing conclusion**

Independent samples t-test (Welch's, unequal variances) result:

- t-statistic = 1.03
- p-value = 0.3

Since the p-value is greater than 0.05, we fail to reject the null hypothesis. This indicates that there is no statistically significant difference in performance scores between employees with high training hours and those with low training hours.


**Business Interpretation**

Employees with higher training hours showed a slightly higher average performance score than employees with lower training hours, but the difference is not statistically significant. This suggests that training hours alone may not be sufficient to improve employee productivity; the data do not provide evidence of an effect. Besides training, other factors may influence productivity, such as the work environment, work-life balance, motivation, job satisfaction, and other psychological factors.

**Summary**

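This notebook analyzed the impact of employee training hours on productivity using hypothesis testing. Employees were split at the median of training hours into low-training and high-training groups, and their performance scores were compared with an independent samples (Welch's) t-test. Based on the test results, training hours were found to have no statistically significant impact on employee performance scores in the given dataset.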
