## Employee Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('employee_data.csv')
data.head(50)


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,John,28.0,HR,50000.0
1,2,Jane,35.0,Finance,60000.0
2,3,Emily,,HR,55000.0
3,4,Michael,40.0,Human Resources,
4,5,Sarah,29.0,IT,52000.0
5,6,David,50.0,Finance,75000.0
6,7,Laura,38.0,H.R.,68000.0
7,8,Robert,32.0,HR,57000.0
8,9,Linda,45.0,IT,62000.0
9,10,James,30.0,HR,51000.0


In [3]:
mean_age = data['Age'].mean()
mean_age_ceil = np.ceil(mean_age)

data['Age'].fillna(mean_age_ceil, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(mean_age_ceil, inplace=True)


In [4]:
mean_salary = data['Salary'].mean()
mean_salary_ceil = np.ceil(mean_salary)
data['Salary'].fillna(mean_salary_ceil, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Salary'].fillna(mean_salary_ceil, inplace=True)


In [5]:
unique_departments = data['Department'].unique()
unique_departments

array(['HR', 'Finance', 'Human Resources', 'IT', 'H.R.'], dtype=object)

In [6]:
data['Department'] = data['Department'].replace({
    'H.R.':'HR',
    'HR': 'HR',
    'Human Resources': 'HR',

})


In [7]:
data.drop_duplicates(subset='ID', keep='first', inplace=True)
data


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,John,28.0,HR,50000.0
1,2,Jane,35.0,Finance,60000.0
2,3,Emily,36.0,HR,55000.0
3,4,Michael,40.0,HR,58100.0
4,5,Sarah,29.0,IT,52000.0
5,6,David,50.0,Finance,75000.0
6,7,Laura,38.0,HR,68000.0
7,8,Robert,32.0,HR,57000.0
8,9,Linda,45.0,IT,62000.0
9,10,James,30.0,HR,51000.0


## Student data

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
std_df = pd.read_csv('student_scores.csv')
std_df.head()

Unnamed: 0,StudentID,Math,Science,English
0,1,78,65,80
1,2,88,75,85
2,3,60,50,55
3,4,90,78,92
4,5,55,48,58


In [10]:
scaler = MinMaxScaler()
std_df[['Math', 'Science', 'English']] = scaler.fit_transform(std_df[['Math', 'Science', 'English']])
std_df.head()


Unnamed: 0,StudentID,Math,Science,English
0,1,0.657143,0.53125,0.675676
1,2,0.942857,0.84375,0.810811
2,3,0.142857,0.0625,0.0
3,4,1.0,0.9375,1.0
4,5,0.0,0.0,0.081081


## Customer Data

In [11]:
cus_df = pd.read_csv('customer_ages.csv')
cus_df.head()

Unnamed: 0,CustomerID,Age
0,1,25
1,2,42
2,3,36
3,4,53
4,5,28


In [12]:
bins = [18, 30, 50, 100]
labels = ['Young', 'Middle-aged', 'Senior']
cus_df['AgeGroup'] = pd.cut(cus_df['Age'], bins=bins, labels=labels, right=False)

cus_df.head()

Unnamed: 0,CustomerID,Age,AgeGroup
0,1,25,Young
1,2,42,Middle-aged
2,3,36,Middle-aged
3,4,53,Senior
4,5,28,Young


## Sales data

In [13]:
sales_df = pd.read_csv('sales_data.csv')
sales_df.head()

Unnamed: 0,Month,Sales
0,January,15000
1,February,18000
2,March,12000
3,April,30000
4,May,22000


In [14]:
bins = [0, 5000, 20000, float('inf')]
labels = ['Low', 'Medium', 'High']
sales_df['SalesCategory'] = pd.cut(sales_df['Sales'], bins=bins, labels=labels)
sales_df.head()

Unnamed: 0,Month,Sales,SalesCategory
0,January,15000,Medium
1,February,18000,Medium
2,March,12000,Medium
3,April,30000,High
4,May,22000,High


In [15]:
sales_category_distribution = sales_df['SalesCategory'].value_counts()
print("\nSales Category Distribution:\n", sales_category_distribution)



Sales Category Distribution:
 SalesCategory
Medium    7
High      4
Low       1
Name: count, dtype: int64


## Medical Data

In [16]:
from sklearn.feature_selection import SelectKBest, chi2


In [17]:
med_df = pd.read_csv('medical_data.csv')
med_df.head()

Unnamed: 0,PatientID,Age,BloodPressure,Cholesterol,Glucose,HeartRate,Disease
0,1,45,130,180,95,70,1
1,2,50,140,200,105,75,1
2,3,60,150,240,120,80,1
3,4,40,120,170,90,65,0
4,5,35,110,160,85,60,0


In [18]:
X = med_df.drop(columns=['Disease'])
y = med_df['Disease']

In [19]:
selector = SelectKBest(score_func=chi2, k=3)
selector.fit(X, y)

0,1,2
,score_func,<function chi...001C7D8479940>
,k,3


In [20]:
top_features = X.columns[selector.get_support()]
print("\nTop 3 Features for Predicting Disease:\n", top_features)


Top 3 Features for Predicting Disease:
 Index(['Age', 'Cholesterol', 'Glucose'], dtype='object')
