In [1]:
# Import necessary library
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [3]:
data={'Age':[25,23,np.nan,45,34],
     'Gender':['Male','Female','Female','Male','Male'],
     'Salary':[500000,5200000,650000,450000,8500000],
     'Productivity':[10,8,9,7,11],
     'Text':['hello World','python is great','Ai is great','ML is good','It is very good'],
     'Religion':['religion1','religion2','religion3','religion4','religion5']}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,500000,10,hello World,religion1
1,23.0,Female,5200000,8,python is great,religion2
2,,Female,650000,9,Ai is great,religion3
3,45.0,Male,450000,7,ML is good,religion4
4,34.0,Male,8500000,11,It is very good,religion5


In [4]:
# Imputation
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,500000,10,hello World,religion1
1,23.0,Female,5200000,8,python is great,religion2
2,31.75,Female,650000,9,Ai is great,religion3
3,45.0,Male,450000,7,ML is good,religion4
4,34.0,Male,8500000,11,It is very good,religion5


In [5]:
# Normalization/Scaling
scaler = MinMaxScaler()
df[['Salary', 'Productivity']] = scaler.fit_transform(df[['Salary', 'Productivity']])
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,0.006211,0.75,hello World,religion1
1,23.0,Female,0.590062,0.25,python is great,religion2
2,31.75,Female,0.024845,0.5,Ai is great,religion3
3,45.0,Male,0.0,0.0,ML is good,religion4
4,34.0,Male,1.0,1.0,It is very good,religion5


In [6]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=['Gender'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male
0,25.0,0.006211,0.75,hello World,religion1,False,True
1,23.0,0.590062,0.25,python is great,religion2,True,False
2,31.75,0.024845,0.5,Ai is great,religion3,True,False
3,45.0,0.0,0.0,ML is good,religion4,False,True
4,34.0,1.0,1.0,It is very good,religion5,False,True


In [7]:
# Label Encoding
label_encoder = LabelEncoder()
df['Religion_LabelEncoded'] = label_encoder.fit_transform(df['Religion'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded
0,25.0,0.006211,0.75,hello World,religion1,False,True,0
1,23.0,0.590062,0.25,python is great,religion2,True,False,1
2,31.75,0.024845,0.5,Ai is great,religion3,True,False,2
3,45.0,0.0,0.0,ML is good,religion4,False,True,3
4,34.0,1.0,1.0,It is very good,religion5,False,True,4


In [8]:
# Binning/Discretization
df['Age_Bin'] = pd.cut(df['Age'], bins=3, labels=['Young', 'Middle-aged', 'Old'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin
0,25.0,0.006211,0.75,hello World,religion1,False,True,0,Young
1,23.0,0.590062,0.25,python is great,religion2,True,False,1,Young
2,31.75,0.024845,0.5,Ai is great,religion3,True,False,2,Middle-aged
3,45.0,0.0,0.0,ML is good,religion4,False,True,3,Old
4,34.0,1.0,1.0,It is very good,religion5,False,True,4,Middle-aged


In [9]:
# Feature Interactions
df['Salary_Productivity'] = df['Salary'] * df['Productivity']
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin,Salary_Productivity
0,25.0,0.006211,0.75,hello World,religion1,False,True,0,Young,0.004658
1,23.0,0.590062,0.25,python is great,religion2,True,False,1,Young,0.147516
2,31.75,0.024845,0.5,Ai is great,religion3,True,False,2,Middle-aged,0.012422
3,45.0,0.0,0.0,ML is good,religion4,False,True,3,Old,0.0
4,34.0,1.0,1.0,It is very good,religion5,False,True,4,Middle-aged,1.0


In [10]:
# Fill the missing value of the salary column
df['Salary'].fillna(0, inplace=True)
df['Salary_Productivity'].fillna(0, inplace=True)
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin,Salary_Productivity
0,25.0,0.006211,0.75,hello World,religion1,False,True,0,Young,0.004658
1,23.0,0.590062,0.25,python is great,religion2,True,False,1,Young,0.147516
2,31.75,0.024845,0.5,Ai is great,religion3,True,False,2,Middle-aged,0.012422
3,45.0,0.0,0.0,ML is good,religion4,False,True,3,Old,0.0
4,34.0,1.0,1.0,It is very good,religion5,False,True,4,Middle-aged,1.0


In [11]:

# Feature Selection
selector = SelectKBest(score_func=f_regression, k=2)
X = df.drop(['Religion', 'Text', 'Age_Bin'], axis=1)  # dropping non-numeric columns for feature selection
X_new = selector.fit_transform(X, df['Productivity'])
X_new

array([[0.75      , 0.00465839],
       [0.25      , 0.14751553],
       [0.5       , 0.01242236],
       [0.        , 0.        ],
       [1.        , 1.        ]])

In [None]:
# Dimensionality Reduction
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
df['PCA1'] = principal_components[:, 0]
df['PCA2'] = principal_components[:, 1]
df.head()