In [88]:
import pandas as pd
import numpy as np
df = pd.read_csv('employee_promotion.csv')

In [89]:
df.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion


In [90]:
df.shape

(5000, 9)

# Feature Encoding

In [91]:
from sklearn.preprocessing import LabelEncoder

# Copying the original dataset
df_encoded = df.copy()

# Label Encoding for Promotion (mostly for target variable)
label_encoder = LabelEncoder()
df_encoded["Promotion_Enc"] = label_encoder.fit_transform(df_encoded["Promotion"])

df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion,0
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion,1
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion,0
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion,1
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion,1
...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion,1
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion,0
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion,0
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion,1


In [92]:
# label encoding with order for Education (for feature having ordinal categories)
education_encoded_dict = {'High School':0,"Bachelor's":1, "Master's":2, 'PhD':3}
df_encoded["Education_Enc"]= df_encoded["Education"].map(education_encoded_dict)
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc,Education_Enc
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion,0,0
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion,1,2
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion,0,3
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion,1,0
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion,1,0
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion,0,1
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion,0,3
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion,1,1


In [93]:
#  One-Hot Encoding for City
df_encoded = pd.get_dummies(df_encoded, columns=["City"], prefix="City")
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True


In [94]:
# Frequency Encoding for JobTitle
job_title_counts = df_encoded["JobTitle"].value_counts().to_dict()
df_encoded["JobTitle_Freq_Enc"] = df_encoded["JobTitle"].map(job_title_counts)
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle,JobTitle_Freq_Enc
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True,699
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False,705
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False,733
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False,699
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False,750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False,705
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False,705
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False,733
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True,705


In [95]:
# Target Encoding for Education (Encoding it based on the average promotion rate)
education_target_mean = df_encoded.groupby("Education")["Promotion_Enc"].mean().to_dict()
df_encoded["Education_Target_Enc"] = df_encoded["Education"].map(education_target_mean)
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle,JobTitle_Freq_Enc,Education_Target_Enc
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True,699,0.697692
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False,705,0.718826
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False,733,0.702893
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False,699,0.697692
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False,750,0.718593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False,705,0.697692
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False,705,0.718593
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False,733,0.702893
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True,705,0.718593


In [96]:
df

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion
...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion


# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Copying the dataset
df_scaled = df_encoded.copy()
  
# Standard Scaler
standard_scaler = StandardScaler()
df_scaled["Salary_StandardScaled"] = standard_scaler.fit_transform(df_scaled[["Salary"]])

# Min-Max Scaler
minmax_scaler = MinMaxScaler()
df_scaled["Salary_MinMaxScaled"] = minmax_scaler.fit_transform(df_scaled[["Salary"]])

# Displaying the first few rows
df_scaled[["Salary", "Salary_StandardScaled", "Salary_MinMaxScaled"]].head()

Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
0,132612,1.12006,0.826129
1,116641,0.572455,0.666363
2,113811,0.475421,0.638053
3,102160,0.075938,0.521503
4,101313,0.046897,0.51303


In [98]:
df_scaled[["Salary", "Salary_StandardScaled", "Salary_MinMaxScaled"]].describe()

Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
count,5000.0,5000.0,5000.0
mean,99945.2532,-2.192024e-16,0.499347
std,29168.083473,1.0001,0.291783
min,50028.0,-1.711537,0.0
25%,74622.5,-0.8682533,0.246031
50%,99449.0,-0.01701527,0.494383
75%,125889.5,0.8895628,0.758881
max,149993.0,1.716011,1.0


# Feature Extraction

In [99]:
import pandas as pd
df = pd.read_csv('retail_customer.csv')
df.head()

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,device_type,is_subscribed,feedback_score
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,Mobile,1,2.7
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,Mobile,1,1.6
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,Mobile,0,3.7
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,Mobile,1,1.9
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,Mobile,1,1.4


In [100]:
df.shape

(500, 13)

In [101]:
df.dtypes

customer_id            object
name                   object
age                     int64
gender                 object
signup_date            object
last_purchase_date     object
total_purchases         int64
total_spent           float64
country                object
email                  object
device_type            object
is_subscribed           int64
feedback_score        float64
dtype: object

In [102]:
# Binning Age into categories
bins = [10, 30, 50, 70]  # Age ranges
labels = ["Young", "Mid", "Senior"]  # Categories
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels)

# Displaying the first few rows
df[["age", "age_group"]].head(20)

Unnamed: 0,age,age_group
0,56,Senior
1,69,Senior
2,46,Mid
3,32,Mid
4,60,Senior
5,25,Young
6,38,Mid
7,56,Senior
8,36,Mid
9,40,Mid


In [103]:
# converting dates columns to datetime 
df['signup_date'] = pd.to_datetime(df['signup_date'])
df['last_purchase_date'] = pd.to_datetime(df['last_purchase_date'])

In [104]:
df.dtypes

customer_id                   object
name                          object
age                            int64
gender                        object
signup_date           datetime64[ns]
last_purchase_date    datetime64[ns]
total_purchases                int64
total_spent                  float64
country                       object
email                         object
device_type                   object
is_subscribed                  int64
feedback_score               float64
age_group                   category
dtype: object

In [105]:
# Time-based features
df['signup_year'] = df['signup_date'].dt.year
df['signup_month'] = df['signup_date'].dt.month
df['days_since_signup'] = (pd.Timestamp.today() - df['signup_date']).dt.days

df['days_since_last_purchase'] = (pd.Timestamp.today() - df['last_purchase_date']).dt.days


In [106]:
# Name features
df['first_name'] = df['name'].apply(lambda x: x.split()[0])
df['last_name'] = df['name'].apply(lambda x: x.split()[-1])
df['name_length'] = df['name'].apply(len)

In [107]:
# High spender flag
df['is_high_spender'] = (df['total_spent'] > 5000).astype(int)


In [108]:
df

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,...,feedback_score,age_group,signup_year,signup_month,days_since_signup,days_since_last_purchase,first_name,last_name,name_length,is_high_spender
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,...,2.7,Senior,2025,1,166,369,Andrew,Miller,13,1
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,...,1.6,Senior,2025,5,31,256,Kevin,Ramos,11,1
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,...,3.7,Mid,2025,4,62,50,John,Smith,10,0
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,...,1.9,Mid,2023,10,615,239,Dustin,Nolan,12,1
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,...,1.4,Senior,2024,2,500,105,Amy,Johnson,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,cae07587-71ec-4ad9-95d3-1733b04039f6,Natalie Clark,65,Male,2024-04-12,2023-09-30,40,3550.88,Christmas Island,coxelizabeth@example.com,...,2.1,Senior,2024,4,430,625,Natalie,Clark,13,0
496,91c6672b-6ace-4367-9e31-8bd1d8fea975,Jennifer Jones,42,Male,2024-07-18,2023-10-23,11,1676.95,Vanuatu,curtisjesse@example.org,...,3.7,Mid,2024,7,333,602,Jennifer,Jones,14,0
497,9521ecfa-af06-4865-9a4b-e70827ba2485,Jeremy Luna,57,Male,2025-04-20,2023-10-27,36,6089.04,Nauru,uwalker@example.org,...,1.1,Senior,2025,4,57,598,Jeremy,Luna,11,1
498,28da13d6-c253-4279-b3aa-e41599598e9f,Leah Williams,62,Female,2022-06-20,2024-08-20,59,3249.30,British Virgin Islands,perryjacob@example.com,...,2.2,Senior,2022,6,1092,300,Leah,Williams,13,0


# Feature Selection

In [109]:
df = pd.read_csv('employee_interview.csv')
df.head()

Unnamed: 0,Age,Years_of_Experience,Education_Level,Certifications_Count,Previous_Employers,Tech_Skill_Score,Communication_Score,Problem_Solving_Score,Leadership_Score,Teamwork_Score,...,Current_CTC,Expected_CTC,Notice_Period_Days,Job_Changes_Count,Projects_Completed,Hackathons_Participated,Client_Interaction_Score,Python_Proficiency,SQL_Proficiency,Target
0,50.0,4.0,2.0,1.0,3.0,86.0,70.0,61.0,98.0,48.0,...,4.19,16.7,31.0,0.0,6.0,2.0,91.0,42.0,58.0,1
1,60.0,0.0,1.0,5.0,0.0,63.0,81.0,84.0,34.0,72.0,...,10.38,19.86,35.0,3.0,0.0,0.0,69.0,0.0,79.0,1
2,18.0,12.0,1.0,3.0,3.0,81.0,57.0,54.0,37.0,85.0,...,7.63,1.0,23.0,2.0,1.0,0.0,44.0,66.0,65.0,0
3,19.0,2.0,5.0,1.0,4.0,63.0,74.0,100.0,85.0,76.0,...,2.92,1.0,54.0,2.0,14.0,5.0,89.0,100.0,69.0,0
4,60.0,0.0,2.0,-0.0,0.0,77.0,84.0,53.0,89.0,39.0,...,18.53,12.03,32.0,5.0,8.0,4.0,71.0,23.0,94.0,0


In [110]:
df.columns

Index(['Age', 'Years_of_Experience', 'Education_Level', 'Certifications_Count',
       'Previous_Employers', 'Tech_Skill_Score', 'Communication_Score',
       'Problem_Solving_Score', 'Leadership_Score', 'Teamwork_Score',
       'Work_Life_Balance_Score', 'Current_CTC', 'Expected_CTC',
       'Notice_Period_Days', 'Job_Changes_Count', 'Projects_Completed',
       'Hackathons_Participated', 'Client_Interaction_Score',
       'Python_Proficiency', 'SQL_Proficiency', 'Target'],
      dtype='object')

## 1. Filter Methods

### 1.1 Correlation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

X = df.drop(['Target','Expected_CTC'], axis =1) #x contains the features (independent variables)—every column except these two
y = df['Expected_CTC']

# train-test split 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, random_state=42)

# Initialize and train the Multiple Regression model
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('Multiple Linear Regression Accuracy:', round(r2_score(y_test, y_pred),3))


Multiple Linear Regression Accuracy: 0.792


In [112]:
for i in df.columns:
    for index, value in df.corr()[i].to_dict().items():
        if abs(value)>0.53 and abs(value)!=1:
            print(i,',',index, round(value,2))
            

Certifications_Count , Projects_Completed -0.54
Previous_Employers , Job_Changes_Count -0.61
Expected_CTC , Client_Interaction_Score 0.57
Job_Changes_Count , Previous_Employers -0.61
Projects_Completed , Certifications_Count -0.54
Client_Interaction_Score , Expected_CTC 0.57


In [113]:
df.corr()['Expected_CTC'].sort_values()

Projects_Completed         -0.319165
Problem_Solving_Score      -0.314635
Hackathons_Participated    -0.303393
Current_CTC                -0.219592
Python_Proficiency         -0.190006
Teamwork_Score             -0.090790
Tech_Skill_Score           -0.041756
Job_Changes_Count          -0.028765
Target                     -0.028099
Work_Life_Balance_Score    -0.011123
Previous_Employers          0.015714
Communication_Score         0.016597
Notice_Period_Days          0.016840
SQL_Proficiency             0.022820
Certifications_Count        0.038531
Years_of_Experience         0.074915
Education_Level             0.373949
Age                         0.387056
Leadership_Score            0.504003
Client_Interaction_Score    0.571400
Expected_CTC                1.000000
Name: Expected_CTC, dtype: float64

In [114]:
# removing multicollinear features
X = df.drop(['Target','Expected_CTC','Previous_Employers','Certifications_Count'], axis =1)
y = df['Expected_CTC']

# train-test split 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, random_state=42)

# Initialize and train the Multiple Regression model
model =  LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('Multiple Linear Regression Accuracy:', round(r2_score(y_test, y_pred),3))


Multiple Linear Regression Accuracy: 0.792


### 1.2 Chi-Square

In [115]:
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [116]:
cat_df=df[['sex','embarked','class','alone','sibsp','parch','fare']]
cat_df.head()

Unnamed: 0,sex,embarked,class,alone,sibsp,parch,fare
0,male,S,Third,False,1,0,7.25
1,female,C,First,False,1,0,71.2833
2,female,S,Third,True,0,0,7.925
3,female,S,First,False,1,0,53.1
4,male,S,Third,True,0,0,8.05


In [117]:
from sklearn.preprocessing import LabelEncoder
for col in ['sex','embarked','class','alone']:
    encoder = LabelEncoder()
    cat_df[col] = encoder.fit_transform(cat_df[col])

In [118]:
cat_df

Unnamed: 0,sex,embarked,class,alone,sibsp,parch,fare
0,1,2,2,0,1,0,7.2500
1,0,0,0,0,1,0,71.2833
2,0,2,2,1,0,0,7.9250
3,0,2,0,0,1,0,53.1000
4,1,2,2,1,0,0,8.0500
...,...,...,...,...,...,...,...
886,1,2,1,1,0,0,13.0000
887,0,2,0,1,0,0,30.0000
888,0,2,2,0,1,2,23.4500
889,1,0,0,1,0,0,30.0000


In [119]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
X = cat_df
y = df['survived']

# train-test split 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, random_state=42)

# Initialize and train the SVC model
model = SVC(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('SVC Accuracy:', round(accuracy_score(y_test, y_pred),3))

SVC Accuracy: 0.654


In [120]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif

chi2_selector = SelectKBest(score_func=chi2, k=2)
chi2_selector.fit(X[['sex','embarked','class','alone','sibsp','parch']], y)

# Show selected features
chi2_scores = pd.Series(chi2_selector.scores_, index=['sex','embarked','class','alone','sibsp','parch'])
print("Chi2 Top Features:\n", chi2_scores.sort_values(ascending=False))


Chi2 Top Features:
 sex         92.702447
class       54.465866
alone       14.640793
parch       10.097499
embarked     9.755456
sibsp        2.581865
dtype: float64


In [121]:
X_chi2 = chi2_selector.fit_transform(X[['sex','embarked','class','alone','sibsp','parch']], y)

In [122]:
X_train.shape

(712, 7)

In [123]:
# train-test split 
X_train, X_test, y_train, y_test = train_test_split(X_chi2,y, test_size =0.2, random_state=42)

model =SVC(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('SVC Accuracy:', round(accuracy_score(y_test, y_pred),3))

SVC Accuracy: 0.765


### 1.3 ANOVA 

In [124]:
selector = SelectKBest(score_func=f_classif, k=2)
X_anova = selector.fit_transform(X[['sex','embarked','class','alone','sibsp','parch']], y)

print('Original number of features:', X.shape)
print('Reduced number of features:', X_anova.shape)

Original number of features: (891, 7)
Reduced number of features: (891, 2)


In [125]:
# train-test split 
X_train, X_test, y_train, y_test = train_test_split(X_anova,y, test_size =0.2, random_state=42)

# Initialize and train the Support model
model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('SVC Accuracy:', round(accuracy_score(y_test, y_pred),3))

SVC Accuracy: 0.765


## 2. Wrapper Method

In [126]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

model = LogisticRegression(random_state = 42)
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X, y)

rfe_selected = pd.Series(rfe.support_, index=X.columns)
print("RFE Selected Features:\n", rfe_selected[rfe_selected == True].index.tolist())


RFE Selected Features:
 ['sex', 'class']


In [127]:
X_train, X_test, y_train, y_test = train_test_split(X[['sex','class']],y, test_size =0.2, random_state=42)

model = LogisticRegression(random_state = 42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('SVC Accuracy:', round(accuracy_score(y_test, y_pred),3))

SVC Accuracy: 0.782


## 3. Embedded Method

In [128]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X, y)

rf_importance = pd.Series(rf.feature_importances_, index=X.columns)
print("Top Features from Random Forest:\n", rf_importance.sort_values(ascending=False).head())


Top Features from Random Forest:
 fare     0.402572
sex      0.324042
class    0.105457
sibsp    0.054703
parch    0.052441
dtype: float64


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X[['fare','sex','class']],y, test_size =0.2, random_state=42)

model = RandomForestClassifier(random_state = 42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('Random Forest Accuracy:', round(accuracy_score(y_test, y_pred),3))

SVC Accuracy: 0.816


| **Method**                              | **Type**                           | **Use When…**                                                                 | **Best For**                                  |
| --------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------- |
| **Correlation**                         | Filter                             | You want to remove multicollinearity or quickly assess relationships          | Any problem (classification/regression)       |
| **Chi-Square**                          | Filter                             | Target is categorical, and features are non-negative and categorical/discrete | Classification                                |
| **ANOVA F-test**                        | Filter                             | Target is categorical, and features are continuous                            | Classification                                |
| **RFE (Recursive Feature Elimination)** | Wrapper                            | You want high accuracy, don't mind slower training                            | Small to medium datasets                      |
| **Tree-based (Random Forest, XGBoost)** | Embedded                           | You want fast, accurate feature ranking                                       | Any problem (auto-handles nonlinearity)       |
