# Feature Selection for Linear Regression, Logistic Regression, Decision Tree and Random Forest using the Titanic Dataset

# RFECV

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

# Assumes the feature DataFrame X and the target y already exist in the kernel
# (they are built by the data-loading cells further down in this notebook).
estimator = LinearRegression()
selector = RFECV(estimator)
selector.fit(X, y)

# `support_` is a boolean mask over X's columns: every True entry is a feature
# RFECV kept. Taking argmax() of the mask would only return the first kept
# column in column order, not a "best" one, so report the whole selected set.
selected_features = list(X.columns[selector.support_])
print("Selected features for supervised learning:", selected_features)


# Decision tree feature importance (impurity-based: entropy or Gini)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Assumes the feature DataFrame X and the target y already exist in the kernel.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can make the
# reported top feature change from run to run.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)  # 'gini' can also be used
model.fit(X, y)

# Position of the largest impurity-based importance, mapped back to its column name
best_feature_index = model.feature_importances_.argmax()
best_feature = X.columns[best_feature_index]
print("Best feature for decision tree:", best_feature)


# Filter-based (SelectKBest)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Titanic dataset (you can download it from https://www.kaggle.com/c/titanic/data)
# Make sure to replace the path below with the location of your own copy of 'train.csv'.
data = pd.read_csv(r'C:\Users\Ankit\Desktop\All folders\BrainyBeam tasks\train.csv')

# Preprocessing: fill missing values.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form emits a
# FutureWarning in pandas 2.x and no longer modifies `data` under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['Cabin'] = data['Cabin'].fillna('Unknown')

# Encode the two categorical columns as integers (the encoder is refit for each column)
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

# Select features and target variable
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = data['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Filter step: score every feature with the ANOVA F-test (f_classif) and keep the top k.
# The selector is fitted on the training split only and then applied to the test split.
k_best = SelectKBest(score_func=f_classif, k=3)  # Choose the number of top features (e.g., 3)
X_train_new = k_best.fit_transform(X_train, y_train)
X_test_new = k_best.transform(X_test)

# Train a logistic regression model on the selected features
logistic_model = LogisticRegression()
logistic_model.fit(X_train_new, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test_new)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Logistic Regression model: {accuracy:.2f}')


Accuracy of the Logistic Regression model: 0.78


# Wrapper (RFE)

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Titanic dataset (you can download it from https://www.kaggle.com/c/titanic/data)
# Make sure to replace the path below with the location of your own copy of 'train.csv'.
data = pd.read_csv(r'C:\Users\Ankit\Desktop\All folders\BrainyBeam tasks\train.csv')

# Preprocessing: fill missing values.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form emits a
# FutureWarning in pandas 2.x and no longer modifies `data` under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['Cabin'] = data['Cabin'].fillna('Unknown')

# Encode the two categorical columns as integers (the encoder is refit for each column)
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

# Select features and target variable
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = data['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
logistic_model = LogisticRegression()

# Wrapper step: recursive feature elimination driven by the logistic regression model
rfe = RFE(estimator=logistic_model, n_features_to_select=3)  # Choose the number of features to select
rfe.fit(X_train, y_train)

# Names of the columns RFE kept (support_ is a boolean mask over X_train's columns)
selected_features = X_train.columns[rfe.support_]

# Train a logistic regression model on the selected features
logistic_model.fit(X_train[selected_features], y_train)

# Make predictions on the test set using the model with selected features
y_pred = logistic_model.predict(X_test[selected_features])

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Logistic Regression model with RFE: {accuracy:.2f}')


Accuracy of the Logistic Regression model with RFE: 0.77


# Hybrid technique

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# NOTE: this cell reuses the already-loaded, already-preprocessed `data` frame
# produced by the preceding cells; it does not read the CSV again.

# Feature matrix / target vector
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[feature_columns]
y = data['Survived']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 1 (filter): keep the 5 columns with the highest ANOVA F-score.
# Fitted on the training split, then applied to both splits.
k_best = SelectKBest(score_func=f_classif, k=5)
k_best.fit(X_train, y_train)
X_train_k_best = k_best.transform(X_train)
X_test_k_best = k_best.transform(X_test)

# Stage 2 (wrapper): RFE with logistic regression narrows those 5 down to 3
logistic_model = LogisticRegression()
rfe = RFE(estimator=logistic_model, n_features_to_select=3)
rfe.fit(X_train_k_best, y_train)
X_train_rfe = rfe.transform(X_train_k_best)
X_test_rfe = rfe.transform(X_test_k_best)

# Final classifier: fit on the 3 surviving columns, then predict the hold-out set
logistic_model.fit(X_train_rfe, y_train)
y_pred = logistic_model.predict(X_test_rfe)

# Hold-out accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Logistic Regression model with hybrid feature selection: {accuracy:.2f}')


Accuracy of the Logistic Regression model with hybrid feature selection: 0.77


# Embedded technique

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Titanic dataset (you can download it from https://www.kaggle.com/c/titanic/data)
# Make sure to replace the path below with the location of your own copy of 'train.csv'.
data = pd.read_csv(r'C:\Users\Ankit\Desktop\All folders\BrainyBeam tasks\train.csv')

# Preprocessing: fill missing values.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form emits a
# FutureWarning in pandas 2.x and no longer modifies `data` under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['Cabin'] = data['Cabin'].fillna('Unknown')

# Encode the two categorical columns as integers (the encoder is refit for each column)
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

# Select features and target variable
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = data['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier (seeded so the importances are reproducible)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the Random Forest model on the training split
rf_classifier.fit(X_train, y_train)

# Embedded selection: the fitted forest exposes one importance value per input column
feature_importances = rf_classifier.feature_importances_

# Print feature importances, paired with their column names
for feature, importance in zip(X.columns, feature_importances):
    print(f'{feature}: {importance:.4f}')

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Random Forest model: {accuracy:.2f}')


Pclass: 0.0870
Sex: 0.2714
Age: 0.2500
SibSp: 0.0537
Parch: 0.0399
Fare: 0.2650
Embarked: 0.0330
Accuracy of the Random Forest model: 0.82


In [20]:
# Display the raw importance array from the fitted random forest
# (values are in the same order as X.columns).
feature_importances

array([0.08695697, 0.2714104 , 0.24999525, 0.05368549, 0.03989733,
       0.26501031, 0.03304424])