<a href="https://colab.research.google.com/github/Bhuvana1797/Data-Analysis-with-python-/blob/main/Python_DataAnalysis_Internshp%20TASK-5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset loaded later in this notebook is read from
# /content/hearts.csv, not from a path under this mount point — confirm
# whether the mount is still required.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score



In [15]:
# Read the source CSV into the working DataFrame `df` that every later
# cell builds on.
df = pd.read_csv(filepath_or_buffer='/content/hearts.csv')



In [16]:
# 1. Feature Generation
#
# Adds four derived columns to `df`, then builds the feature matrix `X`,
# the target vector `y`, and an 80/20 train/test split.

# Age bands: (0, 30] -> young, (30, 50] -> middle-aged, above 50 -> senior.
# The top bin is open-ended so that values above the previous upper edge
# (100) are binned instead of silently becoming NaN.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 30, 50, float('inf')],
    labels=['young', 'middle-aged', 'senior'],
)

# Cholesterol bands: (0, 200] -> normal, (200, 240] -> high, above 240 -> very high.
# Open-ended for the same reason: readings above the previous edge (600)
# would otherwise turn into NaN and one-hot encode to an all-zero row.
df['cholesterol_category'] = pd.cut(
    df['chol'],
    bins=[0, 200, 240, float('inf')],
    labels=['normal', 'high', 'very high'],
)

# Difference between `thalach` and `trestbps`.
# NOTE(review): if `thalach` is a heart rate and `trestbps` a blood pressure
# (as the `bp_ratio` name below suggests), this is not a heart-rate reserve
# in the usual sense (max HR - resting HR) — confirm the intended definition.
# The column name is kept so downstream output stays comparable.
df['heart_rate_reserve'] = df['thalach'] - df['trestbps']

# Ratio of `trestbps` to `thalach`; built from the same two columns as
# `heart_rate_reserve`, so the two features are strongly related.
df['bp_ratio'] = df['trestbps'] / df['thalach']

# Define features (X) and target (y)
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the test accuracy recorded further down is near-perfect,
# which often means duplicated rows ended up on both sides of the split —
# check `df.duplicated().sum()` before trusting those scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [19]:
# 2. Feature Selection using Recursive Feature Elimination (RFE)

# Random Forest used both as the estimator inside RFE and, further down,
# as the classifier trained on the selected features.
rf_model = RandomForestClassifier(random_state=42)

# RFE removes one feature per iteration (step=1) until 5 remain.
rfe = RFE(estimator=rf_model, n_features_to_select=5, step=1)

# One-hot encode the categorical columns. The training frame defines the
# column layout; the test frame is then aligned to exactly those columns
# (missing dummies filled with 0, extras dropped). Encoding the two frames
# independently can yield different column sets, which would make
# `rfe.transform(X_test)` fail or silently misalign features.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Fit RFE on the training data only, so the test set does not influence
# which features are kept.
rfe.fit(X_train, y_train)

# `rfe.support_` is a boolean mask over the training columns.
selected_features = X_train.columns[rfe.support_]
print(f"Selected Features by RFE: {selected_features}")

# Reduce both splits to the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train the model using the selected features
rf_model.fit(X_train_rfe, y_train)

# Predict on the test set and evaluate
y_pred = rf_model.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with RFE: {accuracy}")

Selected Features by RFE: Index(['age', 'cp', 'thalach', 'oldpeak', 'heart_rate_reserve'], dtype='object')
Test Accuracy with RFE: 0.9853658536585366


In [20]:
# 3. Optional: Apply PCA on the selected features

# Standardize the RFE-selected features. The scaler is fitted on the
# training split only and then applied to the test split.
scaler = StandardScaler()
X_train_rfe_scaled = scaler.fit_transform(X_train_rfe)
X_test_rfe_scaled = scaler.transform(X_test_rfe)

# Project the scaled features onto 3 principal components
# (adjust n_components based on explained variance).
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_rfe_scaled)
X_test_pca = pca.transform(X_test_rfe_scaled)

# Use a dedicated classifier for the PCA features instead of re-fitting
# `rf_model`: re-fitting would overwrite the RFE-trained model, so
# re-running the previous cell's prediction step afterwards would fail on
# a feature-count mismatch. Same seed as `rf_model`, so the scores are
# unchanged.
rf_model_pca = RandomForestClassifier(random_state=42)
rf_model_pca.fit(X_train_pca, y_train)

# Predict and evaluate
y_pred_pca = rf_model_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"Test Accuracy with RFE + PCA: {accuracy_pca}")

Test Accuracy with RFE + PCA: 1.0
