In [11]:
import pandas as pd

# Load the dataset
data = pd.read_csv('heart.csv')


In [12]:
# Example: Create an interaction feature between age and cholesterol level
data['age_chol'] = data['age'] * data['chol']


In [19]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Load the dataset
data = pd.read_csv('heart.csv')

# Create polynomial features for 'age' and 'chol'
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(data[['age', 'chol']])

# Create a DataFrame with polynomial features
feature_names = poly.get_feature_names_out(['age', 'chol'])
poly_features_df = pd.DataFrame(poly_features, columns=feature_names, index=data.index)

# Concatenate polynomial features with the original dataset
data = pd.concat([data, poly_features_df], axis=1)

# Ensure 'chol' is a Series and apply logarithm transformation
if isinstance(data['chol'], pd.Series):
    # Applying log1p transformation
    data['log_chol'] = np.log1p(data['chol'])
else:
    print("Error: 'chol' is not a Series")

# Display the first few rows to verify
print(data.head())


Error: 'chol' is not a Series
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target   age   chol   age^2  age chol   chol^2  
0   2     3       0  52.0  212.0  2704.0   11024.0  44944.0  
1   0     3       0  53.0  203.0  2809.0   10759.0  41209.0  
2   0     3       0  70.0  174.0  4900.0   12180.0  30276.0  
3   1     3       0  61.0  203.0  3721.0   12383.0  41209.0  
4   3     2       0  62.0  294.0  3844.0   18228.0  86436.0  


In [20]:
import pandas as pd
import numpy as np

# Load your data
data = pd.read_csv('heart.csv')

# Check the columns to ensure 'chol' exists and is not duplicated
print(data.columns)

# Drop duplicate 'chol' columns if necessary
data = data.loc[:, ~data.columns.duplicated()]

# Apply the logarithm transformation to the 'chol' column
# Ensure that 'chol' column is numeric and handle cases where it might have zero or negative values
data['log_chol'] = np.log1p(data['chol'])

# Display the first few rows to verify the transformation
print(data.head())


Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  log_chol  
0   2     3       0  5.361292  
1   0     3       0  5.318120  
2   0     3       0  5.164786  
3   1     3       0  5.318120  
4   3     2       0  5.686975  


In [27]:
from sklearn.preprocessing import PolynomialFeatures

# Initialize PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)

# Generate polynomial features based on 'age' and 'log_chol'
poly_features = poly.fit_transform(data[['age', 'log_chol']])

# Create a DataFrame for polynomial features
# Generate feature names manually
feature_names = []
for i in range(poly_features.shape[1]):
    feature_names.append(f'feature_{i+1}')

# Create DataFrame with generated feature names
poly_features_df = pd.DataFrame(poly_features, columns=feature_names)

# Concatenate with the original data
data = pd.concat([data, poly_features_df], axis=1)


In [28]:
from sklearn.ensemble import RandomForestClassifier

# Define the target variable and features
X = data.drop(columns=['target'])
y = data['target']

# Initialize and fit the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame for feature importance
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(importance_df)


         Feature  Importance
2             cp    0.096868
12          thal    0.090310
11            ca    0.089581
9        oldpeak    0.067533
7        thalach    0.065853
8          exang    0.045279
25     feature_7    0.035855
29    feature_11    0.033145
28    feature_10    0.032632
3       trestbps    0.032422
26     feature_8    0.030568
17  age log_chol    0.030324
10         slope    0.029709
1            sex    0.027236
4           chol    0.020526
18    log_chol^2    0.019974
21     feature_3    0.019837
15      log_chol    0.019043
22     feature_4    0.019034
32    feature_14    0.018296
27     feature_9    0.017836
13      log_chol    0.017722
0            age    0.016056
30    feature_12    0.016053
24     feature_6    0.016032
31    feature_13    0.015878
20     feature_2    0.014397
14           age    0.013713
19     feature_1    0.013474
23     feature_5    0.012853
16         age^2    0.010979
6        restecg    0.006544
5            fbs    0.004439


In [29]:
from sklearn.decomposition import PCA

# Initialize PCA and fit_transform on features
pca = PCA(n_components=0.95)  # 95% variance explained
X_pca = pca.fit_transform(X)

# Create a DataFrame for PCA components
pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])

# Concatenate with the original target
data_pca = pd.concat([pca_df, y.reset_index(drop=True)], axis=1)

print(data_pca.head())


           PC1  target
0  -684.395489       0
1  -474.309496       0
2  3710.898063       0
3  1352.131543       0
4  1602.173531       0


In [30]:
# Select top N important features
top_features = importance_df.head(10)['Feature'].tolist()
X_filtered = X[top_features]

# Fit a model with selected features
model.fit(X_filtered, y)


In [31]:
# Use the PCA-transformed data
X_pca_filtered = X_pca  # Already filtered by variance explained

# Fit a model with PCA features
model.fit(X_pca_filtered, y)


In [32]:
data.to_csv('heart_with_features.csv', index=False)
