### Load dataset split dataset into train and CV

In [41]:
!pip install -q catboost
import os
import numpy as np
import pandas as pd
from sklearn.metrics import  accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [42]:
BASE_PATH = "/content/drive/MyDrive/DSN Quali"

train_set = pd.read_csv(os.path.join( BASE_PATH, "train-03-06.csv"))
test_set = pd.read_csv(os.path.join( BASE_PATH, "test-03-06.csv"))

### Feature engineering

In [43]:
train_set['Age_MaxHR'] = train_set['Age'] * train_set['thalach']
train_set['Cholesterol_RestingBP'] = train_set['chol'] * train_set['trestbps']
train_set['Age_RestingBP'] = train_set['Age'] * train_set['trestbps']

train_set['BP_Cholesterol_Ratio'] = train_set['trestbps'] / train_set['chol']
train_set['HR_Age_Ratio'] = train_set['thalach'] / train_set['Age']

train_set['Cardio_Load'] = train_set['trestbps'] + train_set['thalach']

train_set['Log_Cholesterol'] = np.log1p(train_set['chol'])
train_set['Log_Oldpeak'] = np.log1p(train_set['oldpeak'])

train_set['sqrt_MaxHR'] = np.sqrt(train_set['thalach'])

train_set['HR_Reserve'] = 220 - train_set['Age'] - train_set['thalach']


In [44]:
# feature engineering on  test set
test_set['Age_MaxHR'] = test_set['Age'] * test_set['thalach']
test_set['Cholesterol_RestingBP'] = test_set['chol'] * test_set['trestbps']
test_set['Age_RestingBP'] = test_set['Age'] * test_set['trestbps']

test_set['BP_Cholesterol_Ratio'] = test_set['trestbps'] / test_set['chol']
test_set['HR_Age_Ratio'] = test_set['thalach'] / test_set['Age']

test_set['Cardio_Load'] = test_set['trestbps'] + test_set['thalach']

test_set['Log_Cholesterol'] = np.log1p(test_set['chol'])
test_set['Log_Oldpeak'] = np.log1p(test_set['oldpeak'])

test_set['sqrt_MaxHR'] = np.sqrt(test_set['thalach'])

test_set['HR_Reserve'] = 220 - test_set['Age'] - test_set['thalach']

In [45]:
#test_set.columns = train_set.columns[:-1]
cat_variables = ["age_clusters",
                 "trestbps_clusters",
                 "chol_clusters",
                 "thalach_clusters",
                 "restecg",
                 "cp",
                 "fbs",
                 "exang",
                 "slope",
                 "ca",
                 "thal"
                 ]
train_set = pd.get_dummies(data = train_set,
                         prefix = cat_variables,
                         columns = cat_variables)
test_set = pd.get_dummies(data = test_set,
                         prefix = cat_variables,
                         columns = cat_variables)

### Get important features

In [46]:
top_10_positives = ['exang_1', 'cp_0', 'HR_Reserve', 'thalach', 'Sex', 'oldpeak', 'thal_1', 'ca_0', 'Age', 'Log_Oldpeak', 'Id', 'target']

train_set  =   train_set[top_10_positives]
test_set = test_set[top_10_positives[:-1]]

In [47]:
X_train, X_val, y_train, y_val = train_test_split(train_set.drop(['Id','target'], axis=1), train_set['target'], train_size = .8, random_state=42)

### Algorithms

In [49]:
def logit_reg(X_train, y_train):
    log_reg = LogisticRegression(
        max_iter=100,          # Increase max_iter if you encounter convergence issues
        solver='lbfgs',       # Optimization algorithm
        C=206.913808111479,                # Inverse regularization strength
        penalty='l2',         # Regularization type
        random_state=42       # For reproducibility
    )

    log_reg.fit(X_train, y_train)
    return log_reg

def random_forest(X_train, y_train):
    random_forest = RandomForestClassifier(
        n_estimators= 50,       # Number of trees in the forest
        criterion= 'entropy',     # or 'entropy'
        max_depth= 10,          # Limit the depth of the trees
        min_samples_split=2,     # Minimum number of samples required to split an internal node
        min_samples_leaf=1,      # Minimum number of samples required to be at a leaf node
        random_state=42          # For reproducibility
    )
    random_forest.fit(X_train, y_train)
    return random_forest



### Call all algorithms

In [50]:
# call all defined algorithms into a dictionary1.0
algos = {"logistic": logit_reg(X_train, y_train), "random_rf": random_forest(X_train, y_train)}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
for i in algos:
#    print(algos[i])
    y_pred = algos[i].predict(X_train)
    y_pred_val = algos[i].predict(X_val)

    print(f"{i} Train: {accuracy_score(y_train, y_pred)}  Val: {accuracy_score(y_val, y_pred_val)}")


logistic Train: 0.8195823348168435  Val: 0.8254620123203286
random_rf Train: 0.921773365285861  Val: 0.8234086242299795


### Run prediction

In [52]:
#test_set.columns = train_set.columns[:-1]
#test_set = test_set[top_10_positives[:-1]]
for i in algos:
    y_pred = algos[i].predict(test_set.drop(['Id'], axis=1))

    submission = pd.DataFrame({'Id': test_set['Id'], 'target': y_pred})
    submission.to_csv(f"submission_{i}.csv", index=False)
