In [None]:
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# Locations of the competition CSV files in the Kaggle input directory
data_path = '/kaggle/input/binary-classification-competition/train.csv'
test_data_path = '/kaggle/input/binary-classification-competition/test.csv'

# Read the training table first, then the test table
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (data_path, test_data_path)
)

# Test-set inputs: every column of the test table except 'Unnamed: 0'
X_test = test_data.drop('Unnamed: 0', axis=1)

# Display basic information about the data.
# DataFrame.info() writes its report to a stream and returns None, so the
# report is captured through a text buffer; printing the return value of
# info() would only emit the word "None".
info_buffer = io.StringIO()
train_data.info(buf=info_buffer)
train_data_info = info_buffer.getvalue()
train_data_head = train_data.head()

print(train_data_info)
print(train_data_head)

# Apply seaborn's "whitegrid" style
sns.set_style("whitegrid")

# Bar chart with the number of training rows per value of 'label'
plt.figure(figsize=(6, 4))
label_axes = sns.countplot(data=train_data, x='label')
label_axes.set_title('Distribution of Target Variable (label)')
label_axes.set_xlabel('Label')
label_axes.set_ylabel('Count')
# NOTE: plt.show() is not called here.

# Number of training rows for each distinct value of 'label'
label_counts = train_data['label'].value_counts()
print(label_counts)

# Summary statistics of the feature columns only
# ('Unnamed: 0' and 'label' are left out)
feature_columns = train_data.drop(['Unnamed: 0', 'label'], axis=1)
print(feature_columns.describe())


# Features whose distributions are visualized
selected_columns = ['column_a', 'column_c', 'column_d', 'column_e', 'column_f']

# One histogram (30 bins, KDE overlay) per selected feature, laid out on a
# 2x3 grid of subplots inside a single figure
plt.figure(figsize=(15, 10))

for position, column_name in enumerate(selected_columns, start=1):
    panel = plt.subplot(2, 3, position)
    sns.histplot(train_data[column_name], bins=30, kde=True, ax=panel)
    panel.set_title(f'Distribution of {column_name}')
    panel.set_xlabel(column_name)
    panel.set_ylabel('Frequency')

# NOTE: neither plt.tight_layout() nor plt.show() is called here.


# Data preparation: 'label' is the target; the features are all remaining
# columns except 'Unnamed: 0'
y = train_data['label']
X = train_data.drop(['Unnamed: 0', 'label'], axis=1)

# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Model Training and Evaluation: Logistic Regression
logreg_model = LogisticRegression(random_state=42, max_iter=500)
logreg_model.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_train = logreg_model.predict(X_train)
y_pred_val = logreg_model.predict(X_val)

# ROC AUC is a ranking metric: it needs a continuous score per row, not the
# thresholded 0/1 prediction. Column 1 of predict_proba is the probability of
# the class with the greater label.
y_score_train = logreg_model.predict_proba(X_train)[:, 1]
y_score_val = logreg_model.predict_proba(X_val)[:, 1]

# Evaluation metrics on the training split
metrics_train = {
    'Accuracy': accuracy_score(y_train, y_pred_train),
    'Precision': precision_score(y_train, y_pred_train),
    'Recall': recall_score(y_train, y_pred_train),
    'F1 Score': f1_score(y_train, y_pred_train),
    'ROC AUC': roc_auc_score(y_train, y_score_train),
}

# Evaluation metrics on the validation split
metrics_val = {
    'Accuracy': accuracy_score(y_val, y_pred_val),
    'Precision': precision_score(y_val, y_pred_val),
    'Recall': recall_score(y_val, y_pred_val),
    'F1 Score': f1_score(y_val, y_pred_val),
    'ROC AUC': roc_auc_score(y_val, y_score_val),
}

print(metrics_train)
print(metrics_val)


# Initialize the model for Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_train_dt = dt_model.predict(X_train)
y_pred_val_dt = dt_model.predict(X_val)

# ROC AUC is a ranking metric: score it with the predicted probability of the
# class with the greater label (column 1) instead of the 0/1 prediction.
y_score_train_dt = dt_model.predict_proba(X_train)[:, 1]
y_score_val_dt = dt_model.predict_proba(X_val)[:, 1]

# Evaluation on the training split
metrics_train_dt = {
    'Accuracy': accuracy_score(y_train, y_pred_train_dt),
    'Precision': precision_score(y_train, y_pred_train_dt),
    'Recall': recall_score(y_train, y_pred_train_dt),
    'F1 Score': f1_score(y_train, y_pred_train_dt),
    'ROC AUC': roc_auc_score(y_train, y_score_train_dt),
}

# Evaluation on the validation split
metrics_val_dt = {
    'Accuracy': accuracy_score(y_val, y_pred_val_dt),
    'Precision': precision_score(y_val, y_pred_val_dt),
    'Recall': recall_score(y_val, y_pred_val_dt),
    'F1 Score': f1_score(y_val, y_pred_val_dt),
    'ROC AUC': roc_auc_score(y_val, y_score_val_dt),
}

print(metrics_train_dt, metrics_val_dt)



# Initialize the model for random forest
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_train_rf = rf_model.predict(X_train)
y_pred_val_rf = rf_model.predict(X_val)

# ROC AUC is a ranking metric: score it with the predicted probability of the
# class with the greater label (column 1) instead of the 0/1 prediction.
y_score_train_rf = rf_model.predict_proba(X_train)[:, 1]
y_score_val_rf = rf_model.predict_proba(X_val)[:, 1]

# Evaluation on the training split
metrics_train_rf = {
    'Accuracy': accuracy_score(y_train, y_pred_train_rf),
    'Precision': precision_score(y_train, y_pred_train_rf),
    'Recall': recall_score(y_train, y_pred_train_rf),
    'F1 Score': f1_score(y_train, y_pred_train_rf),
    'ROC AUC': roc_auc_score(y_train, y_score_train_rf),
}

# Evaluation on the validation split
metrics_val_rf = {
    'Accuracy': accuracy_score(y_val, y_pred_val_rf),
    'Precision': precision_score(y_val, y_pred_val_rf),
    'Recall': recall_score(y_val, y_pred_val_rf),
    'F1 Score': f1_score(y_val, y_pred_val_rf),
    'ROC AUC': roc_auc_score(y_val, y_score_val_rf),
}

print(metrics_train_rf, metrics_val_rf)


# Initialize the model for Support Vector Machines
svm_model = SVC(random_state=42, probability=True)

# Train the model
svm_model.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_train_svm = svm_model.predict(X_train)
y_pred_val_svm = svm_model.predict(X_val)

# ROC AUC is a ranking metric: it needs a continuous score per row, not the
# thresholded 0/1 prediction. The signed decision-function values are used as
# scores because they are the quantity predict() itself thresholds.
y_score_train_svm = svm_model.decision_function(X_train)
y_score_val_svm = svm_model.decision_function(X_val)

# Evaluation on the training split
metrics_train_svm = {
    'Accuracy': accuracy_score(y_train, y_pred_train_svm),
    'Precision': precision_score(y_train, y_pred_train_svm),
    'Recall': recall_score(y_train, y_pred_train_svm),
    'F1 Score': f1_score(y_train, y_pred_train_svm),
    'ROC AUC': roc_auc_score(y_train, y_score_train_svm),
}

# Evaluation on the validation split
metrics_val_svm = {
    'Accuracy': accuracy_score(y_val, y_pred_val_svm),
    'Precision': precision_score(y_val, y_pred_val_svm),
    'Recall': recall_score(y_val, y_pred_val_svm),
    'F1 Score': f1_score(y_val, y_pred_val_svm),
    'ROC AUC': roc_auc_score(y_val, y_score_val_svm),
}

print(metrics_train_svm, metrics_val_svm)

# Initialize the model using Neural Networks
nn_model = MLPClassifier(random_state=42, max_iter=500)

# Train the model
nn_model.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_train_nn = nn_model.predict(X_train)
y_pred_val_nn = nn_model.predict(X_val)

# ROC AUC is a ranking metric: score it with the predicted probability of the
# class with the greater label (column 1) instead of the 0/1 prediction.
y_score_train_nn = nn_model.predict_proba(X_train)[:, 1]
y_score_val_nn = nn_model.predict_proba(X_val)[:, 1]

# Evaluation on the training split
metrics_train_nn = {
    'Accuracy': accuracy_score(y_train, y_pred_train_nn),
    'Precision': precision_score(y_train, y_pred_train_nn),
    'Recall': recall_score(y_train, y_pred_train_nn),
    'F1 Score': f1_score(y_train, y_pred_train_nn),
    'ROC AUC': roc_auc_score(y_train, y_score_train_nn),
}

# Evaluation on the validation split
metrics_val_nn = {
    'Accuracy': accuracy_score(y_val, y_pred_val_nn),
    'Precision': precision_score(y_val, y_pred_val_nn),
    'Recall': recall_score(y_val, y_pred_val_nn),
    'F1 Score': f1_score(y_val, y_pred_val_nn),
    'ROC AUC': roc_auc_score(y_val, y_score_val_nn),
}

print(metrics_train_nn, metrics_val_nn)


# Initialize the model for XGBoost
# NOTE(review): use_label_encoder appears to be deprecated in recent xgboost
# releases — confirm against the installed version before removing it.
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_train_xgb = xgb_model.predict(X_train)
y_pred_val_xgb = xgb_model.predict(X_val)

# ROC AUC is a ranking metric: score it with the predicted probability of the
# class with the greater label (column 1) instead of the 0/1 prediction.
y_score_train_xgb = xgb_model.predict_proba(X_train)[:, 1]
y_score_val_xgb = xgb_model.predict_proba(X_val)[:, 1]

# Evaluation on the training split
metrics_train_xgb = {
    'Accuracy': accuracy_score(y_train, y_pred_train_xgb),
    'Precision': precision_score(y_train, y_pred_train_xgb),
    'Recall': recall_score(y_train, y_pred_train_xgb),
    'F1 Score': f1_score(y_train, y_pred_train_xgb),
    'ROC AUC': roc_auc_score(y_train, y_score_train_xgb),
}

# Evaluation on the validation split
metrics_val_xgb = {
    'Accuracy': accuracy_score(y_val, y_pred_val_xgb),
    'Precision': precision_score(y_val, y_pred_val_xgb),
    'Recall': recall_score(y_val, y_pred_val_xgb),
    'F1 Score': f1_score(y_val, y_pred_val_xgb),
    'ROC AUC': roc_auc_score(y_val, y_score_val_xgb),
}

print(metrics_train_xgb, metrics_val_xgb)


# Predict labels for the test set with the SVM model (the model the author
# judged best)
predictions = svm_model.predict(X_test)

# Build the submission table: the test set's 'Unnamed: 0' values, exposed
# under the column name 'ID', next to the predicted label
submission_path = '/kaggle/working/submission.csv'
submission = pd.DataFrame(
    {'Unnamed: 0': test_data['Unnamed: 0'], 'label': predictions}
).rename(columns={'Unnamed: 0': 'ID'})

# Write the file without the DataFrame index
submission.to_csv(submission_path, index=False)