In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Baseline cell: load the preprocessed data, build the feature matrix and
# target, hold out 20% of the rows for testing, and fit a default
# Gradient Boosting model as a reference point.

# Load the dataset
dataset_path = "/content/Preprocessed_Data (1).csv"  # Replace with the actual path to your dataset
df = pd.read_csv(dataset_path)

# Feature columns and target column used throughout the notebook
X = df[['Sex', 'Age', 'Height', 'Weight', 'Low Income', 'Lower Middle Income', 'Upper Middle Income']]
y = df['Status']  # 'Status' is the target variable

# One-hot encode any non-numeric (object/category) feature columns;
# numeric columns pass through unchanged.
X = pd.get_dummies(X)

# Splitting data into train and test sets (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model 1: Gradient Boosting Classifier
# random_state is fixed because the estimator permutes features at every
# split; without a seed, ties between equally good splits can resolve
# differently from run to run and the reported accuracy is not reproducible.
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train, y_train)
gb_predictions = gb_classifier.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_predictions)
print("Gradient Boosting Classifier Accuracy:", gb_accuracy)




Gradient Boosting Classifier Accuracy: 0.7976190476190477


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning cell: exhaustive grid search (3 x 3 x 3 = 27
# candidates, 5-fold CV each) over the Gradient Boosting settings, using
# only the training split so the test split stays held out.

# Define the parameter grid for Gradient Boosting Classifier
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Instantiate Gradient Boosting Classifier
# Seeded so that every candidate is fitted deterministically; otherwise the
# selected "best" parameters can change between runs of this cell.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Perform grid search (accuracy averaged over 5 cross-validation folds)
grid_search = GridSearchCV(gb_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and best accuracy
# NOTE: best_score_ is the mean cross-validated accuracy on the training
# split, not a test-set score.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Use the best estimator from grid search (refit on all of X_train)
best_gb_classifier = grid_search.best_estimator_

# Evaluate the model on test data
gb_predictions = best_gb_classifier.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_predictions)
print("Gradient Boosting Classifier Accuracy after Hyperparameter Tuning:", gb_accuracy)




Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}
Best Accuracy: 0.8400852878464817
Gradient Boosting Classifier Accuracy after Hyperparameter Tuning: 0.8511904761904762


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Ensemble cell: build a feature-selection -> scaling -> soft-voting
# (Gradient Boosting + XGBoost) pipeline, estimate its accuracy with
# stratified 5-fold CV on the training split, then score it once on the
# held-out test split.

# Feature Scaling
# Exploratory only: X_scaled / X_selected below are computed on the full
# dataset for inspection and are NOT used to train or evaluate the pipeline.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Feature Selection using Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_scaled, y)
feature_importances = rf_classifier.feature_importances_
# Keep features whose importance is at least the mean importance
selected_features = SelectFromModel(rf_classifier, threshold='mean')
X_selected = selected_features.fit_transform(X_scaled, y)

# Ensemble of Gradient Boosting and XGBoost classifiers
gb_classifier = GradientBoostingClassifier(learning_rate=0.05, max_depth=3, n_estimators=50, random_state=42)
xgb_classifier = XGBClassifier(learning_rate=0.05, max_depth=3, n_estimators=50, random_state=42)
# Soft voting averages the predicted class probabilities of both models
voting_classifier = VotingClassifier(estimators=[('gb', gb_classifier), ('xgb', xgb_classifier)], voting='soft')

# Pipeline for feature selection, scaling, and classification
# The selector and scaler are refitted inside the pipeline on whatever data
# the pipeline is fitted with, so preprocessing is learned per training fold.
pipeline = Pipeline([
    ('feature_selection', selected_features),
    ('scaler', scaler),
    ('classification', voting_classifier)
])

# Cross-validation
# Run on the training split only, so the test split plays no part in model
# assessment before the final evaluation below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

# Fit the pipeline on the training split only.
# Fitting on the full dataset (X, y) would include the X_test rows in
# training, and the "test" accuracy would then be measured on data the
# model has already seen.
pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
test_predictions = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)




Cross-Validation Scores: [0.82738095 0.82738095 0.82634731 0.83233533 0.83233533]
Mean Accuracy: 0.8291559737667523
Test Accuracy: 0.8511904761904762
