# In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load Data
# Read the raw CSV and drop the 'id' column so it does not end up in X.
dataset = pd.read_csv("Kidney_data.csv")
dataset = dataset.drop('id', axis=1)

# Step 1: collapse label variants carrying stray tabs/spaces onto their
# canonical spelling so the encoders below only have to know one form.
label_cleanups = {
    'dm': {'\tyes': 'yes', ' yes': 'yes', '\tno': 'no'},
    'cad': {'\tno': 'no'},
    'classification': {'ckd\t': 'ckd'},
}
for column, fixes in label_cleanups.items():
    dataset[column] = dataset[column].replace(to_replace=fixes)

# Step 2: encode the two-valued text columns as 0/1.
normal_abnormal = {'normal': 0, 'abnormal': 1}
absent_present = {'notpresent': 0, 'present': 1}
no_yes = {'yes': 1, 'no': 0}
binary_encodings = {
    'rbc': normal_abnormal,
    'pc': normal_abnormal,
    'pcc': absent_present,
    'ba': absent_present,
    'htn': no_yes,
    'dm': no_yes,
    'cad': no_yes,
    # 'no' is not a valid appetite label here; it is turned into a missing value.
    'appet': {'good': 1, 'poor': 0, 'no': np.nan},
    'pe': no_yes,
    'ane': no_yes,
}
for column, encoding in binary_encodings.items():
    dataset[column] = dataset[column].replace(to_replace=encoding)

# Target: 1 for "ckd", 0 for everything else (including missing labels).
dataset["classification"] = (dataset["classification"] == "ckd").astype("int64")

# Step 3: these columns are parsed as numbers; unparseable entries become NaN.
for column in ('pcv', 'wc', 'rc'):
    dataset[column] = pd.to_numeric(dataset[column], errors='coerce')

# Step 4: fill every remaining gap with the column median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below, so test rows influence the imputed values.
for column in dataset.columns:
    column_median = dataset[column].median()
    dataset[column] = dataset[column].fillna(column_median)

# Features / target and a 70/30 hold-out split.
X = dataset.drop('classification', axis=1)
y = dataset['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
### RANDOM FOREST CLASSIFIER ###

# Deliberately small baseline: a 5-tree forest that only sees 'sg' and 'pc'.
rf_baseline_columns = ['sg', 'pc']
rf_baseline = RandomForestClassifier(n_estimators=5, random_state=42)
rf_baseline.fit(X_train[rf_baseline_columns], y_train)
y_pred_rf_baseline = rf_baseline.predict(X_test[rf_baseline_columns])

# Score the baseline on the hold-out split.
rf_baseline_accuracy = accuracy_score(y_test, y_pred_rf_baseline)
rf_baseline_cm = confusion_matrix(y_test, y_pred_rf_baseline)
print(f"Random Forest Baseline Model Accuracy: {rf_baseline_accuracy:.2f}")
print("Random Forest Baseline Confusion Matrix:\n", rf_baseline_cm)

# Same confusion matrix, drawn as an annotated heatmap.
sns.heatmap(rf_baseline_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Random Forest Baseline Model Confusion Matrix')
plt.show()

### ADA BOOST CLASSIFIER ###

# Baseline AdaBoost: 5 boosting rounds, restricted to 'sg' and 'pc'.
ada_baseline_columns = ['sg', 'pc']
ada_baseline = AdaBoostClassifier(n_estimators=5, random_state=42)
ada_baseline.fit(X_train[ada_baseline_columns], y_train)
y_pred_ada_baseline = ada_baseline.predict(X_test[ada_baseline_columns])

# Score the baseline on the hold-out split.
ada_baseline_accuracy = accuracy_score(y_test, y_pred_ada_baseline)
ada_baseline_cm = confusion_matrix(y_test, y_pred_ada_baseline)
print(f"AdaBoost Baseline Model Accuracy: {ada_baseline_accuracy:.2f}")
print("AdaBoost Baseline Confusion Matrix:\n", ada_baseline_cm)

# Same confusion matrix, drawn as an annotated heatmap.
sns.heatmap(ada_baseline_cm, annot=True, fmt='d', cmap='Blues')
plt.title('AdaBoost Baseline Model Confusion Matrix')
plt.show()

### GRADIENT BOOSTING CLASSIFIER ###

# Baseline Gradient Boosting: 5 boosting stages, restricted to 'sg' and 'pc'.
gb_baseline_columns = ['sg', 'pc']
gb_baseline = GradientBoostingClassifier(n_estimators=5, random_state=42)
gb_baseline.fit(X_train[gb_baseline_columns], y_train)
y_pred_gb_baseline = gb_baseline.predict(X_test[gb_baseline_columns])

# Score the baseline on the hold-out split.
gb_baseline_accuracy = accuracy_score(y_test, y_pred_gb_baseline)
gb_baseline_cm = confusion_matrix(y_test, y_pred_gb_baseline)
print(f"Gradient Boosting Baseline Model Accuracy: {gb_baseline_accuracy:.2f}")
print("Gradient Boosting Baseline Confusion Matrix:\n", gb_baseline_cm)

# Same confusion matrix, drawn as an annotated heatmap.
sns.heatmap(gb_baseline_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Gradient Boosting Baseline Model Confusion Matrix')
plt.show()
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# Baseline Model (SVM without feature selection or scaling)
# Default SVC trained on every column of the unscaled training split.
svm_baseline = SVC(random_state=42)
svm_baseline.fit(X_train, y_train)
y_pred_svm_baseline = svm_baseline.predict(X_test)

# Baseline SVM Model Accuracy and Confusion Matrix
svm_baseline_accuracy = accuracy_score(y_test, y_pred_svm_baseline)
svm_baseline_cm = confusion_matrix(y_test, y_pred_svm_baseline)
print(f"SVM Baseline Model Accuracy: {svm_baseline_accuracy:.2f}")
print("SVM Baseline Confusion Matrix:\n", svm_baseline_cm)

# Plot Confusion Matrix for Baseline SVM Model
# (previously this re-plotted the Gradient Boosting predictions under a
# Gradient Boosting title, so the SVM matrix was never shown)
sns.heatmap(svm_baseline_cm, annot=True, fmt='d', cmap='Blues')
plt.title('SVM Baseline Model Confusion Matrix')
plt.show()




### K-NEAREST NEIGHBORS (KNN) ###

# Baseline KNN: library defaults, every column, no scaling.
knn_baseline = KNeighborsClassifier()
knn_baseline.fit(X_train, y_train)
y_pred_knn_baseline = knn_baseline.predict(X_test)

# Score the baseline on the hold-out split.
knn_baseline_accuracy = accuracy_score(y_test, y_pred_knn_baseline)
knn_baseline_cm = confusion_matrix(y_test, y_pred_knn_baseline)
print(f"KNN Baseline Model Accuracy: {knn_baseline_accuracy:.2f}")
print("KNN Baseline Confusion Matrix:\n", knn_baseline_cm)

# Same confusion matrix, drawn as an annotated heatmap.
sns.heatmap(knn_baseline_cm, annot=True, fmt='d', cmap='Blues')
plt.title('KNN Baseline Model Confusion Matrix')
plt.show()
# Bar chart comparing the hold-out accuracy of the five baseline models.
baseline_scores = {
    'RF Baseline': rf_baseline_accuracy,
    'GB Baseline': gb_baseline_accuracy,
    'SVM Baseline': svm_baseline_accuracy,
    'KNN Baseline': knn_baseline_accuracy,
    'AdaBoost Baseline': ada_baseline_accuracy,
}
models_baseline = list(baseline_scores.keys())
accuracies_baseline = list(baseline_scores.values())

plt.figure(figsize=(10, 6))
sns.barplot(x=models_baseline, y=accuracies_baseline, palette='Set2')
plt.ylim(0, 1)  # show the full accuracy range
plt.title('Accuracy Comparison Across Baseline Models')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.show()

# Importing Libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Show every column when a DataFrame is displayed (None removes the limit).
# Uses the public pd.set_option entry point rather than pd.pandas.set_option.
pd.set_option('display.max_columns', None)
# Reading Dataset:
# Reloads the raw CSV, so every encoding step below starts from the raw file again.
dataset = pd.read_csv("Kidney_data.csv")
# Top 5 records:
dataset.head()
# Dropping unnecessary feature:
dataset = dataset.drop('id', axis=1)
# Shape of dataset:
dataset.shape
# Checking Missing (NaN) Values:
dataset.isnull().sum()

# Description:
dataset.describe()
# Datatypes:
dataset.dtypes
dataset.head()
# NOTE: bare expressions such as the ones above only display something in an
# interactive/notebook session; run as a script they are evaluated and discarded.

# --- Encode two-valued text columns as 0/1 (value_counts() is inspection only) ---
dataset['rbc'].value_counts()
dataset['rbc'] = dataset['rbc'].replace(to_replace = {'normal' : 0, 'abnormal' : 1})
dataset['pc'].value_counts()
dataset['pc'] = dataset['pc'].replace(to_replace = {'normal' : 0, 'abnormal' : 1})


dataset['pcc'].value_counts()
# 'data' is bound only for inspection; it is not read again in the visible part of this script.
data=dataset['pcc']
data
dataset['pcc'] = dataset['pcc'].replace(to_replace = {'notpresent':0,'present':1})
dataset['ba'].value_counts()

dataset['ba'] = dataset['ba'].replace(to_replace = {'notpresent':0,'present':1})
data=dataset['ba']
data
dataset['htn'].value_counts()
dataset['htn'] = dataset['htn'].replace(to_replace = {'yes' : 1, 'no' : 0})
data=dataset['htn']
data
dataset['dm'].value_counts()

# Normalise label variants with a leading tab/space before encoding.
dataset['dm'] = dataset['dm'].replace(to_replace = {'\tyes':'yes', ' yes':'yes', '\tno':'no'})

dataset['dm'] = dataset['dm'].replace(to_replace = {'yes' : 1, 'no' : 0})

dataset['cad'].value_counts()

# Normalise the tab-prefixed 'no' variant before encoding.
dataset['cad'] = dataset['cad'].replace(to_replace = {'\tno':'no'})

dataset['cad'] = dataset['cad'].replace(to_replace = {'yes' : 1, 'no' : 0})

dataset['appet'].unique()
# 'no' is mapped to NaN (treated as missing) rather than to one of the two classes.
dataset['appet'] = dataset['appet'].replace(to_replace={'good':1,'poor':0,'no':np.nan})
dataset['pe'].value_counts()
dataset['pe'] = dataset['pe'].replace(to_replace = {'yes' : 1, 'no' : 0})
dataset['ane'].value_counts()
dataset['ane'] = dataset['ane'].replace(to_replace = {'yes' : 1, 'no' : 0})
dataset['classification'].value_counts()
# Fold the tab-suffixed 'ckd\t' label into 'ckd', then encode the target:
# 1 for "ckd", 0 for every other value.
dataset['classification'] = dataset['classification'].replace(to_replace={'ckd\t':'ckd'})
dataset["classification"] = [1 if i == "ckd" else 0 for i in dataset["classification"]]
dataset.head()
dataset.dtypes
# Convert these columns to numbers; entries that cannot be parsed become NaN.
dataset['pcv'] = pd.to_numeric(dataset['pcv'], errors='coerce')
dataset['wc'] = pd.to_numeric(dataset['wc'], errors='coerce')
dataset['rc'] = pd.to_numeric(dataset['rc'], errors='coerce')
dataset.dtypes
dataset.describe()
# Checking Missing (NaN) Values:
dataset.isnull().sum().sort_values(ascending=False)
dataset.columns
# Every column except the target; each one is imputed with its own median.
features = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
           'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
           'appet', 'pe', 'ane']
# NOTE(review): medians are taken over the whole dataset, before any train/test split.
for feature in features:
    dataset[feature] = dataset[feature].fillna(dataset[feature].median())
# Number of columns that still contain a NaN after imputation.
dataset.isnull().any().sum()
# Pairwise correlation of all columns, drawn as an annotated heatmap.
plt.figure(figsize=(24,14))
sns.heatmap(dataset.corr(), annot=True, cmap='YlGnBu')
plt.show()

# Remove 'pcv' from the frame (and therefore from the features built below).
# NOTE(review): presumably dropped because of the correlations above - confirm.
dataset.drop('pcv', axis=1, inplace=True)
dataset.head()

# Class balance of the target. The series is passed as x= explicitly: newer
# seaborn releases bind the first positional argument to `data`, not `x`.
sns.countplot(x=dataset['classification'])
plt.show()

# Independent and Dependent Feature:
# The target is the last column; everything before it is a feature.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
X.head()
# Feature Importance:
# Fit an ExtraTrees ensemble on all features and rank them by importance.
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X, y)

# Importance per feature name; horizontal bars for (up to) the 24 largest.
ranked_features = pd.Series(model.feature_importances_, index=X.columns)
plt.figure(figsize=(8, 6))
top_ranked = ranked_features.nlargest(24)
top_ranked.plot(kind='barh')
plt.show()
ranked_features.nlargest(8).index

# From here on only these eight hand-listed columns are used as features.
# NOTE(review): the classifier above is unseeded, so its ranking can vary
# between runs, while this list is fixed.
chosen_columns = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']
X = dataset[chosen_columns]
X.head()
X.tail()
y.head()
# Train Test Split:
from sklearn.model_selection import train_test_split
# Importing Performance Metrics:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Classifiers evaluated in this section:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# 70/30 hold-out split of the selected features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
for split_part in (X_train, X_test):
    print(split_part.shape)

# (label printed before the accuracy, unfitted estimator), in evaluation order:
# RandomForest, AdaBoost, GradientBoosting, linear-kernel SVM.
candidates = [
    ('Accuracy:', RandomForestClassifier()),
    ('Accuracy:', AdaBoostClassifier()),
    ('Accuracy:', GradientBoostingClassifier()),
    ('SVM Accuracy:', SVC(kernel='linear')),
]

# Fit each estimator, predict on the hold-out split and print its
# accuracy, confusion matrix and classification report.
fitted_models = []
for accuracy_label, estimator in candidates:
    fitted_models.append(estimator.fit(X_train, y_train))
    y_pred = estimator.predict(X_test)
    print(accuracy_label, accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Expose the fitted estimators under the names used elsewhere in the script.
RandomForest, AdaBoost, GradientBoost, SVM = fitted_models

# # KNN Classifier:
# from sklearn.neighbors import KNeighborsClassifier
# KNN = KNeighborsClassifier(n_neighbors=5)  
# KNN = KNN.fit(X_train, y_train)

# # Predictions:
# y_pred = KNN.predict(X_test)

# # Performance:
# print('KNN Accuracy:', accuracy_score(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

# # Collecting accuracies of all models
# models = ['RandomForest', 'AdaBoost', 'GradientBoost', 'SVM', 'KNN']
# accuracies = [accuracy_score(y_test, RandomForest.predict(X_test)),
#               accuracy_score(y_test, AdaBoost.predict(X_test)),
#               accuracy_score(y_test, GradientBoost.predict(X_test)),
#               accuracy_score(y_test, SVM.predict(X_test)),
#               accuracy_score(y_test, KNN.predict(X_test))]

# # Plotting the results
# plt.figure(figsize=(10,6))
# sns.barplot(x=models, y=accuracies, palette="viridis")
# plt.title('Comparison of Model Accuracies')
# plt.ylabel('Accuracy')
# plt.show()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
# print(f"Baseline Model Accuracy 1: {baseline_accuracy:.2f}")
# print("Baseline Confusion Matrix:\n", confusion_matrix(y_test, y_pred_baseline))


# selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train[selected_features])
# X_test_scaled = scaler.transform(X_test[selected_features])

# # RandomForest Classifier with optimized features and scaling
# rf_optimized = RandomForestClassifier(random_state=42)
# rf_optimized.fit(X_train_scaled, y_train)
# y_pred_optimized = rf_optimized.predict(X_test_scaled)
# optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
# print(f"Optimized Model Accuracy: {optimized_accuracy:.2f}")
# print("Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_optimized))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# X (feature matrix) and y (target) must already exist at this point; they are
# not loaded here.

# 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### BASELINE MODEL ###

# Seeded RandomForest fitted on the unscaled training split.
baseline_model = RandomForestClassifier(random_state=42)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

# Hold-out metrics for the baseline.
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_confusion = confusion_matrix(y_test, y_pred_baseline)
baseline_report = classification_report(y_test, y_pred_baseline)
print(f"Baseline Model Accuracy: {baseline_accuracy:.2f}")
print("Baseline Confusion Matrix:\n", baseline_confusion)
print("Baseline Classification Report:\n", baseline_report)

### OPTIMIZED FEATURES & SCALING ###

# Columns fed to the "optimized" model.
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']

# Standardise those columns; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train[selected_features])
X_train_scaled = scaler.transform(X_train[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])

# Seeded RandomForest trained on the scaled columns.
rf_optimized = RandomForestClassifier(random_state=42)
rf_optimized.fit(X_train_scaled, y_train)
y_pred_optimized = rf_optimized.predict(X_test_scaled)

# Hold-out metrics for the optimized model.
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_confusion = confusion_matrix(y_test, y_pred_optimized)
optimized_report = classification_report(y_test, y_pred_optimized)
print(f"Optimized Model Accuracy: {optimized_accuracy:.2f}")
print("Optimized Confusion Matrix:\n", optimized_confusion)
print("Optimized Classification Report:\n", optimized_report)

### OTHER MODELS ###

# Four further classifiers, all trained on the unscaled training split.
AdaBoost = AdaBoostClassifier(random_state=42)
GradientBoost = GradientBoostingClassifier(random_state=42)
SVM = SVC(kernel='linear', random_state=42)
KNN = KNeighborsClassifier(n_neighbors=5)
for estimator in (AdaBoost, GradientBoost, SVM, KNN):
    estimator.fit(X_train, y_train)

# Hold-out predictions per model.
y_pred_adaboost = AdaBoost.predict(X_test)
y_pred_gradientboost = GradientBoost.predict(X_test)
y_pred_svm = SVM.predict(X_test)
y_pred_knn = KNN.predict(X_test)

# Accuracy per model; the RandomForest entry reuses the baseline predictions
# (y_pred_baseline) that already exist at this point.
predictions_by_model = {
    'RandomForest': y_pred_baseline,
    'AdaBoost': y_pred_adaboost,
    'GradientBoost': y_pred_gradientboost,
    'SVM': y_pred_svm,
    'KNN': y_pred_knn,
}
models = list(predictions_by_model)
accuracies = [accuracy_score(y_test, predicted) for predicted in predictions_by_model.values()]

# Bar chart of the five accuracies.
plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=accuracies, palette="viridis")
plt.title('Comparison of Model Accuracies')
plt.ylabel('Accuracy')
plt.show()


# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# RandomForest: baseline (raw features) vs. "optimized" (standardised features).
# X and y are the feature matrix and target built earlier in this script.

# One shared 70/30 split for both models. Previously the baseline was scored
# on a random_state=33 split and the optimized model on a random_state=42
# split, so the two accuracies were measured on different test rows and were
# not comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline RandomForest model
rf_baseline = RandomForestClassifier(random_state=42)
rf_baseline.fit(X_train, y_train)

# Baseline predictions
y_pred_baseline = rf_baseline.predict(X_test)

# Baseline performance
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Model Accuracy: {baseline_accuracy:.2f}")
print("Baseline Confusion Matrix:\n", confusion_matrix(y_test, y_pred_baseline))
print("Baseline Classification Report:\n", classification_report(y_test, y_pred_baseline))

# Columns used by the optimized model
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']

# Scale selected features (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])

# Optimized RandomForest model
rf_optimized = RandomForestClassifier(random_state=42)
rf_optimized.fit(X_train_scaled, y_train)

# Optimized predictions
y_pred_optimized = rf_optimized.predict(X_test_scaled)

# Optimized performance
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
print(f"Optimized Model Accuracy: {optimized_accuracy:.2f}")
print("Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_optimized))
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_optimized))

### Plotting the Accuracy Comparison
plt.figure(figsize=(8, 6))
models = ['Baseline Model', 'Optimized Model']
accuracies = [baseline_accuracy, optimized_accuracy]

# Plotting the bar chart
sns.barplot(x=models, y=accuracies, palette='viridis')

# Adding accuracy labels on top of the bars
for i, accuracy in enumerate(accuracies):
    plt.text(i, accuracy + 0.001, f'{accuracy:.2f}', ha='center', fontsize=12, fontweight='bold')

# Zoom in on the 0.9-1.05 band, but lower the floor if a bar would otherwise
# fall below it and disappear from the chart.
lowest_accuracy = min(accuracies)
y_floor = 0.9 if lowest_accuracy >= 0.9 else max(0.0, lowest_accuracy - 0.05)
plt.ylim(y_floor, 1.05)

# Title and labels
plt.title('Accuracy Comparison: Baseline vs Optimized RandomForest')
plt.ylabel('Accuracy')
plt.xlabel('Model Type')

# Display the plot
plt.show()


# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Gradient Boosting: baseline (raw features) vs. "optimized" (standardised features).
# X and y are the feature matrix and target built earlier in this script.

# One shared 70/30 split for both models. Previously the baseline was scored
# on a random_state=33 split and the optimized model on a random_state=42
# split, so the two accuracies were measured on different test rows and were
# not comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### BASELINE MODEL ###

# Baseline Gradient Boosting model
gb_baseline = GradientBoostingClassifier(random_state=42)
gb_baseline.fit(X_train, y_train)

# Baseline predictions
y_pred_gb_baseline = gb_baseline.predict(X_test)

# Baseline performance
gb_baseline_accuracy = accuracy_score(y_test, y_pred_gb_baseline)
print(f"Gradient Boosting Baseline Model Accuracy: {gb_baseline_accuracy:.2f}")
print("Gradient Boosting Baseline Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb_baseline))
print("Gradient Boosting Baseline Classification Report:\n", classification_report(y_test, y_pred_gb_baseline))

### OPTIMIZED MODEL ###

# Columns used by the optimized model
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']

# Scale selected features (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])

# Optimized Gradient Boosting model
gb_optimized = GradientBoostingClassifier(random_state=42)
gb_optimized.fit(X_train_scaled, y_train)

# Optimized predictions
y_pred_gb_optimized = gb_optimized.predict(X_test_scaled)

# Optimized performance
gb_optimized_accuracy = accuracy_score(y_test, y_pred_gb_optimized)
print(f"Gradient Boosting Optimized Model Accuracy: {gb_optimized_accuracy:.2f}")
print("Gradient Boosting Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb_optimized))
print("Gradient Boosting Optimized Classification Report:\n", classification_report(y_test, y_pred_gb_optimized))

### Plotting the Accuracy Comparison ###
plt.figure(figsize=(8, 6))
models = ['Gradient Boosting Baseline', 'Gradient Boosting Optimized']
accuracies = [gb_baseline_accuracy, gb_optimized_accuracy]

# Plotting the bar chart
sns.barplot(x=models, y=accuracies, palette='viridis')

# Adding accuracy labels on top of the bars
for i, accuracy in enumerate(accuracies):
    plt.text(i, accuracy + 0.001, f'{accuracy:.2f}', ha='center', fontsize=12, fontweight='bold')

# Zoom in on the 0.9-1.05 band, but lower the floor if a bar would otherwise
# fall below it and disappear from the chart.
lowest_accuracy = min(accuracies)
y_floor = 0.9 if lowest_accuracy >= 0.9 else max(0.0, lowest_accuracy - 0.05)
plt.ylim(y_floor, 1.05)

# Title and labels
plt.title('Accuracy Comparison: Baseline vs Optimized Gradient Boosting')
plt.ylabel('Accuracy')
plt.xlabel('Model Type')

# Display the plot
plt.show()

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# AdaBoost: baseline (50 rounds, learning_rate=0.5, raw features) vs.
# "optimized" (100 rounds, learning_rate=1.0, standardised features).
# X and y are the feature matrix and target built earlier in this script.

# One shared 70/30 split for both models. Previously the baseline was scored
# on a random_state=33 split and the optimized model on a random_state=42
# split, so the two accuracies were measured on different test rows and were
# not comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### BASELINE MODEL ###

# Baseline AdaBoost model
ada_baseline = AdaBoostClassifier(n_estimators=50, learning_rate=0.5, random_state=42)  # Lower learning rate
ada_baseline.fit(X_train, y_train)

# Baseline predictions
y_pred_ada_baseline = ada_baseline.predict(X_test)

# Baseline performance
ada_baseline_accuracy = accuracy_score(y_test, y_pred_ada_baseline)
print(f"AdaBoost Baseline Model Accuracy: {ada_baseline_accuracy:.2f}")
print("AdaBoost Baseline Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ada_baseline))
print("AdaBoost Baseline Classification Report:\n", classification_report(y_test, y_pred_ada_baseline))

### OPTIMIZED MODEL ###

# Columns used by the optimized model
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']

# Scale selected features (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])

# Optimized AdaBoost model
ada_optimized = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)  # Higher learning rate
ada_optimized.fit(X_train_scaled, y_train)

# Optimized predictions
y_pred_ada_optimized = ada_optimized.predict(X_test_scaled)

# Optimized performance
ada_optimized_accuracy = accuracy_score(y_test, y_pred_ada_optimized)
print(f"AdaBoost Optimized Model Accuracy: {ada_optimized_accuracy:.2f}")
print("AdaBoost Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ada_optimized))
print("AdaBoost Optimized Classification Report:\n", classification_report(y_test, y_pred_ada_optimized))

### Plotting the Accuracy Comparison ###
plt.figure(figsize=(8, 6))
models = ['AdaBoost Baseline', 'AdaBoost Optimized']
accuracies = [ada_baseline_accuracy, ada_optimized_accuracy]

# Plotting the bar chart
sns.barplot(x=models, y=accuracies, palette='viridis')

# Adding accuracy labels on top of the bars
for i, accuracy in enumerate(accuracies):
    plt.text(i, accuracy + 0.001, f'{accuracy:.2f}', ha='center', fontsize=12, fontweight='bold')

# Zoom in on the 0.9-1.05 band, but lower the floor if a bar would otherwise
# fall below it and disappear from the chart.
lowest_accuracy = min(accuracies)
y_floor = 0.9 if lowest_accuracy >= 0.9 else max(0.0, lowest_accuracy - 0.05)
plt.ylim(y_floor, 1.05)

# Title and labels
plt.title('Accuracy Comparison: Baseline vs Optimized AdaBoost')
plt.ylabel('Accuracy')
plt.xlabel('Model Type')

# Display the plot
plt.show()

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# SVM: baseline (linear kernel, raw features) vs. "optimized" (default SVC
# kernel, standardised features).
# X and y are the feature matrix and target built earlier in this script.

# One shared 70/30 split for both models. Previously the baseline was scored
# on a random_state=33 split and the optimized model on a random_state=42
# split, so the two accuracies were measured on different test rows and were
# not comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### BASELINE MODEL ###

# Baseline SVM model (linear kernel)
SVM = SVC(kernel='linear', random_state=42)
SVM.fit(X_train, y_train)

# Baseline predictions
y_pred_svm_baseline = SVM.predict(X_test)

# Baseline performance
svm_baseline_accuracy = accuracy_score(y_test, y_pred_svm_baseline)
print(f"SVM Baseline Model Accuracy: {svm_baseline_accuracy:.2f}")
print("SVM Baseline Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm_baseline))
print("SVM Baseline Classification Report:\n", classification_report(y_test, y_pred_svm_baseline))

### OPTIMIZED MODEL ###

# Columns used by the optimized model
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']

# Scale selected features (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])

# Optimized SVM model
svm_optimized = SVC(random_state=42)
svm_optimized.fit(X_train_scaled, y_train)

# Optimized predictions
y_pred_svm_optimized = svm_optimized.predict(X_test_scaled)

# Optimized performance
svm_optimized_accuracy = accuracy_score(y_test, y_pred_svm_optimized)
print(f"SVM Optimized Model Accuracy: {svm_optimized_accuracy:.2f}")
print("SVM Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm_optimized))
print("SVM Optimized Classification Report:\n", classification_report(y_test, y_pred_svm_optimized))

### Plotting the Accuracy Comparison ###
plt.figure(figsize=(8, 6))
models = ['SVM Baseline', 'SVM Optimized']
accuracies = [svm_baseline_accuracy, svm_optimized_accuracy]

# Plotting the bar chart
sns.barplot(x=models, y=accuracies, palette='coolwarm')

# Adding accuracy labels on top of the bars
for i, accuracy in enumerate(accuracies):
    plt.text(i, accuracy + 0.001, f'{accuracy:.2f}', ha='center', fontsize=12, fontweight='bold')

# Zoom in on the 0.9-1.05 band, but lower the floor if a bar would otherwise
# fall below it and disappear from the chart.
lowest_accuracy = min(accuracies)
y_floor = 0.9 if lowest_accuracy >= 0.9 else max(0.0, lowest_accuracy - 0.05)
plt.ylim(y_floor, 1.05)

# Title and labels
plt.title('Accuracy Comparison: Baseline vs Optimized SVM')
plt.ylabel('Accuracy')
plt.xlabel('Model Type')

# Display the plot
plt.show()

# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# KNN: baseline (raw features) vs. "optimized" (standardised features).
# Imported here so this section does not depend on an import made by a much
# earlier section of the script.
from sklearn.neighbors import KNeighborsClassifier

# Build the split and the scaled matrices explicitly instead of silently
# reusing whatever X_train / X_train_scaled the previous section left behind.
# The parameters match that previous state (random_state=42 split, scaler
# fitted on the selected columns of the training rows), so results are unchanged.
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])

# Baseline Model Performance
knn_baseline = KNeighborsClassifier(n_neighbors=5)
knn_baseline.fit(X_train, y_train)
y_pred_knn_baseline = knn_baseline.predict(X_test)

# Baseline Accuracy
knn_baseline_accuracy = accuracy_score(y_test, y_pred_knn_baseline)
print(f"KNN Baseline Model Accuracy: {knn_baseline_accuracy:.2f}")
print("Baseline Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn_baseline))
print("Baseline Classification Report:\n", classification_report(y_test, y_pred_knn_baseline))

# Optimized Model Performance (same classifier defaults, scaled inputs)
knn_optimized = KNeighborsClassifier()
knn_optimized.fit(X_train_scaled, y_train)
y_pred_knn_optimized = knn_optimized.predict(X_test_scaled)

# Optimized Accuracy
knn_optimized_accuracy = accuracy_score(y_test, y_pred_knn_optimized)
print(f"KNN Optimized Model Accuracy: {knn_optimized_accuracy:.2f}")
print("Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn_optimized))
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_knn_optimized))

# Plotting the accuracy comparison
models = ['KNN Baseline', 'KNN Optimized']
accuracies = [knn_baseline_accuracy, knn_optimized_accuracy]

plt.figure(figsize=(8, 5))
plt.bar(models, accuracies, color=['blue', 'green'])
plt.title('KNN Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # Accuracy ranges from 0 to 1
plt.show()


# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Train Test Split: hold out 30% of the rows for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
print(X_train.shape)
print(X_test.shape)

# Parallel lists of model display names and their test accuracies,
# filled in the order the models are trained below.
models = []
accuracies = []

# Random Forest Classifier
# random_state is fixed (42, the seed the learning-curve cells use) so that
# re-running the cell reports the same accuracy.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
models.append('Random Forest')
accuracies.append(rf_accuracy)
print('Random Forest Accuracy:', rf_accuracy)


# AdaBoost Classifier (seeded for reproducible results)
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred_ada)
models.append('AdaBoost')
accuracies.append(ada_accuracy)
print('AdaBoost Accuracy:', ada_accuracy)


# Gradient Boosting Classifier (seeded for reproducible results)
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
models.append('Gradient Boosting')
accuracies.append(gb_accuracy)
print('Gradient Boosting Accuracy:', gb_accuracy)


# Support Vector Machine (SVM) with a linear kernel
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
models.append('SVM')
accuracies.append(svm_accuracy)
print('SVM Accuracy:', svm_accuracy)


# K-Nearest Neighbors (KNN) with k=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
models.append('KNN')
accuracies.append(knn_accuracy)
print('KNN Accuracy:', knn_accuracy)




import matplotlib.pyplot as plt
import seaborn as sns

# Hard-coded sample accuracies for the optimized models, one per name
# (replace with your actual accuracies).
optimized_models = ['Random Forest', 'AdaBoost', 'Gradient Boost', 'SVM', 'KNN']
optimized_accuracies = [1.00, 1.00, 1.00, 0.99, 1.00]  # Replace with actual optimized accuracies

plt.figure(figsize=(8, 6))

# Horizontal bars: accuracy on the x axis, one row per model
sns.barplot(x=optimized_accuracies, y=optimized_models, palette='Greens_d')

# Titles, axis labels and axis range
plt.title('Optimized Model Accuracies', fontsize=16)
plt.xlabel('Accuracy', fontsize=14)
plt.ylabel('Models', fontsize=14)
plt.xlim(0.85, 1.0)  # Adjust limits as needed
plt.xticks(fontsize=12)

# Write each accuracy just past the end of its bar
for bar_row, accuracy in enumerate(optimized_accuracies):
    plt.text(accuracy + 0.002, bar_row, f"{accuracy:.2f}", ha='center', fontsize=12)

plt.grid(axis='x', linestyle='--', alpha=0.7)

# Fit everything into the figure area, then display
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Hard-coded sample accuracies (replace with your actual accuracies).
# Both lists of names are in the same order so the bars pair up per model.
baseline_models = ['Random Forest', 'AdaBoost', 'Gradient Boost', 'SVM', 'KNN']
baseline_accuracies = [0.9750, 1.00, 0.9750, 0.9416, 0.933]  # Replace with actual baseline accuracies

optimized_models = ['Random Forest', 'AdaBoost', 'Gradient Boost', 'SVM', 'KNN']
optimized_accuracies = [1.00, 1.00, 1.00, 0.99, 1.00]  # Replace with actual optimized accuracies

# Long-format table: all baseline rows first, then all optimized rows
data = {
    'Model': baseline_models + optimized_models,
    'Accuracy': baseline_accuracies + optimized_accuracies,
    'Type': ['Baseline'] * len(baseline_models) + ['Optimized'] * len(optimized_models)
}
df = pd.DataFrame(data)

plt.figure(figsize=(12, 8))

# Grouped bars: one group per model, one bar per Type
sns.barplot(x='Model', y='Accuracy', hue='Type', data=df, palette='viridis')

# Titles, labels, axis range and legend
plt.title('Comparison of Model Accuracies', fontsize=16)
plt.xlabel('Models', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.ylim(0.85, 1.05)  # Adjust limits as needed
plt.xticks(fontsize=12)
plt.legend(title='Type', fontsize=12)

# Label every bar with its accuracy. The row position modulo the number of
# models gives the group on the x axis; the label is then shifted right for
# 'Optimized' rows and left for all others.
group_count = len(baseline_models)
for position, (accuracy, kind) in enumerate(zip(df['Accuracy'], df['Type'])):
    shift = 0.15 if kind == 'Optimized' else -0.15
    plt.text(position % group_count + shift,
             accuracy + 0.002,
             f"{accuracy:.2f}",
             ha='center', fontsize=12)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()  # Adjust layout to fit into the figure area.
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler

# Sample dataset (replace with your actual dataset)
# Assuming X is your feature matrix and y is your target variable
# X, y = ...  # Load your dataset here

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation learning curves for ``estimator``.

    Opens a new 10x7 figure, calls ``learning_curve`` on ``(X, y)`` and draws,
    for both score sets, the mean over axis 1 as a line and a shaded band of
    +/- one standard deviation around it.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    title : text used as the figure title.
    X, y : data handed to ``learning_curve``.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``.show()`` on the result.
    """
    plt.figure(figsize=(10, 7))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    plt.grid()

    # (mean, std, colour, legend label) for the training and validation curves
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded +/- one std bands first, then the mean lines on top
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Columns used by the "optimized" variant
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']  # Customize this according to your feature names

# Both variants are the same seeded forest; they differ only in the input matrix.
rf_baseline = RandomForestClassifier(random_state=42)
rf_optimized = RandomForestClassifier(random_state=42)

# Baseline: every feature, unscaled, 5-fold cross-validation
plot_learning_curve(rf_baseline, "Learning Curve (RandomForest Baseline Model)", X, y, cv=5)
plt.show()

# Optimized: selected features, standardized with a scaler fitted on all rows
scaler = StandardScaler().fit(X[selected_features])
X_scaled = scaler.transform(X[selected_features])

plot_learning_curve(rf_optimized, "Learning Curve (RandomForest Optimized Model)", X_scaled, y, cv=5)
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler

# Sample dataset (replace with your actual dataset)
# Assuming X is your feature matrix and y is your target variable
# X, y = ...  # Load your dataset here

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation learning curves for ``estimator``.

    A new 10x7 figure is created, ``learning_curve`` is run on ``(X, y)`` and
    each score set is drawn as its mean over axis 1 plus a shaded band of
    +/- one standard deviation.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    title : text used as the figure title.
    X, y : data handed to ``learning_curve``.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``.show()`` on the result.
    """
    plt.figure(figsize=(10, 7))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)

    plt.grid()

    # Shaded +/- one std bands (training in red, validation in green)
    for mean, std, colour in ((fit_mean, fit_std, "r"), (cv_mean, cv_std, "g")):
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)

    # Mean score lines on top of the bands
    for mean, colour, label in (
        (fit_mean, "r", "Training score"),
        (cv_mean, "g", "Cross-validation score"),
    ):
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

### BASELINE MODEL ###

# Seeded gradient boosting on every feature, unscaled, 5-fold cross-validation
gb_baseline = GradientBoostingClassifier(random_state=42)
plot_learning_curve(gb_baseline, "Learning Curve (Gradient Boosting Baseline Model)", X, y, cv=5)
plt.show()

### OPTIMIZED MODEL ###

# Columns used by the optimized variant
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']  # Customize this according to your feature names

# Standardize the selected columns with a scaler fitted on all rows
scaler = StandardScaler().fit(X[selected_features])
X_scaled = scaler.transform(X[selected_features])

# Same seeded model, now trained on the scaled feature subset
gb_optimized = GradientBoostingClassifier(random_state=42)
plot_learning_curve(gb_optimized, "Learning Curve (Gradient Boosting Optimized Model)", X_scaled, y, cv=5)
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler

# Sample dataset (replace with your actual dataset)
# Assuming X is your feature matrix and y is your target variable
# X, y = ...  # Load your dataset here

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation learning curves for ``estimator``.

    Creates a new 10x7 figure, runs ``learning_curve`` on ``(X, y)`` and plots
    the mean over axis 1 of each score set with a shaded +/- one standard
    deviation band.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    title : text used as the figure title.
    X, y : data handed to ``learning_curve``.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``.show()`` on the result.
    """
    plt.figure(figsize=(10, 7))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    plt.grid()

    # (mean, std, colour, legend label) for the training and validation curves
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    # Bands first, then lines, so the lines stay visible over the shading
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

### BASELINE MODEL ###

# Baseline AdaBoost: 50 estimators, learning rate 0.5, on every unscaled feature
ada_baseline = AdaBoostClassifier(n_estimators=50, learning_rate=0.5, random_state=42)
plot_learning_curve(ada_baseline, "Learning Curve (AdaBoost Baseline Model)", X, y, cv=5)
plt.show()

### OPTIMIZED MODEL ###

# Columns used by the optimized variant
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']  # Customize this according to your feature names

# Standardize the selected columns with a scaler fitted on all rows
scaler = StandardScaler().fit(X[selected_features])
X_scaled = scaler.transform(X[selected_features])

# Optimized AdaBoost: 100 estimators, learning rate 1.0, on the scaled subset
ada_optimized = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
plot_learning_curve(ada_optimized, "Learning Curve (AdaBoost Optimized Model)", X_scaled, y, cv=5)
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler

# Sample dataset (replace with your actual dataset)
# Assuming X is your feature matrix and y is your target variable
# X, y = ...  # Load your dataset here

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation learning curves for ``estimator``.

    A new 10x7 figure is opened, ``learning_curve`` is run on ``(X, y)`` and
    each score set is shown as its mean over axis 1 with a shaded band of
    +/- one standard deviation.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    title : text used as the figure title.
    X, y : data handed to ``learning_curve``.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``.show()`` on the result.
    """
    plt.figure(figsize=(10, 7))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)

    plt.grid()

    # Shaded +/- one std bands (training in red, validation in green)
    for mean, std, colour in ((fit_mean, fit_std, "r"), (cv_mean, cv_std, "g")):
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)

    # Mean score lines on top of the bands
    for mean, colour, label in (
        (fit_mean, "r", "Training score"),
        (cv_mean, "g", "Cross-validation score"),
    ):
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

### BASELINE MODEL ###

# Baseline SVM model (linear kernel) on every feature, unscaled
svm_baseline = SVC(kernel='linear', random_state=42)

# Plot Learning Curve for SVM Baseline Model (5-fold cross-validation)
plot_learning_curve(svm_baseline, "Learning Curve (SVM Baseline Model)", X, y, cv=5)
plt.show()

### OPTIMIZED MODEL ###

# Local import: make_pipeline is not among the imports at the top of this cell.
from sklearn.pipeline import make_pipeline

# Feature selection for the optimized model
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']  # Customize this according to your feature names

# Globally scaled copy, kept only for code that reads `scaler` / `X_scaled`.
# The scaler is fitted on every row, so X_scaled must not be cross-validated:
# the validation folds would already have influenced the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[selected_features])

# Optimized SVM model (SVC with its default kernel)
svm_optimized = SVC(random_state=42)

# Put the scaling inside the estimator so learning_curve re-fits the scaler on
# the training part of every fold only (no leakage into the validation score).
svm_optimized_pipeline = make_pipeline(StandardScaler(), svm_optimized)

# Plot Learning Curve for SVM Optimized Model on the unscaled selected columns;
# the pipeline does the scaling per fold.
plot_learning_curve(svm_optimized_pipeline, "Learning Curve (SVM Optimized Model)", X[selected_features], y, cv=5)
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler

# Sample dataset (replace with your actual dataset)
# Assuming X is your feature matrix and y is your target variable
# X, y = ...  # Load your dataset here

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation learning curves for ``estimator``.

    Opens a new 10x7 figure, runs ``learning_curve`` on ``(X, y)`` and draws
    the mean over axis 1 of each score set with a shaded +/- one standard
    deviation band. The y axis is fixed to the range 0..1.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    title : text used as the figure title.
    X, y : data handed to ``learning_curve``.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``.show()`` on the result.
    """
    plt.figure(figsize=(10, 7))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score (Accuracy)")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    plt.grid()

    # (mean, std, colour, legend label) for the training and validation curves
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded +/- one std bands first, then the mean lines on top
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    plt.ylim(0, 1)  # Score (accuracy) ranges from 0 to 1
    return plt

### BASELINE MODEL ###

# Split dataset for baseline model (optional if you've already split it)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

# Baseline SVM model (linear kernel) on every feature, unscaled
svm_baseline = SVC(kernel='linear', random_state=42)

# Plot Learning Curve for SVM Baseline Model (5-fold cross-validation)
plot_learning_curve(svm_baseline, "Learning Curve (SVM Baseline Model)", X, y, cv=5)
plt.show()

### OPTIMIZED MODEL ###

# Local import: make_pipeline is not among the imports at the top of this cell.
from sklearn.pipeline import make_pipeline

# Feature selection for the optimized model
selected_features = ['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']  # Customize this according to your feature names

# Globally scaled copy, kept only for code that reads `scaler` / `X_scaled`.
# The scaler is fitted on every row, so X_scaled must not be cross-validated:
# the validation folds would already have influenced the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[selected_features])

# Optimized SVM model (SVC with its default kernel)
svm_optimized = SVC(random_state=42)

# Put the scaling inside the estimator so learning_curve re-fits the scaler on
# the training part of every fold only (no leakage into the validation score).
svm_optimized_pipeline = make_pipeline(StandardScaler(), svm_optimized)

# Plot Learning Curve for SVM Optimized Model on the unscaled selected columns;
# the pipeline does the scaling per fold.
plot_learning_curve(svm_optimized_pipeline, "Learning Curve (SVM Optimized Model)", X[selected_features], y, cv=5)
plt.show()



ModuleNotFoundError: No module named 'seaborn'

In [None]:

import graphviz

# Use 'dot' as the default Graphviz layout engine (this does not set the path to the Graphviz executable)
graphviz.set_default_engine('dot')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load your dataset. The CSV is re-read from disk here, so none of the
# cleaning done at the top of the notebook applies to this frame.
dataset = pd.read_csv("Kidney_data.csv")

# Preprocess your dataset
# 'classification' is the target; the eight columns below are the predictors.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `dataset` (avoids pandas' SettingWithCopyWarning).
X = dataset[['sg', 'htn', 'hemo', 'dm', 'al', 'appet', 'rc', 'pc']].copy()  # Adjust as needed
y = dataset['classification']  # Adjust as needed

# The raw file contains labels with stray tabs/spaces (e.g. '\tyes', 'ckd\t').
# Strip them so each variant does not become its own category when encoded.
for column in X.select_dtypes(exclude=['number']).columns:
    X[column] = X[column].map(lambda v: v.strip() if isinstance(v, str) else v)
if not pd.api.types.is_numeric_dtype(y):
    y = y.map(lambda v: v.strip() if isinstance(v, str) else v)

# 'rc' is a numeric measurement that may be read as text because of
# unparseable entries; coerce it so it is imputed as a number, not label-encoded.
X['rc'] = pd.to_numeric(X['rc'], errors='coerce')

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(exclude=['number']).columns

# Impute missing values for numerical columns using the mean
if len(numerical_cols):
    imputer_num = SimpleImputer(strategy='mean')
    X[numerical_cols] = imputer_num.fit_transform(X[numerical_cols])

# Impute missing values for categorical columns using the most frequent value
if len(categorical_cols):
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X[categorical_cols] = imputer_cat.fit_transform(X[categorical_cols])

# Convert categorical variables to numerical using Label Encoding.
# The fitted encoders are kept per column so the codes can be mapped back.
label_encoders = {}
for column in categorical_cols:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# If your target variable is also categorical, encode it
if not pd.api.types.is_numeric_dtype(y):
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)

# Create a small, seeded Random Forest classifier (5 trees)
rf = RandomForestClassifier(n_estimators=5, random_state=42)
rf.fit(X, y)

# Visualize one of the trees in the Random Forest
# Export the first tree as DOT source. Plain lists are passed for the names
# because some scikit-learn releases reject a pandas Index / ndarray here.
dot_data = export_graphviz(rf.estimators_[0], out_file=None,
                           feature_names=list(X.columns),
                           class_names=[str(label) for label in np.unique(y)],  # Convert class names to string
                           filled=True, rounded=True,
                           special_characters=True)

# Create a Graphviz source object
graph = graphviz.Source(dot_data)
graph.render("random_forest_tree")  # Saves the tree as a PDF file
graph.view()  # Opens the PDF file

# To display the tree in a Jupyter notebook, uncomment the following line:
# graph


ModuleNotFoundError: No module named 'graphviz'