In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# Load the dataset
file_path = '/content/diabetes (1).csv'
diabetes_data = pd.read_csv(file_path)

total_rows, total_columns = diabetes_data.shape
print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}")

# Split data into features and target
X = diabetes_data.drop(columns=['Outcome'])
y = diabetes_data['Outcome']

# Handle missing values (if any)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("Decision Tree Classifier Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_pred))

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Classifier Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))

# KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy:", knn_accuracy)
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred))

# SVM classifier
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred))

# Initial accuracies
initial_accuracies = {
    'Decision Tree': dt_accuracy,
    'Random Forest': rf_accuracy,
    'KNN': knn_accuracy,
    'SVM': svm_accuracy
}

  and should_run_async(code)


Total number of rows: 768
Total number of columns: 9
Decision Tree Classifier Accuracy: 0.7056277056277056
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76       151
           1       0.56      0.69      0.62        80

    accuracy                           0.71       231
   macro avg       0.69      0.70      0.69       231
weighted avg       0.73      0.71      0.71       231

Random Forest Classifier Accuracy: 0.7489177489177489
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81       151
           1       0.63      0.65      0.64        80

    accuracy                           0.75       231
   macro avg       0.72      0.73      0.72       231
weighted avg       0.75      0.75      0.75       231

KNN Accuracy: 0.6926406926406926
KNN Classification Report:
              precision    recall  f1-score   suppo

In [49]:
import pandas as pd
import numpy as np

# Load your chosen dataset (replace 'path_to_your_dataset.csv' with your actual file path)
df = pd.read_csv('/content/diabetes (1).csv')

# Set the random seed for reproducibility
np.random.seed(0)

# Introduce 35% missing values
missing_rate = 0.35
n_missing_samples = int(np.floor(missing_rate * df.size))

# Randomly select indices to be replaced with NaN
missing_samples = np.random.choice(df.size, n_missing_samples, replace=False)

# Flatten the DataFrame to introduce NaNs and then reshape it back
flat_df = df.values.flatten()
flat_df[missing_samples] = np.nan
df_with_missing = pd.DataFrame(flat_df.reshape(df.shape), columns=df.columns)

# Save the dataset with missing values
df_with_missing.to_csv('dataset_with_missing_values.csv', index=False)

  and should_run_async(code)


In [50]:
import pandas as pd
import numpy as np

# Assuming you have a DataFrame called 'original_df' with your dataset
# Replace 'original_df' with the actual name of your DataFrame

# Calculate the number of missing values to create
missing_percentage = 0.35
total_cells = df.shape[0] * df.shape[1]
missing_cells = int(total_cells * missing_percentage)

# Randomly select cells to be set as missing
missing_indices = np.random.choice(df.index, size=missing_cells, replace=True)
missing_columns = np.random.choice(df.columns, size=missing_cells, replace=True)

# Set the selected cells to NaN
for idx, col in zip(missing_indices, missing_columns):
    df.at[idx, col] = np.nan

# Now 'original_df' will have approximately 35% missing values

  and should_run_async(code)


In [51]:
import pandas as pd
import numpy as np

# Assuming you have a DataFrame called 'original_df' with your dataset
# Replace 'original_df' with the actual name of your DataFrame

# Print previous missing values
previous_missing_values = df.isnull().sum().sum()
print("Previous missing values:", previous_missing_values)

# Calculate the number of missing values to create
missing_percentage = 0.35
total_cells = df.shape[0] * df.shape[1]
missing_cells = int(total_cells * missing_percentage)

# Randomly select cells to be set as missing
missing_indices = np.random.choice(df.index, size=missing_cells, replace=True)
missing_columns = np.random.choice(df.columns, size=missing_cells, replace=True)

# Set the selected cells to NaN
for idx, col in zip(missing_indices, missing_columns):
    df.at[idx, col] = np.nan

# Print new missing values
new_missing_values = df.isnull().sum().sum()
print("New missing values (35%):", new_missing_values)

Previous missing values: 2054
New missing values (35%): 3526


  and should_run_async(code)


In [52]:
# Mean imputation for numeric columns
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df.mean())

# Mode imputation for categorical columns
categorical_cols = df.select_dtypes(exclude='number').columns
df[categorical_cols] = df[categorical_cols].fillna(df.mode().iloc[0])

  and should_run_async(code)


In [39]:
#Forward Fill/Backward Fill:

#Propagate the last known value forward to fill missing values (forward fill) or propagate the next known value backward (backward fill).
#Suitable for time series data where missing values are assumed to have similar values to adjacent observations.
# Forward fill
df_ffill = df.ffill()

# Backward fill
df_bfill = df.bfill()

#tnterpolation:
#Estimate missing values based on the values of neighboring data points.
#Suitable for numeric data with a linear relationship between variables.
# Linear interpolation
df_interpolated = df.interpolate(method='linear')

  and should_run_async(code)


In [40]:
from sklearn.impute import KNNImputer

# Create KNN imputer object
imputer = KNNImputer(n_neighbors=5)

# Apply imputation
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

  and should_run_async(code)


In [41]:
# Check for missing values
missing_values = df.isnull().any()

# Print columns with missing values
print("Columns with missing values:")
print(missing_values[missing_values].index.tolist())

# Print rows with missing values
print("\nRows with missing values:")
print(df[df.isnull().any(axis=1)])


Columns with missing values:
[]

Rows with missing values:
Empty DataFrame
Columns: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]
Index: []


  and should_run_async(code)


In [42]:
# Check for missing values
if df.isnull().values.any():
    # Print columns with missing values
    print("Columns with missing values:")
    print(df.columns[df.isnull().any()].tolist())
else:
    print("NO MISSING VALUE AVAILABLE")

NO MISSING VALUE AVAILABLE


  and should_run_async(code)


In [43]:
import pandas as pd

# Assuming your DataFrame is named df
# Extract the target variable (y)
y = df["Outcome"]

# Extract the feature variables (X)
#X = df.drop(columns=["Outcome"])
# Defining X variables
X = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]

  and should_run_async(code)


In [44]:
y = y.astype(int)

print(y.dtype)

int64


  and should_run_async(code)


In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# Load the dataset
file_path = '/content/diabetes (1).csv'
diabetes_data = pd.read_csv(file_path)

# Split data into features and target
X = diabetes_data.drop(columns=['Outcome'])
y = diabetes_data['Outcome']

# Handle missing values (if any)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("Decision Tree Classifier Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Classifier Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))

# KNN classifier with Hyperparameter Tuning
knn_param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
grid_search_knn = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5)
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_
best_knn.fit(X_train, y_train)
knn_pred = best_knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy:", knn_accuracy)
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred))

# SVM classifier
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred))

# Apriori Algorithm
frequent_itemsets = apriori(diabetes_data.astype(bool), min_support=0.1, use_colnames=True)
apriori_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print("Number of Apriori Rules:", len(apriori_rules))

# FP-Growth Algorithm
frequent_itemsets_fp = fpgrowth(diabetes_data.astype(bool), min_support=0.1, use_colnames=True)
fp_growth_rules = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.7)
print("Number of FP-Growth Rules:", len(fp_growth_rules))

# Initial and Final accuracies
initial_accuracies = {
    'Decision Tree': dt_accuracy,
    'Random Forest': rf_accuracy,
    'KNN': knn_accuracy,
    'SVM': svm_accuracy
}



  and should_run_async(code)


Decision Tree Classifier Accuracy: 0.7467532467532467
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.62      0.73      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

Random Forest Classifier Accuracy: 0.7272727272727273
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        99
           1       0.62      0.62      0.62        55

    accuracy                           0.73       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154

KNN Accuracy: 0.7142857142857143
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      

In [46]:
import pandas as pd
import numpy as np
import time
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# Load the dataset
file_path = '/content/diabetes (1).csv'
diabetes_data = pd.read_csv(file_path)

# Convert dataset to boolean type for association rule mining
diabetes_data_bool = diabetes_data.astype(bool)

# Measure time for Apriori
start_time = time.time()
frequent_itemsets_apriori = apriori(diabetes_data_bool, min_support=0.1, use_colnames=True)
apriori_rules = association_rules(frequent_itemsets_apriori, metric="confidence", min_threshold=0.7)
apriori_time = time.time() - start_time
print("Apriori execution time:", apriori_time)
print("Number of Apriori Rules:", len(apriori_rules))

# Measure time for FP-Growth
start_time = time.time()
frequent_itemsets_fp = fpgrowth(diabetes_data_bool, min_support=0.1, use_colnames=True)
fp_growth_rules = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.7)
fp_growth_time = time.time() - start_time
print("FP-Growth execution time:", fp_growth_time)
print("Number of FP-Growth Rules:", len(fp_growth_rules))

# Compare times
if apriori_time < fp_growth_time:
    print("Apriori is more efficient.")
else:
    print("FP-Growth is more efficient.")


  and should_run_async(code)


Apriori execution time: 0.41843533515930176
Number of Apriori Rules: 8116
FP-Growth execution time: 0.2651960849761963
Number of FP-Growth Rules: 8116
FP-Growth is more efficient.
