In [None]:
import pyspark
from pyspark.sql.functions import col, count, isnan, when
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import SparkSession
# Obtain the Spark session used by the rest of the notebook
# (getOrCreate() hands back an already running session if there is one).
spark = (
    SparkSession.builder
    .appName("PySpark in Jupyter")
    .getOrCreate()
)

In [None]:
# Load the two CSV extracts with a header row and inferred column types.
# Per the original note: one extract is with dementia, the other without
# (presumably `df` = with, `df2` = without -- TODO confirm against the files).
df = spark.read.options(header=True, inferSchema=True).csv('Dataset/df_new_6d.csv')
df2 = spark.read.options(header=True, inferSchema=True).csv('Dataset/df_new_6wd.csv')

# Association

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.types import ArrayType, StringType

# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("Association Rules Mining") \
    .getOrCreate()

# `df` is the DataFrame loaded earlier in the notebook. It is only read here:
# every derived frame is stored under `assoc_df`, so the raw data stays intact
# and this cell can be re-run without failing on columns it already consumed.

# Categorical columns that get indexed and one-hot encoded
string_cols = ['Education_Level', 'Family_History', 'Smoking_Status', 'APOE_ε4', 
               'Depression_Status', 'Education_Group']

# One StringIndexer per categorical column; handleInvalid="skip" filters out rows
# whose value the fitted indexer cannot map.
# The loop variable is deliberately not named `col`, to avoid confusion with
# pyspark.sql.functions.col.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="skip")
    for name in string_cols
]

# Fit and apply the indexers one after another
assoc_df = df
for indexer in indexers:
    assoc_df = indexer.fit(assoc_df).transform(assoc_df)

# Convert indexed columns to one-hot encoded vectors
encoder = OneHotEncoder(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=[name + "_encoded" for name in string_cols],
)
assoc_df = encoder.fit(assoc_df).transform(assoc_df)

# Columns assembled into the "features" vector
input_cols = ['AlcoholLevel', 'HeartRate', 'BodyTemperature', 'Weight', 'MRI_Delay', 'Age', 
              'Education_Level_encoded', 'Family_History_encoded', 'Smoking_Status_encoded', 
              'APOE_ε4_encoded', 'Depression_Status_encoded', 'Cognitive_Test_Scores', 'Education_Group_encoded','Dementia']

# Initialize the VectorAssembler and add the feature vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
assoc_df = assembler.transform(assoc_df)

# Split the 'Dementia' column into an array of items.
# The explicit cast to string means split() does not depend on an implicit
# conversion of whatever type inferSchema assigned to the column.
assoc_df = assoc_df.withColumn(
    "Dementia_array",
    split(assoc_df["Dementia"].cast("string"), ",").cast(ArrayType(StringType())),
)

# Select only the 'Dementia_array' and 'features' columns
assoc_df = assoc_df.select("Dementia_array", "features")

# Show the prepared DataFrame
assoc_df.show(truncate=False)

# FP-growth over 'Dementia_array' (the items of each transaction).
# NOTE(review): FPGrowth only reads itemsCol, so the "features" vector is not
# part of the mining and the itemsets/rules can only involve Dementia values --
# confirm this is the intended analysis.
fp_growth = FPGrowth(itemsCol="Dementia_array", minSupport=0.0, minConfidence=0.0)

# Train the FP-growth model
model = fp_growth.fit(assoc_df)

# Display frequent itemsets
print("Frequent Itemsets:")
model.freqItemsets.show()

# Extract association rules from the model
association_rules = model.associationRules

# Display generated association rules
print("Association Rules:")
association_rules.show()

# Transform examines the input items against all the association rules and summarizes the consequents as prediction
print("Transformed DataFrame:")
transformed_df = model.transform(assoc_df)
transformed_df.show()


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, expr
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.fpm import PrefixSpan

# Create (or reuse) the SparkSession
spark = SparkSession.builder.appName("PrefixSpan").getOrCreate()

# Reload the raw data and print its schema to see the available columns
df = spark.read.csv('Dataset/df_new_6d.csv', header=True, inferSchema=True)
df.printSchema()

# Turn 'Dementia' into the array-of-arrays layout handed to PrefixSpan:
#   1. split the value on "," into an array of strings,
#   2. wrap every element in its own single-element array,
#   3. keep only that column, renamed to 'sequence'.
df = (
    df.withColumn("Dementia_array", split(col("Dementia"), ","))
      .withColumn("Dementia_array_of_arrays", expr("transform(Dementia_array, x -> array(x))"))
      .select(col("Dementia_array_of_arrays").alias("sequence"))
)

# Show the transformed DataFrame
df.show(truncate=False)

# Configure PrefixSpan
prefixspan = PrefixSpan(
    minSupport=0.1,
    maxPatternLength=10,
    maxLocalProjDBSize=32000000,
)

# Mine the frequent sequential patterns (the result is stored in `model`)
model = prefixspan.findFrequentSequentialPatterns(df)

# Show the frequent sequential patterns
print("Frequent Sequential Patterns:")
model.show(truncate=False)




# Classification

In [None]:
# Reload `df` from disk so the classification section starts from the raw CSV
# rather than from the frame transformed by the cells above.
df = spark.read.options(header=True, inferSchema=True).csv('Dataset/df_new_6d.csv')

In [None]:
# check df
def spark_info(df):
    """Print a summary of a Spark DataFrame: row/column totals and, per column,
    the non-null count, the missing-value count and the data type.

    A value counts as missing when it is null or, for float/double columns,
    NaN. The non-null count is the row total minus the missing count, so the
    two figures always add up to the number of rows.

    Args:
        df: Spark DataFrame to describe.

    Returns:
        None. The summary is written to stdout.
    """
    schema = df.schema

    # One conditional counter per column. isnan() is only applied to
    # float/double columns: NaN cannot occur in other types, and Spark may
    # reject isnan() on types it cannot convert to a double (e.g. timestamps).
    missing_exprs = []
    for position, field in enumerate(schema):
        column = col(field.name)
        is_missing = column.isNull()
        if field.dataType.simpleString() in ('float', 'double'):
            is_missing = is_missing | isnan(column)
        # when() without otherwise() yields null for non-matching rows, and
        # count() ignores nulls, so this counts exactly the missing rows.
        missing_exprs.append(count(when(is_missing, 1)).alias(f"missing_{position}"))

    # A single aggregation returns the row total and every per-column tally,
    # instead of launching two Spark jobs per column plus one for the total.
    stats = df.agg(count('*').alias('total_rows'), *missing_exprs).first()
    total_rows = stats[0]
    total_columns = len(schema)

    # (name, dtype, non-null count, missing count) for each column
    columns_info = []
    for position, field in enumerate(schema):
        null_count = stats[position + 1]
        columns_info.append(
            (field.name, field.dataType.simpleString(), total_rows - null_count, null_count)
        )

    # Print the summary table
    print("DataFrame Summary:")
    print(f"{'Total Rows':<15}: {total_rows}")
    print(f"{'Total Columns':<15}: {total_columns}")
    print("\nDataFrame Schema:")
    print(f"{'Column':<25} {'Non-Null Count':<15} {'Null Count':<10} {'Dtype':<10}")
    print("-" * 60)
    for column_info in columns_info:
        print(f"{column_info[0]:<25} {column_info[2]:<15} {column_info[3]:<10} {column_info[1]:<10}")

# Call the function to describe the DataFrame
spark_info(df)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import numpy as np
import matplotlib.pyplot as plt

# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("LogisticRegressionClassification") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv('Dataset/df_new_6d.csv', header=True, inferSchema=True)

# Features and target variable. The cognitive test score is deliberately left
# out of both feature lists.
categorical_features = ['Family_History', 'Smoking_Status', 'APOE_ε4', 'Depression_Status', 'Education_Group']
numeric_features = ['Age', 'AlcoholLevel', 'HeartRate', 'BodyTemperature', 'Weight', 'MRI_Delay']
target = 'Dementia'

# Index categorical features.
# handleInvalid="keep" maps a null or a category that was not present in the
# training split to an extra index. With the default ("error") scoring the test
# split raises as soon as it contains such a value.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical_features
]

# Assemble features into a feature vector
# NOTE(review): the categorical columns enter the model as raw category indices
# (no one-hot encoding, no scaling) -- confirm this is intended for a linear model.
assembler = VectorAssembler(inputCols=[column + "_index" for column in categorical_features] + numeric_features, outputCol="features")

# Initialize the LogisticRegression model
lr = LogisticRegression(labelCol=target, featuresCol="features", maxIter=10)

# Create a Pipeline: indexers -> assembler -> classifier
pipeline = Pipeline(stages=indexers + [assembler, lr])

# Train-Test Split (fixed seed so the split is repeatable)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Train the model
pipeline_model = pipeline.fit(train_df)

# Make predictions on both the training and testing data
train_predictions = pipeline_model.transform(train_df)
test_predictions = pipeline_model.transform(test_df)

# Evaluate the Model for both training and testing sets using accuracy metric
evaluator = MulticlassClassificationEvaluator(labelCol=target, metricName="accuracy")
train_accuracy = evaluator.evaluate(train_predictions)
test_accuracy = evaluator.evaluate(test_predictions)
print(f"Training Set Accuracy (Evaluator): {train_accuracy}")
print(f"Testing Set Accuracy (Evaluator): {test_accuracy}")

# Calculate correct and incorrect predictions
def calculate_correct_wrong(predictions, label_col):
    """Count correct and wrong binary predictions and express them as percentages.

    Only rows whose ``prediction`` value and ``label_col`` value are both 0 or 1
    are tallied; any other row (for example one with a null label) is left out
    of the counts and of the percentage base.

    Args:
        predictions: DataFrame with a ``prediction`` column and the label
            column; it must support ``groupBy(...).count().collect()``.
        label_col: Name of the ground-truth label column.

    Returns:
        Tuple ``(correct, wrong, correct_pct, wrong_pct)``. Both percentages
        are 0.0 when no row qualifies, instead of raising ZeroDivisionError.
    """
    # One grouped aggregation replaces four separate RDD filter/count actions,
    # each of which re-evaluates `predictions` unless it has been cached.
    grouped = predictions.groupBy('prediction', label_col).count().collect()

    binary_values = (0.0, 1.0)
    correct = 0
    wrong = 0
    for row in grouped:
        # Row layout: (prediction, label, number of rows in the group)
        predicted, actual, n_rows = row[0], row[1], row[2]
        if predicted not in binary_values or actual not in binary_values:
            continue
        if predicted == actual:
            correct += n_rows   # true positives and true negatives
        else:
            wrong += n_rows     # false positives and false negatives

    total = correct + wrong
    if total == 0:
        return correct, wrong, 0.0, 0.0

    correct_pct = (correct / total) * 100
    wrong_pct = (wrong / total) * 100

    return correct, wrong, correct_pct, wrong_pct

# Tally the logistic regression predictions for the training and testing splits
(train_correct, train_wrong,
 train_correct_pct, train_wrong_pct) = calculate_correct_wrong(train_predictions, target)
(test_correct, test_wrong,
 test_correct_pct, test_wrong_pct) = calculate_correct_wrong(test_predictions, target)

# Report counts and percentages, one "<caption>: <value>" line each
print("\n".join(
    f"{caption}: {value}"
    for caption, value in (
        ("Training Set Correct", train_correct),
        ("Training Set Wrong", train_wrong),
        ("Training Set Correct (%)", train_correct_pct),
        ("Training Set Wrong (%)", train_wrong_pct),
        ("Testing Set Correct", test_correct),
        ("Testing Set Wrong", test_wrong),
        ("Testing Set Correct (%)", test_correct_pct),
        ("Testing Set Wrong (%)", test_wrong_pct),
    )
))

# Grouped bar chart of the logistic regression results for both splits
def plot_logistic_regression_results(train_correct, train_correct_pct, train_wrong, train_wrong_pct,
                                     test_correct, test_correct_pct, test_wrong, test_wrong_pct):
    """Draw correct/wrong counts and percentages for the training and testing sets.

    Each split gets a group of four bars (correct, wrong, correct %, wrong %),
    all on one shared y-axis, with the bar height written above every bar.
    Percentages are rounded to two decimals before plotting.

    Args:
        train_correct, train_wrong: Prediction counts for the training set.
        train_correct_pct, train_wrong_pct: The same as percentages.
        test_correct, test_wrong: Prediction counts for the testing set.
        test_correct_pct, test_wrong_pct: The same as percentages.
    """
    group_names = ['Training Set', 'Testing Set']
    positions = np.arange(len(group_names))
    bar_width = 0.2

    # (offset in bar widths, heights, legend label, colour), in drawing order
    series = [
        (-1, [train_correct, test_correct], 'Correct', 'lightgreen'),
        (0, [train_wrong, test_wrong], 'Wrong', 'salmon'),
        (1, [round(train_correct_pct, 2), round(test_correct_pct, 2)], 'Correct (%)', 'skyblue'),
        (2, [round(train_wrong_pct, 2), round(test_wrong_pct, 2)], 'Wrong (%)', 'orange'),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    containers = [
        ax.bar(positions + shift * bar_width, heights, bar_width, label=legend, color=colour)
        for shift, heights, legend, colour in series
    ]

    ax.set_ylabel('Count / Percentage')
    ax.set_title('Logistic Regression Model Performance Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(group_names)
    ax.legend(loc='center')

    # Write each bar's height just above its top edge
    for container in containers:
        for bar in container:
            bar_height = bar.get_height()
            ax.annotate(f'{bar_height}',
                        xy=(bar.get_x() + bar.get_width() / 2, bar_height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')
    plt.show()

# Plot logistic regression results for both training and testing sets
plot_logistic_regression_results(
    train_correct, train_correct_pct, train_wrong, train_wrong_pct,
    test_correct, test_correct_pct, test_wrong, test_wrong_pct,
)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_accuracy(train_accuracy, test_accuracy, title='Logistic Regression Model Evaluation'):
    """Show the training and testing accuracy side by side as a bar chart.

    Only two scalar accuracies are available, so they are drawn as two bars
    against a dashed reference line at 1.0. The previous version drew synthetic
    straight lines from 0 up to each accuracy over an axis labelled
    'Iterations', which suggested a learning curve that was never measured.

    Args:
        train_accuracy: Accuracy on the training set, as a fraction in [0, 1].
        test_accuracy: Accuracy on the testing set, as a fraction in [0, 1].
        title: Figure title; defaults to the logistic regression title.
    """
    names = ['Training Set Accuracy', 'Testing Set Accuracy']
    values = [train_accuracy, test_accuracy]

    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(names, values, width=0.5, color=['tab:blue', 'tab:orange'])
    ax.axhline(1.0, linestyle='--', color='red', label='Perfect Accuracy')

    # Print the exact value above each bar
    for bar, value in zip(bars, values):
        ax.annotate(f'{value:.4f}',
                    xy=(bar.get_x() + bar.get_width() / 2, value),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

    ax.set_ylim(0, 1.1)  # head-room for the labels and the reference line
    ax.set_ylabel('Accuracy')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, axis='y')
    plt.show()

# Plot accuracy for both training and testing sets
plot_accuracy(train_accuracy, test_accuracy)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import numpy as np
import matplotlib.pyplot as plt

# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("DecisionTreeClassification") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv('Dataset/df_new_6d.csv', header=True, inferSchema=True)
# Drop the cognitive test score column. The name matches the one the
# association cell feeds to its VectorAssembler ('Cognitive_Test_Scores');
# drop() silently ignores unknown names, so the singular spelling used before
# removed nothing.
df = df.drop("Cognitive_Test_Scores")

# Features and target variable (the cognitive test score is excluded)
categorical_features = ['Family_History', 'Smoking_Status', 'APOE_ε4', 'Depression_Status', 'Education_Group']
numeric_features = ['Age', 'AlcoholLevel', 'HeartRate', 'BodyTemperature', 'Weight', 'MRI_Delay']
target = 'Dementia'

# Index categorical features.
# handleInvalid="keep" maps a null or a category that was not present in the
# training split to an extra index. With the default ("error") scoring the test
# split raises as soon as it contains such a value.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical_features
]

# Assemble features into a feature vector
assembler = VectorAssembler(inputCols=[column + "_index" for column in categorical_features] + numeric_features, outputCol="features")

# Initialize the DecisionTreeClassifier model
dt = DecisionTreeClassifier(labelCol=target, featuresCol="features")

# Create a Pipeline: indexers -> assembler -> classifier
pipeline_dt = Pipeline(stages=indexers + [assembler, dt])

# Train-Test Split (fixed seed so the split is repeatable)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Train the Decision Tree model
pipeline_model_dt = pipeline_dt.fit(train_df)

# Make predictions on both the training and testing data
train_predictions_dt = pipeline_model_dt.transform(train_df)
test_predictions_dt = pipeline_model_dt.transform(test_df)

# Evaluate the Model for both training and testing sets using accuracy metric
evaluator_dt = MulticlassClassificationEvaluator(labelCol=target, metricName="accuracy")
train_accuracy_dt = evaluator_dt.evaluate(train_predictions_dt)
test_accuracy_dt = evaluator_dt.evaluate(test_predictions_dt)
print(f"Decision Tree - Training Set Accuracy (Evaluator): {train_accuracy_dt}")
print(f"Decision Tree - Testing Set Accuracy (Evaluator): {test_accuracy_dt}")

# Calculate correct and incorrect predictions
def calculate_correct_wrong(predictions, label_col):
    """Count correct and wrong binary predictions and express them as percentages.

    Only rows whose ``prediction`` value and ``label_col`` value are both 0 or 1
    are tallied; any other row (for example one with a null label) is left out
    of the counts and of the percentage base.

    Args:
        predictions: DataFrame with a ``prediction`` column and the label
            column; it must support ``groupBy(...).count().collect()``.
        label_col: Name of the ground-truth label column.

    Returns:
        Tuple ``(correct, wrong, correct_pct, wrong_pct)``. Both percentages
        are 0.0 when no row qualifies, instead of raising ZeroDivisionError.
    """
    # One grouped aggregation replaces four separate RDD filter/count actions,
    # each of which re-evaluates `predictions` unless it has been cached.
    grouped = predictions.groupBy('prediction', label_col).count().collect()

    binary_values = (0.0, 1.0)
    correct = 0
    wrong = 0
    for row in grouped:
        # Row layout: (prediction, label, number of rows in the group)
        predicted, actual, n_rows = row[0], row[1], row[2]
        if predicted not in binary_values or actual not in binary_values:
            continue
        if predicted == actual:
            correct += n_rows   # true positives and true negatives
        else:
            wrong += n_rows     # false positives and false negatives

    total = correct + wrong
    if total == 0:
        return correct, wrong, 0.0, 0.0

    correct_pct = (correct / total) * 100
    wrong_pct = (wrong / total) * 100

    return correct, wrong, correct_pct, wrong_pct

# Tally the decision tree predictions for the training and testing splits
(train_correct_dt, train_wrong_dt,
 train_correct_pct_dt, train_wrong_pct_dt) = calculate_correct_wrong(train_predictions_dt, target)
(test_correct_dt, test_wrong_dt,
 test_correct_pct_dt, test_wrong_pct_dt) = calculate_correct_wrong(test_predictions_dt, target)

# Report counts and percentages, one "Decision Tree - <caption>: <value>" line each
print("\n".join(
    f"Decision Tree - {caption}: {value}"
    for caption, value in (
        ("Training Set Correct", train_correct_dt),
        ("Training Set Wrong", train_wrong_dt),
        ("Training Set Correct (%)", train_correct_pct_dt),
        ("Training Set Wrong (%)", train_wrong_pct_dt),
        ("Testing Set Correct", test_correct_dt),
        ("Testing Set Wrong", test_wrong_dt),
        ("Testing Set Correct (%)", test_correct_pct_dt),
        ("Testing Set Wrong (%)", test_wrong_pct_dt),
    )
))

# Grouped bar chart of the decision tree results for both splits
def plot_decision_tree_results(train_correct, train_correct_pct, train_wrong, train_wrong_pct,
                               test_correct, test_correct_pct, test_wrong, test_wrong_pct):
    """Draw correct/wrong counts and percentages for the training and testing sets.

    Each split gets a group of four bars (correct, wrong, correct %, wrong %),
    all on one shared y-axis, with the bar height written above every bar.
    Percentages are rounded to two decimals before plotting.

    Args:
        train_correct, train_wrong: Prediction counts for the training set.
        train_correct_pct, train_wrong_pct: The same as percentages.
        test_correct, test_wrong: Prediction counts for the testing set.
        test_correct_pct, test_wrong_pct: The same as percentages.
    """
    group_names = ['Training Set', 'Testing Set']
    positions = np.arange(len(group_names))
    bar_width = 0.2

    # (offset in bar widths, heights, legend label, colour), in drawing order
    series = [
        (-1, [train_correct, test_correct], 'Correct', 'lightgreen'),
        (0, [train_wrong, test_wrong], 'Wrong', 'salmon'),
        (1, [round(train_correct_pct, 2), round(test_correct_pct, 2)], 'Correct (%)', 'skyblue'),
        (2, [round(train_wrong_pct, 2), round(test_wrong_pct, 2)], 'Wrong (%)', 'orange'),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    containers = [
        ax.bar(positions + shift * bar_width, heights, bar_width, label=legend, color=colour)
        for shift, heights, legend, colour in series
    ]

    ax.set_ylabel('Count / Percentage')
    ax.set_title('Decision Tree Model Performance Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(group_names)
    ax.legend(loc='center')

    # Write each bar's height just above its top edge
    for container in containers:
        for bar in container:
            bar_height = bar.get_height()
            ax.annotate(f'{bar_height}',
                        xy=(bar.get_x() + bar.get_width() / 2, bar_height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')
    plt.show()

# Plot decision tree results for both training and testing sets
plot_decision_tree_results(
    train_correct_dt, train_correct_pct_dt, train_wrong_dt, train_wrong_pct_dt,
    test_correct_dt, test_correct_pct_dt, test_wrong_dt, test_wrong_pct_dt,
)



In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_accuracy(train_accuracy, test_accuracy, title='Decision Tree Model Evaluation'):
    """Show the training and testing accuracy side by side as a bar chart.

    Only two scalar accuracies are available, so they are drawn as two bars
    against a dashed reference line at 1.0. The previous version drew synthetic
    straight lines from 0 up to each accuracy over an axis labelled
    'Iterations', which suggested a learning curve that was never measured.

    Args:
        train_accuracy: Accuracy on the training set, as a fraction in [0, 1].
        test_accuracy: Accuracy on the testing set, as a fraction in [0, 1].
        title: Figure title; defaults to the decision tree title.
    """
    names = ['Training Set Accuracy', 'Testing Set Accuracy']
    values = [train_accuracy, test_accuracy]

    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(names, values, width=0.5, color=['tab:blue', 'tab:orange'])
    ax.axhline(1.0, linestyle='--', color='red', label='Perfect Accuracy')

    # Print the exact value above each bar
    for bar, value in zip(bars, values):
        ax.annotate(f'{value:.4f}',
                    xy=(bar.get_x() + bar.get_width() / 2, value),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

    ax.set_ylim(0, 1.1)  # head-room for the labels and the reference line
    ax.set_ylabel('Accuracy')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, axis='y')
    plt.show()

# Plot accuracy for both training and testing sets
plot_accuracy(train_accuracy_dt, test_accuracy_dt)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline as SparkPipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# This cell reuses `numeric_features`, `target`, `train_df` and `test_df` from
# the Decision Tree cell above, so that cell has to be run first.
categorical_features = ['Family_History', 'Smoking_Status', 'APOE_ε4', 'Depression_Status', 'Education_Group']

# Define preprocessing steps for numerical and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Create a column transformer to apply different preprocessing steps to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Convert PySpark DataFrame to Pandas for sklearn.
# The targets are taken as 1-D Series: passing a one-column DataFrame as `y`
# makes sklearn emit a DataConversionWarning on fit.
X_train = train_df.select(*numeric_features, *categorical_features).toPandas()
y_train = train_df.select(target).toPandas()[target]

X_test = test_df.select(*numeric_features, *categorical_features).toPandas()
y_test = test_df.select(target).toPandas()[target]


# Create a pipeline with preprocessing and KNN classifier
pipeline_knn = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', KNeighborsClassifier())])

# Fit the pipeline on the training data
pipeline_knn.fit(X_train, y_train)

# Make predictions on both the training and testing data
train_predictions_knn = pipeline_knn.predict(X_train)
test_predictions_knn = pipeline_knn.predict(X_test)

# Calculate accuracy using sklearn for both training and testing sets
train_accuracy_sklearn_knn = accuracy_score(y_train, train_predictions_knn)
test_accuracy_sklearn_knn = accuracy_score(y_test, test_predictions_knn)
print("KNN - Training Set Accuracy:", train_accuracy_sklearn_knn)
print("KNN - Testing Set Accuracy:", test_accuracy_sklearn_knn)

# Calculate confusion matrix using sklearn for both training and testing sets
train_cm_knn = confusion_matrix(y_train, train_predictions_knn)
test_cm_knn = confusion_matrix(y_test, test_predictions_knn)

# Correct predictions sit on the diagonal of the confusion matrix; everything
# else is wrong. For a 2x2 matrix this equals cm[0,0]+cm[1,1] and
# cm[0,1]+cm[1,0], but it does not raise IndexError when a split happens to
# contain a single class (1x1 matrix).
train_correct_knn = np.trace(train_cm_knn)
train_wrong_knn = train_cm_knn.sum() - train_correct_knn
test_correct_knn = np.trace(test_cm_knn)
test_wrong_knn = test_cm_knn.sum() - test_correct_knn

# Calculate percentages of correct and wrong predictions for both training and testing sets
train_total_knn = len(y_train)
test_total_knn = len(y_test)
train_correct_pct_knn = (train_correct_knn / train_total_knn) * 100
train_wrong_pct_knn = (train_wrong_knn / train_total_knn) * 100
test_correct_pct_knn = (test_correct_knn / test_total_knn) * 100
test_wrong_pct_knn = (test_wrong_knn / test_total_knn) * 100

# Grouped bar chart of the KNN results for both splits
def plot_knn_results(train_correct, train_correct_pct, train_wrong, train_wrong_pct,
                     test_correct, test_correct_pct, test_wrong, test_wrong_pct):
    """Draw correct/wrong counts and percentages for the training and testing sets.

    Each split gets a group of four bars (correct, wrong, correct %, wrong %),
    all on one shared y-axis, with the bar height written above every bar.
    Percentages are rounded to two decimals before plotting.

    Args:
        train_correct, train_wrong: Prediction counts for the training set.
        train_correct_pct, train_wrong_pct: The same as percentages.
        test_correct, test_wrong: Prediction counts for the testing set.
        test_correct_pct, test_wrong_pct: The same as percentages.
    """
    group_names = ['Training Set', 'Testing Set']
    positions = np.arange(len(group_names))
    bar_width = 0.2

    # (offset in bar widths, heights, legend label, colour), in drawing order
    series = [
        (-1, [train_correct, test_correct], 'Correct', 'lightgreen'),
        (0, [train_wrong, test_wrong], 'Wrong', 'salmon'),
        (1, [round(train_correct_pct, 2), round(test_correct_pct, 2)], 'Correct (%)', 'skyblue'),
        (2, [round(train_wrong_pct, 2), round(test_wrong_pct, 2)], 'Wrong (%)', 'orange'),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    containers = [
        ax.bar(positions + shift * bar_width, heights, bar_width, label=legend, color=colour)
        for shift, heights, legend, colour in series
    ]

    ax.set_ylabel('Count / Percentage')
    ax.set_title('KNN Model Performance Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(group_names)
    ax.legend(loc='center')

    # Write each bar's height just above its top edge
    for container in containers:
        for bar in container:
            bar_height = bar.get_height()
            ax.annotate(f'{bar_height}',
                        xy=(bar.get_x() + bar.get_width() / 2, bar_height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')
    plt.show()

# Plot KNN results for both training and testing sets
plot_knn_results(
    train_correct_knn, train_correct_pct_knn, train_wrong_knn, train_wrong_pct_knn,
    test_correct_knn, test_correct_pct_knn, test_wrong_knn, test_wrong_pct_knn,
)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_accuracy_knn(train_accuracy, test_accuracy, title='KNN Model Evaluation'):
    """Show the training and testing accuracy side by side as a bar chart.

    Only two scalar accuracies are available, so they are drawn as two bars
    against a dashed reference line at 1.0. The previous version drew synthetic
    straight lines from 0 up to each accuracy over an axis labelled
    'Iterations', which suggested a learning curve that was never measured.

    Args:
        train_accuracy: Accuracy on the training set, as a fraction in [0, 1].
        test_accuracy: Accuracy on the testing set, as a fraction in [0, 1].
        title: Figure title; defaults to the KNN title.
    """
    names = ['Training Set Accuracy', 'Testing Set Accuracy']
    values = [train_accuracy, test_accuracy]

    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(names, values, width=0.5, color=['tab:blue', 'tab:orange'])
    ax.axhline(1.0, linestyle='--', color='red', label='Perfect Accuracy')

    # Print the exact value above each bar
    for bar, value in zip(bars, values):
        ax.annotate(f'{value:.4f}',
                    xy=(bar.get_x() + bar.get_width() / 2, value),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

    ax.set_ylim(0, 1.1)  # head-room for the labels and the reference line
    ax.set_ylabel('Accuracy')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, axis='y')
    plt.show()

# Plot accuracy for both training and testing sets
plot_accuracy_knn(train_accuracy_sklearn_knn, test_accuracy_sklearn_knn)


In [None]:
# finish