In [4]:
import sys
import os
import pandas as pd
from sklearn.model_selection import KFold  # train/test splits
from sklearn.model_selection import GridSearchCV  # selecting
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import plotnine as p9
import time
import statistics


params_df = pd.read_csv("params.csv")

# if len(sys.argv) == 2:
#     prog_name, task_str = sys.argv
#     param_row = int(task_str)
# else:
#     print("len(sys.argv)=%d so trying first param" % len(sys.argv))
param_row = 0

data_path = "/projects/genomic-ml/da2343/ml_project_3/data"
param_dict = dict(params_df.iloc[param_row, :])

dataset_name = param_dict["dataset_name"]
algorithm = param_dict["algorithm"]



def run_main(n_rows=100):
    # Record the start time
    start_time = time.time()

    # Read the zip file into a pandas dataframe
    zip_df = pd.read_csv(
        f"{data_path}/zip.test.gz",
        header=None,
        sep=" ")
    is01 = zip_df[0].isin([0, 1])
    zip01_df = zip_df.loc[is01, :]
    zip01_shuffled_df = zip01_df.sample(frac=1, random_state=1).reset_index(drop=True)
    
    # Read the spam.csv data into a pandas dataframe
    spam_df = pd.read_csv(
        f"{data_path}/spam.data",
        sep=" ",
        header=None)
    spam_df_shuffled = spam_df.sample(frac=1, random_state=1).reset_index(drop=True)

    data_dict = {
        "zip": (zip01_shuffled_df.loc[:, 1:].to_numpy(), zip01_shuffled_df[0]),
        "spam": (spam_df_shuffled.iloc[:, :-1].to_numpy(), spam_df_shuffled.iloc[:, -1])
    }

    algo_dict = {
        "KNeighborsClassifier": GridSearchCV(estimator=KNeighborsClassifier(),
                                param_grid=[{'n_neighbors': [x]} for x in range(1, 21)], cv=5),
        "LinearModel": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
        "Featureless": DummyClassifier(strategy="most_frequent"),
    }
    
    classifier = algo_dict[algorithm]
    data_set = data_dict[dataset_name]
    input_mat = data_set[0][:n_rows, :]
    output_vec = data_set[1][:n_rows]

    test_acc_df_list = []
    kf = KFold(n_splits=3, shuffle=True, random_state=1)
    for fold_id, indices in enumerate(kf.split(input_mat)):
        index_dict = dict(zip(["train", "test"], indices))

        set_data_dict = {}
        for set_name, index_vec in index_dict.items():
            set_data_dict[set_name] = {
                "X": input_mat[index_vec],
                "y": output_vec.iloc[index_vec]
            }
        classifier.fit(**set_data_dict["train"])
        pred_vec = classifier.predict(set_data_dict["test"]["X"])
        accuracy = accuracy_score(set_data_dict["test"]["y"], pred_vec)
        
        test_acc_df_list.append(pd.DataFrame({
                "test_accuracy_percent": accuracy * 100,
                "data_set": dataset_name,
                "fold_id": fold_id,
                "algorithm": algorithm
            }, index=[0]))
    
    test_acc_df = pd.concat(test_acc_df_list)
    print(test_acc_df)
        
    # Record the end time
    end_time = time.time()
    elapsed_time = end_time - start_time
    return elapsed_time


n_row_list = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
n_row_time_list = []
for i in n_row_list:
    time_list = []
    for j in range(3):
        elapsed_time = run_main(i)
        time_list.append(elapsed_time)
        
    mean_time = statistics.mean(time_list)
    std_time = statistics.stdev(time_list)
    n_row_time_list.append({
        "n_rows": i,
        "mean_time": mean_time,
        "std_time": std_time,
        "algorithm": algorithm,
        "dataset_name": dataset_name,
        "type": "parallel"
    })
n_row_time_df = pd.DataFrame(n_row_time_list)

# Save dataframe as a csv to output directory
# out_file = f"results/{param_row}.csv"
# n_row_time_df.to_csv(out_file, encoding='utf-8', index=False)
print("Done!!")

   test_accuracy_percent data_set  fold_id             algorithm
0                  100.0      zip        0  KNeighborsClassifier
0                  100.0      zip        1  KNeighborsClassifier
0                  100.0      zip        2  KNeighborsClassifier
   test_accuracy_percent data_set  fold_id             algorithm
0                  100.0      zip        0  KNeighborsClassifier
0                  100.0      zip        1  KNeighborsClassifier
0                  100.0      zip        2  KNeighborsClassifier
   test_accuracy_percent data_set  fold_id             algorithm
0                  100.0      zip        0  KNeighborsClassifier
0                  100.0      zip        1  KNeighborsClassifier
0                  100.0      zip        2  KNeighborsClassifier
   test_accuracy_percent data_set  fold_id             algorithm
0             100.000000      zip        0  KNeighborsClassifier
0              97.014925      zip        1  KNeighborsClassifier
0              98.484848 

In [5]:
n_row_time_df

Unnamed: 0,n_rows,mean_time,std_time,algorithm,dataset_name,type
0,100,1.053829,0.002496,KNeighborsClassifier,zip,parallel
1,200,1.106328,0.01164,KNeighborsClassifier,zip,parallel
2,300,1.164248,0.003834,KNeighborsClassifier,zip,parallel
3,400,1.389387,0.098354,KNeighborsClassifier,zip,parallel
4,500,1.640957,0.167401,KNeighborsClassifier,zip,parallel
5,600,1.604709,0.017487,KNeighborsClassifier,zip,parallel
6,700,1.628374,0.026981,KNeighborsClassifier,zip,parallel
7,800,1.624474,0.006951,KNeighborsClassifier,zip,parallel
8,900,1.620542,0.010193,KNeighborsClassifier,zip,parallel
9,1000,1.606624,0.007292,KNeighborsClassifier,zip,parallel
