In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# machine learning model_pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
def generate_accuracy_models(path_to_clean_train_data, path_to_clean_test_data, path_to_train_labels, num_features):
    """Load data into dataframes"""
    dataset = pd.read_csv(path_to_clean_train_data, header= None, delim_whitespace=True) # delim_whitespace=True
    test_dataset = pd.read_csv(path_to_clean_test_data, header= None, delim_whitespace=True)
    output = pd.read_csv(path_to_train_labels, sep='\n', header= None)
    
    """Format dataframes"""
    x = dataset.iloc[:,0:num_features].values
    y = output.iloc[:,:1].values
    test = test_dataset.iloc[:,0:num_features]
    
    test.head() # check if data is loaded properly
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42) # split training data
    # 2/3 rds for training and 1/3 for testing
    
    # create pipeline for accuracy comparison
    model_pipeline = []
    model_pipeline.append(SVC())
    model_pipeline.append(KNeighborsClassifier())
    model_pipeline.append(DecisionTreeClassifier())
    model_pipeline.append(RandomForestClassifier())
    model_pipeline.append(GaussianNB())
    
    model_list = ['SVM', 'KNN', 'Decision Tree', 'Random Forest', 'Naive Bayes']
    acc_list = []
    cm_list = []
    
    # train the model and predict accuracy score using split training data
    for model in model_pipeline:
        model.fit(x_train, y_train.ravel())
        y_pred = model.predict(x_test)
        acc_list.append(metrics.accuracy_score(y_test, y_pred))
        #cm_list.append(confusion_matrix(y_test, y_pred))
        
    result_df = pd.DataFrame({'Model' :model_list, 'Accuracy' : acc_list})
    print(result_df)

In [3]:
print("Dataset 1 accuracy:")
generate_accuracy_models("./clean_datasets/TrainData1.txt", "./clean_datasets/TestData1.txt",
                         "./datasets/TrainLabel1.txt", 3312)

Dataset 1 accuracy:
           Model  Accuracy
0            SVM      0.86
1            KNN      0.94
2  Decision Tree      0.80
3  Random Forest      0.86
4    Naive Bayes      0.84


In [4]:
print("Dataset 2 accuracy:")
generate_accuracy_models("./datasets/TrainData2.txt", "./datasets/TestData2.txt",
                         "./datasets/TrainLabel2.txt", 9182)

Dataset 2 accuracy:
           Model  Accuracy
0            SVM  0.545455
1            KNN  0.666667
2  Decision Tree  0.545455
3  Random Forest  0.757576
4    Naive Bayes  0.727273


In [5]:
print("Dataset 3 accuracy:")
generate_accuracy_models("./clean_datasets/TrainData3.txt", "./clean_datasets/TestData3.txt",
                         "./datasets/TrainLabel3.txt", 13)

Dataset 3 accuracy:
           Model  Accuracy
0            SVM  0.351611
1            KNN  0.312169
2  Decision Tree  0.259259
3  Random Forest  0.303030
4    Naive Bayes  0.297258


In [6]:
print("Dataset 4 accuracy:")
generate_accuracy_models("./datasets/TrainData4.txt", "./datasets/TestData4.txt",
                         "./datasets/TrainLabel4.txt", 112)

Dataset 4 accuracy:
           Model  Accuracy
0            SVM  0.753864
1            KNN  0.717004
2  Decision Tree  0.833532
3  Random Forest  0.956005
4    Naive Bayes  0.514863


In [7]:
print("Dataset 2 accuracy:")
generate_accuracy_models("./datasets/TrainData5.txt", "./datasets/TestData5.txt",
                         "./datasets/TrainLabel5.txt", 11)

Dataset 2 accuracy:
           Model  Accuracy
0            SVM  0.481081
1            KNN  0.481081
2  Decision Tree  0.602703
3  Random Forest  0.667568
4    Naive Bayes  0.516216
