In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#load the dataset
dataset = pd.read_csv('./hayuci13a/yXT_seeds.csv', header=None)
df = dataset.iloc[:, 1:]
df.head()
labels = dataset.iloc[:,0]

# Identify the most frequent class
unique, counts = np.unique(labels, return_counts=True)
positive_class = unique[np.argmax(counts)]

# Convert to binary classification
y_binary = np.where(labels == positive_class, 1, 0)

#Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df, y_binary, test_size=0.3, random_state=42)

#standardize the feautures
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Print dataset info
print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Class distribution in train set:", np.bincount(y_train))
print("Class distribution in test set:", np.bincount(y_test))


Train set size: (147, 7)
Test set size: (63, 7)
Class distribution in train set: [97 50]
Class distribution in test set: [43 20]


In [29]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#train a linear SVM with regularization paramater with lambda = 1/n
n_train = X_train.shape[0]
svm_model = SVC(kernel='linear', C=1/n_train, random_state=42)
svm_model.fit(X_train, y_train)

#evaluate performance on the test set
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)


# Print dataset info and accuracy
print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Class distribution in train set:", np.bincount(y_train))
print("Class distribution in test set:", np.bincount(y_test))
print("Test set accuracy:", accuracy)


Train set size: (147, 7)
Test set size: (63, 7)
Class distribution in train set: [97 50]
Class distribution in test set: [43 20]
Test set accuracy: 0.6984126984126984


In [34]:
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score

#standardize the feautures
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)

# L1 Normalization
l1_normalizer = Normalizer(norm='l1')
X_train_l1 = l1_normalizer.fit_transform(X_train)
X_test_l1 = l1_normalizer.transform(X_test)

# L2 Normalization
l2_normalizer = Normalizer(norm='l2')
X_train_l2 = l2_normalizer.fit_transform(X_train)
X_test_l2 = l2_normalizer.transform(X_test)

#candidata values for lambda
n_train = X_train.shape[0]
lambda_values = [0.1/n_train, 1/n_train, 10/n_train]

#perform five-fold cross-validation to select the best lambda

def cross_validate_svm(X_train, y_train):
    best_lambda = None
    best_score = 0
    for C in lambda_values:
        svm = SVC(kernel='linear', C=C, random_state=42)
        scores = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
        mean_score = scores.mean()
        if mean_score > best_score:
            best_score = mean_score
            best_lambda = C
    return best_lambda

## Get best lambda for each normalization method
best_lambda_std = cross_validate_svm(X_train_std, y_train)
best_lambda_l1 = cross_validate_svm(X_train_l1, y_train)
best_lambda_l2 = cross_validate_svm(X_train_l2, y_train)


#train and evaluate SVM with best lambda for each method
svm_std = SVC(kernel='linear', C=best_lambda_std, random_state=42)
svm_std.fit(X_train_std, y_train)
y_pred_std = svm_std.predict(X_test_std)
accuracy_std = accuracy_score(y_test, y_pred_std)

svm_l1 = SVC(kernel='linear', C=best_lambda_l1, random_state=42)
svm_l1.fit(X_train_l1, y_train)
y_pred_l1 = svm_l1.predict(X_test_l1)
accuracy_l1 = accuracy_score(y_test, y_pred_l1)

svm_l2 = SVC(kernel='linear', C=best_lambda_l2, random_state=42)
svm_l2.fit(X_train_l2, y_train)
y_pred_l2 = svm_l2.predict(X_test_l2)
accuracy_l2 = accuracy_score(y_test, y_pred_l2)

# Print results
print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Class distribution in train set:", np.bincount(y_train))
print("Class distribution in test set:", np.bincount(y_test))
print("Most frequent class (positive class):", positive_class)
print("Best lambda (Standardization):", best_lambda_std)
print("Best lambda (L1 Normalization):", best_lambda_l1)
print("Best lambda (L2 Normalization):", best_lambda_l2)
print("Test set accuracy (Standardization):", accuracy_std)
print("Test set accuracy (L1 Normalization):", accuracy_l1)
print("Test set accuracy (L2 Normalization):", accuracy_l2)



Train set size: (147, 7)
Test set size: (63, 7)
Class distribution in train set: [97 50]
Class distribution in test set: [43 20]
Most frequent class (positive class): 1
Best lambda (Standardization): 0.06802721088435375
Best lambda (L1 Normalization): 0.0006802721088435375
Best lambda (L2 Normalization): 0.06802721088435375
Test set accuracy (Standardization): 0.8888888888888888
Test set accuracy (L1 Normalization): 0.6825396825396826
Test set accuracy (L2 Normalization): 0.8888888888888888


In [36]:
from scipy.stats import ttest_1samp


accuracy_scores = {
    "Standardization": [],
    "L1 Normalization": [],
    "L2 Normalization": []
}

# 10回のクロスバリデーション
for _ in range(10):
    for method, X_train in zip(accuracy_scores.keys(), [X_train_std, X_train_l1, X_train_l2]):
        svm = SVC(kernel='linear', C=1.0, random_state=42)
        scores = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
        accuracy_scores[method].append(scores.mean())

# 平均精度を計算
mean_accuracies = {method: np.mean(scores) for method, scores in accuracy_scores.items()}
best_method = max(mean_accuracies, key=mean_accuracies.get)

print("Average accuracies:", mean_accuracies)
print("Best method:", best_method)

# 最良手法と他の手法のt検定
best_scores = np.array(accuracy_scores[best_method])
for method, scores in accuracy_scores.items():
    if method != best_method:
        t_stat, p_value = ttest_1samp(scores, best_scores.mean())
        print(f"t-test {best_method} vs {method}: t={t_stat:.4f}, p={p_value:.4f}")

        if p_value < 0.05:
            print(f"-> The difference between {best_method} and {method} is statistically significant (p < 0.05).")
        else:
            print(f"-> No significant difference found between {best_method} and {method}.")



Average accuracies: {'Standardization': np.float64(0.9048275862068967), 'L1 Normalization': np.float64(0.9117241379310345), 'L2 Normalization': np.float64(0.9050574712643679)}
Best method: L1 Normalization
t-test L1 Normalization vs Standardization: t=-186355846649807.9688, p=0.0000
-> The difference between L1 Normalization and Standardization is statistically significant (p < 0.05).
t-test L1 Normalization vs L2 Normalization: t=-inf, p=0.0000
-> The difference between L1 Normalization and L2 Normalization is statistically significant (p < 0.05).


  res = hypotest_fun_out(*samples, **kwds)


# １標本のt検定とは？
仮説：
母集団の平均はある値をとる

必要なデータ：
- 独立
- 連続変数
- 標本が母集団から単純無作為抽出されている
- 母集団は正規分布に従うことを想定｀