In [30]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
import numpy as np
from format_data import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Importation et formatage des données

In [6]:
# Read the raw training CSV and pass it straight through the project's
# `format_data` helper (brought in by the star import at the top of the notebook).
train_data = format_data(pd.read_csv("./data/GAN_train.csv"))

# Classe pour la régression logistique en utilisant seulement numpy et scipy.sparse

In [7]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

class SoftmaxRegression:
    """
    Multinomial (softmax) regression trained with full-batch gradient descent.

    The model keeps a dense weight matrix ``theta`` of shape
    (n_features + 1, n_classes); row 0 holds the intercepts. Inputs are
    converted to ``scipy.sparse.csr_matrix`` internally, and the learning rate
    follows the fixed step schedule in ``get_custom_learning_rate``.
    """

    def __init__(self, n_iterations=100000, regularization=None, reg_coeff=0.01,
                 weights=False, early_stopping=True, patience=500, verbose=False,
                 random_state=None):
        """
        Softmax Regression Classifier Initialization

        Parameters:
        - n_iterations: maximum number of gradient-descent iterations
        - regularization: 'L1' or 'L2' to penalise the non-intercept weights;
          any other value (None, or the string 'None') disables the penalty
        - reg_coeff: regularization coefficient
        - weights: boolean indicating if per-class instance weights should be used
        - early_stopping: boolean indicating if early stopping should be used
          (only takes effect when validation data is passed to ``fit``)
        - patience: number of consecutive iterations without validation-accuracy
          improvement that triggers early stopping
        - verbose: if True, print a message when early stopping triggers
        - random_state: optional seed for the weight initialisation; when None
          the global NumPy random state is used
        """
        self.learning_rate = None  # Set on every iteration by the custom schedule
        self.n_iterations = n_iterations
        self.regularization = regularization
        self.reg_coeff = reg_coeff
        self.use_weights = weights
        self.sample_weights = None
        self.theta = None
        self.early_stopping = early_stopping
        self.patience = patience
        self.verbose = verbose
        self.random_state = random_state
        self.best_iteration = None

    @staticmethod
    def _add_bias(X):
        """Return X as a CSR matrix with a leading column of ones (intercept)."""
        if not isinstance(X, csr_matrix):
            X = csr_matrix(X)
        ones = csr_matrix(np.ones((X.shape[0], 1)))
        return hstack([ones, X], format="csr")

    def softmax(self, scores):
        """
        Computes softmax probabilities for given scores

        Parameters:
        - scores: 2-D array of raw scores, one row per sample

        Returns:
        - Array of the same shape whose rows sum to 1
        """
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large scores.
        exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def compute_class_weights(self, y):
        """
        Computes weights for instances based on their class

        Each present class gets a weight proportional to the inverse of its
        sample count, rescaled so the class weights sum to the number of
        present classes.

        Parameters:
        - y: non-negative integer class labels

        Returns:
        - Array with one weight per instance
        """
        y = np.asarray(y)
        class_sample_counts = np.bincount(y)
        # Label values that never occur have a zero count; skip them so the
        # inverse stays finite.
        present = class_sample_counts > 0
        weights = np.zeros(len(class_sample_counts), dtype=float)
        weights[present] = 1. / class_sample_counts[present]
        weights = weights / np.sum(weights) * np.count_nonzero(present)
        return weights[y]

    def get_custom_learning_rate(self, iteration):
        """Get custom learning rate based on current iteration"""
        if iteration < 25:
            return 0.1
        elif iteration < 50:
            return 0.05
        elif iteration < 150:
            return 0.01
        elif iteration < 300:
            return 0.005
        else:
            return 0.001

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """
        Trains the softmax regression model

        When early stopping is enabled and validation data is given, validation
        accuracy is evaluated after every update and ``theta`` is set to the
        best-scoring weights at the end of training, whether training stopped
        early or used the full iteration budget.

        Parameters:
        - X_train: training data features (dense array-like or sparse matrix)
        - y_train: training data labels (non-negative integers)
        - X_val: validation data features (optional)
        - y_val: validation data labels (optional)

        Raises:
        - ValueError: if the training set is empty or X_train and y_train
          disagree on the number of samples
        """
        X_train_bias = self._add_bias(X_train)
        y_train = np.asarray(y_train)

        n_samples, n_features = X_train_bias.shape
        if n_samples == 0:
            raise ValueError("Cannot fit SoftmaxRegression on an empty training set.")
        if y_train.shape[0] != n_samples:
            raise ValueError("X_train and y_train must contain the same number of samples.")

        # Size the output from the largest label so that labels can be used
        # directly as column indices even when some label value is absent.
        n_classes = int(y_train.max()) + 1

        # Initialize weights with small random values
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        self.theta = rng.randn(n_features, n_classes) * 0.01

        # Compute or set sample weights
        if self.use_weights:
            self.sample_weights = self.compute_class_weights(y_train)
        else:
            self.sample_weights = np.ones(n_samples)

        # Convert labels to one-hot encoding
        y_onehot = np.zeros((n_samples, n_classes))
        y_onehot[np.arange(n_samples), y_train] = 1

        # Build the validation design matrix once instead of on every iteration
        use_validation = self.early_stopping and X_val is not None and y_val is not None
        if use_validation:
            X_val_bias = self._add_bias(X_val)
            y_val = np.asarray(y_val)

        best_theta = None
        best_val_accuracy = float('-inf')
        no_improvement_count = 0
        self.best_iteration = None

        for i in range(self.n_iterations):
            # Update learning rate using custom logic
            self.learning_rate = self.get_custom_learning_rate(i)

            # Compute predictions using current weights
            probabilities = self.softmax(X_train_bias.dot(self.theta))

            # Gradient of the (sample-weighted) cross-entropy
            diff_weighted = self.sample_weights[:, np.newaxis] * (y_onehot - probabilities)
            gradient = -X_train_bias.T.dot(diff_weighted) / n_samples

            # Apply regularization to every row except the intercept (row 0)
            if self.regularization == 'L2':
                gradient[1:] += self.reg_coeff * self.theta[1:]
            elif self.regularization == 'L1':
                gradient[1:] += self.reg_coeff * np.sign(self.theta[1:])

            # Update weights using gradient
            self.theta -= self.learning_rate * gradient

            if not use_validation:
                continue

            val_predictions = np.argmax(X_val_bias.dot(self.theta), axis=1)
            val_accuracy = np.mean(val_predictions == y_val)

            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                best_theta = self.theta.copy()
                self.best_iteration = i
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            if no_improvement_count >= self.patience:
                if self.verbose:
                    print(f"Early stopping after {i} iterations. Best iteration was {self.best_iteration}.")
                break

        # Keep the best validated weights even when the loop ran out of
        # iterations before patience was exhausted.
        if best_theta is not None:
            self.theta = best_theta

    def predict(self, X):
        """
        Predicts class labels for given features

        Parameters:
        - X: data features (dense array-like or sparse matrix)

        Returns:
        - 1-D array of predicted class labels

        Raises:
        - RuntimeError: if the model has not been fitted yet
        """
        if self.theta is None:
            raise RuntimeError("SoftmaxRegression must be fitted before calling predict().")

        # The sparse design matrix times the dense weights gives dense scores;
        # softmax is monotonic, so argmax over raw scores is enough.
        scores = self._add_bias(X).dot(self.theta)
        return np.argmax(scores, axis=1)

    def score(self, X, y):
        """
        Computes accuracy of predictions

        Parameters:
        - X: data features
        - y: true class labels

        Returns:
        - Fraction of samples whose predicted label equals the true label
        """
        return np.mean(self.predict(X) == np.asarray(y))

# Optimisation bayésienne des hyperparamètres

In [9]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from optuna.pruners import MedianPruner

# Assuming SoftmaxRegression class is defined/imported from above

# Feature matrix (every column except the target) and target vector read by `objective`.
X = train_data.drop(columns=['Label'])
y = train_data['Label']
# Lookup table indexed by the 'regularization_index' hyperparameter.
regularizations = ['None', 'L1', 'L2']

def objective(trial):
    """
    Optuna objective: mean validation accuracy of SoftmaxRegression over three
    random stratified 70/30 splits of the module-level `X` / `y`.

    Sampled hyperparameters: 'weights' (bool), 'regularization_index' (index
    into the module-level `regularizations` list) and, when a penalty is
    selected, 'reg_coeff' in [0.0, 0.5]. After each split the split's accuracy
    is reported to the trial, and the trial is pruned if Optuna asks for it.

    Parameters:
    - trial: the optuna trial used to sample hyperparameters and report progress

    Returns:
    - Mean accuracy over the three validation splits

    Raises:
    - optuna.exceptions.TrialPruned: when the pruner decides to stop the trial
    """
    # Sampling order is kept: weights, regularization index, then coefficient.
    use_weights = trial.suggest_categorical('weights', [True, False])
    regularization = regularizations[trial.suggest_int('regularization_index', 0, 2)]

    # The coefficient is only sampled when a penalty is actually applied.
    reg_coeff = 0 if regularization == 'None' else trial.suggest_float('reg_coeff', 0.0, 0.5)

    split_accuracies = []
    for split_idx in range(3):
        X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size=0.3, stratify=y)

        # Standardise, then PCA-whiten; both transforms are fitted on the training part only.
        scaler = StandardScaler()
        pca = PCA(whiten=True)
        X_fit_ready = pca.fit_transform(scaler.fit_transform(X_fit))
        X_holdout_ready = pca.transform(scaler.transform(X_holdout))

        # The hold-out part also drives early stopping inside fit().
        model = SoftmaxRegression(
            n_iterations=1_000_000,
            reg_coeff=reg_coeff,
            regularization=regularization,
            early_stopping=True,
            weights=use_weights,
            verbose=False,
        )
        model.fit(X_fit_ready, y_fit, X_holdout_ready, y_holdout)

        split_accuracy = accuracy_score(y_holdout, model.predict(X_holdout_ready))
        split_accuracies.append(split_accuracy)

        # Report this split's accuracy as the intermediate value for step `split_idx`.
        trial.report(split_accuracy, split_idx)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return np.mean(split_accuracies)

def save_intermediate_results(study, trial):
    """
    Callback to save the trials dataframe after each iteration.

    Writes `study.trials_dataframe()` to
    "bayes/softmax_regression_optimization_results.csv", creating the "bayes"
    directory first if it is missing, so a missing folder cannot abort the
    study after the first finished trial.

    Parameters:
    - study: object exposing `trials_dataframe()` (the running optuna study)
    - trial: the trial that just finished (unused; required by the callback signature)
    """
    from pathlib import Path  # local import keeps the callback self-contained

    output_path = Path("bayes/softmax_regression_optimization_results.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    study.trials_dataframe().to_csv(output_path, index=False)

# Run 150 trials of `objective`, maximising its return value, with a
# MedianPruner; the callback rewrites the results CSV after every trial.
study = optuna.create_study(direction='maximize', pruner=MedianPruner())
study.optimize(objective, n_trials=150, callbacks=[save_intermediate_results])

# Write the complete trial table once more after the study has finished.
final_results = study.trials_dataframe()
final_results.to_csv("bayes/softmax_regression_optimization_results.csv", index=False)

# Summarise the best trial in human-readable form.
best_trial = study.best_trial
print("\nBest trial:")
print(f"  Value (Accuracy): {best_trial.value:.4f}")
for key, value in best_trial.params.items():
    if key == 'regularization_index':
        # Translate the sampled index back to the regularization name.
        summary_line = f"  regularization: {regularizations[int(value)]}"
    elif key == 'weights':
        summary_line = f"  {key}: {'Yes' if value else 'No'}"
    else:
        summary_line = f"  {key}: {value}"
    print(summary_line)

[I 2023-11-04 00:15:46,294] A new study created in memory with name: no-name-dbf96dc7-9f38-442f-af5e-a21a4d32d8de


[I 2023-11-04 00:23:29,401] Trial 0 finished with value: 0.7626453488372092 and parameters: {'weights': True, 'regularization_index': 1, 'reg_coeff': 0.004694877489240357}. Best is trial 0 with value: 0.7626453488372092.
[I 2023-11-04 00:23:50,678] Trial 1 finished with value: 0.8046996124031008 and parameters: {'weights': False, 'regularization_index': 2, 'reg_coeff': 0.4535725735761881}. Best is trial 1 with value: 0.8046996124031008.
[I 2023-11-04 00:32:34,953] Trial 2 finished with value: 0.7796996124031009 and parameters: {'weights': True, 'regularization_index': 2, 'reg_coeff': 0.1467906434770233}. Best is trial 1 with value: 0.8046996124031008.
[I 2023-11-04 00:41:40,776] Trial 3 finished with value: 0.7758236434108529 and parameters: {'weights': True, 'regularization_index': 2, 'reg_coeff': 0.1704584342722788}. Best is trial 1 with value: 0.8046996124031008.
[I 2023-11-04 00:42:00,833] Trial 4 finished with value: 0.7316860465116278 and parameters: {'weights': False, 'regulariz


Best trial:
  Value (Accuracy): 0.8349
  weights: No
  regularization: None


# Agrégation des prédictions pour 500 modèles différents avec les meilleurs hyperparamètres afin de réduire la variance des prédictions

In [46]:
X = train_data.drop('Label', axis=1)
y = train_data['Label']

# Number of independently trained models whose test predictions are aggregated.
N_MODELS = 500

# NOTE(review): `test_data` is not defined in any cell visible here — presumably
# it is loaded and formatted elsewhere; confirm it exists on a fresh kernel.
# One column of test-set predictions per model. The row count follows the test
# set instead of being hard-coded, and referencing `test_data` here makes a
# missing variable fail immediately rather than after the first model is trained.
total_predictions = np.zeros((len(test_data), N_MODELS))

# Per-class metric values, one list entry per trained model.
accumulated_metrics = {
    'Precision': {0: [], 1: [], 2: []},
    'Recall': {0: [], 1: [], 2: []},
    'F1': {0: [], 1: [], 2: []},
    'Accuracy': {0: [], 1: [], 2: []}
}

for i in range(N_MODELS):
    # Fresh random stratified 80/20 split for every model
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y)

    # Scale the features (statistics fitted on the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)

    # Apply PCA whitening
    pca = PCA(whiten=True)
    X_train_whitened = pca.fit_transform(X_train_scaled)
    X_valid_whitened = pca.transform(X_valid_scaled)

    # Initialize and train the model.
    # NOTE(review): the validation split drives early stopping here and is also
    # the split the metrics below are computed on, so those metrics are likely
    # optimistic.
    regressor = SoftmaxRegression(n_iterations=1_000_000, reg_coeff=0, regularization="None", early_stopping=True, weights=False, verbose=False)
    regressor.fit(X_train_whitened, y_train, X_valid_whitened, y_valid)

    # Predict on the validation set
    y_pred = regressor.predict(X_valid_whitened)

    # Generate a classification report for this split
    report = classification_report(y_valid, y_pred, output_dict=True, zero_division=0)

    # Per-class metrics for this split.
    # NOTE(review): cm[label, label] / row-sum is the fraction of true `label`
    # samples predicted correctly, i.e. the same quantity as that class's
    # recall — the 'Accuracy' and 'Recall' rows in the output are identical.
    cm = confusion_matrix(y_valid, y_pred)
    for label in range(len(cm)):
        label_str = str(label)
        accumulated_metrics['Accuracy'][label].append(cm[label, label] / np.sum(cm[label]))
        accumulated_metrics['Precision'][label].append(report[label_str]['precision'])
        accumulated_metrics['Recall'][label].append(report[label_str]['recall'])
        accumulated_metrics['F1'][label].append(report[label_str]['f1-score'])

    # Predict on the test set with this model's own scaler and PCA
    X_test = scaler.transform(test_data)
    X_test_whitened = pca.transform(X_test)
    total_predictions[:, i] = regressor.predict(X_test_whitened)

# Average every accumulated metric list, keeping the {measure: {class: value}} layout
average_metrics = {
    measure: {class_label: np.mean(values) for class_label, values in classes.items()}
    for measure, classes in accumulated_metrics.items()
}

# Human-readable names for the integer class labels
class_label_names = {0: 'Normal', 1: 'Tropical Cyclone', 2: 'Atmospheric River'}

# Row order of the final table
reordered_measures = ['Accuracy', 'Precision', 'Recall', 'F1']

# Build the table in one constructor call instead of growing an empty frame
# cell by cell with `.at`: classes become the index, measures the columns;
# then rename the classes, order the measures and transpose so that measures
# are rows and classes are columns.
final_metrics_df = (
    pd.DataFrame(average_metrics)
    .rename(index=class_label_names)[reordered_measures]
    .T
)

# Display the final DataFrame
final_metrics_df

Unnamed: 0,Normal,Tropical Cyclone,Atmospheric River
Accuracy,0.924998,0.74461,0.56443
Precision,0.833137,0.834148,0.785644
Recall,0.924998,0.74461,0.56443
F1,0.876624,0.785904,0.656131


In [47]:
# Predictions were stored in a float array; np.bincount needs integers.
total_predictions_int = total_predictions.astype(int)

# Majority vote per row: the label predicted most often across the models
# (ties resolve to the smallest label, as argmax returns the first maximum).
y_test = np.apply_along_axis(
    lambda votes: np.bincount(votes).argmax(),
    axis=1,
    arr=total_predictions_int,
)

In [48]:
# How many test rows were assigned to each label by the majority vote.
labels, counts = np.unique(y_test, return_counts=True)

label_counts = dict(zip(labels, counts))
for label, count in label_counts.items():
    print(f"Label {label}: {count} occurrences")

Label 0: 7931 occurrences
Label 1: 813 occurrences
Label 2: 1576 occurrences


In [50]:
# Submission file: 1-based row id alongside the majority-vote label.
submission_ids = range(1, len(y_test) + 1)
df = pd.DataFrame({'SNo': submission_ids, 'Label': y_test})

df.to_csv("logistic_final.csv", index=False)