In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
import numpy as np
from format_data import *

# Entraînement

### Importation et formatage des données

In [13]:
# Read the raw training set (first CSV column becomes the index) and run it
# straight through the project's feature-engineering helper.
train_data = format_data(
    pd.read_csv("./data/GAN_train.csv", index_col=0)
)

# Quick visual sanity check of the engineered features.
train_data.head()

Unnamed: 0,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,T200,T500,...,TMQ_WINDBOT_INTERACTION,QREFHT_TREFHT_INTERACTION,TMQ_SQUARE,WIND850_MAGNITUDE_SQUARE,WINDBOT_MAGNITUDE_SQUARE,TREFHT_SQUARE,MONTH_SIN,MONTH_COS,LAPSE_RATE,Label
0,14.278533,-3.173287,1.833835,-7.581432,3.399386,0.006412,103070.3594,103070.3594,214.141617,256.000519,...,118.635566,1.842325,203.876504,13.432702,69.033935,82562.958151,-0.866025,0.5,0.13953,0
1,44.26907,-3.000973,0.375796,-2.194901,-3.694045,0.017199,101155.2188,101155.2188,219.897949,266.757843,...,190.220828,5.165643,1959.750529,9.14706,18.463556,90207.229732,-0.866025,0.5,0.1562,0
2,29.716904,5.699519,-4.10604,3.079108,-2.678627,0.01248,101841.1641,101841.1641,215.238892,264.979614,...,121.279737,3.674712,883.094365,49.344087,16.655949,86705.590264,-0.866025,0.5,0.165802,2
3,36.301456,-0.657589,-1.584453,-0.903698,-2.157597,0.017055,101450.9688,101450.9688,217.602081,267.198273,...,84.91665,5.140841,1317.79574,2.942915,5.471893,90860.960406,-0.866025,-0.5,0.165321,0
4,29.445889,22.721797,-15.070397,1.895406,-7.071098,0.008437,101568.182752,101137.479833,215.028439,265.348473,...,215.565199,2.459911,867.06035,743.396917,53.592988,85008.390575,-0.5,-0.866025,0.167733,2


In [67]:
import numpy as np
from scipy.sparse import csr_matrix, hstack

class SoftmaxRegression:
    """Multinomial logistic (softmax) regression trained by full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size of each gradient-descent update.
    n_iterations : int
        Maximum number of full-batch updates.
    regularization : str
        'L1' or 'L2'; any other value disables the penalty. The bias row is
        never penalised, for either penalty.
    reg_coeff : float
        Multiplier applied to the penalty (sub)gradient.
    weights : bool
        If True, samples are weighted inversely to their class frequency.
    early_stopping : bool
        If True and validation data is passed to ``fit``, training stops once
        validation accuracy has not improved for ``patience`` consecutive
        iterations, and the best weights seen are kept.
    patience : int
        Number of consecutive non-improving iterations tolerated.
    verbose : bool
        If True (default), print the accuracy after every iteration.
    random_state : int or None
        Seed for the weight initialisation. ``None`` draws from NumPy's
        global random state.

    Attributes
    ----------
    theta : ndarray of shape (n_features + 1, n_classes)
        Learned weights; row 0 holds the bias terms.
    classes_ : ndarray
        Sorted unique labels seen in ``fit``; column k of ``theta`` belongs
        to ``classes_[k]``.
    sample_weights : ndarray of shape (n_samples,)
        Per-sample weights used during the last ``fit``.
    """

    def __init__(self, learning_rate=0.01, n_iterations=100000, regularization='L1', reg_coeff=0.01, weights=False, early_stopping=True, patience=10, verbose=True, random_state=None):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.regularization = regularization
        self.reg_coeff = reg_coeff
        self.use_weights = weights
        self.sample_weights = None
        self.theta = None
        self.classes_ = None
        self.early_stopping = early_stopping
        self.patience = patience
        self.verbose = verbose
        self.random_state = random_state

    @staticmethod
    def _add_bias(X):
        """Return ``X`` as a dense 2-D array with a leading column of ones."""
        # Sparse matrices expose ``toarray``; everything else goes through NumPy.
        dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X, dtype=float)
        dense = np.atleast_2d(dense)
        return np.hstack([np.ones((dense.shape[0], 1)), dense])

    def softmax(self, scores):
        """Row-wise softmax; the row maximum is subtracted first for numerical stability."""
        exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def compute_class_weights(self, y):
        """Return one weight per sample, inversely proportional to its class frequency.

        Class weights are normalised so that they sum to the number of
        classes. Labels need not be contiguous integers: counting is based on
        the distinct labels actually present, so no class has a zero count.
        """
        _, class_index, class_counts = np.unique(
            np.asarray(y).ravel(), return_inverse=True, return_counts=True
        )
        class_weights = 1. / class_counts
        class_weights = class_weights / np.sum(class_weights) * len(class_counts)
        return class_weights[class_index]

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """Fit the model by full-batch gradient descent.

        Parameters
        ----------
        X_train : array-like or sparse matrix of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)
            Class labels (any sortable values).
        X_val, y_val : optional
            Validation data. Used only when ``early_stopping`` is True and
            both are provided.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` disagree on the number of samples.
        """
        X_train_bias = self._add_bias(X_train)
        n_samples, n_features = X_train_bias.shape

        # Encode labels as 0..K-1 so arbitrary label values work.
        self.classes_, y_index = np.unique(np.asarray(y_train).ravel(), return_inverse=True)
        n_classes = len(self.classes_)
        if y_index.shape[0] != n_samples:
            raise ValueError(
                f"X_train has {n_samples} samples but y_train has {y_index.shape[0]}"
            )

        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        self.theta = 0.01 * rng.randn(n_features, n_classes)

        if self.use_weights:
            self.sample_weights = self.compute_class_weights(y_train)
        else:
            self.sample_weights = np.ones(n_samples)

        # Loop-invariant quantities, computed once.
        weight_column = self.sample_weights[:, np.newaxis]
        weight_total = np.sum(self.sample_weights)
        y_onehot = np.zeros((n_samples, n_classes))
        y_onehot[np.arange(n_samples), y_index] = 1

        use_validation = self.early_stopping and X_val is not None and y_val is not None
        if use_validation:
            X_val_bias = self._add_bias(X_val)
            y_val_labels = np.asarray(y_val).ravel()

        best_theta = None
        best_val_accuracy = float('-inf')
        no_improvement_count = 0

        for i in range(self.n_iterations):
            probabilities = self.softmax(X_train_bias.dot(self.theta))
            gradient = -X_train_bias.T.dot(weight_column * (y_onehot - probabilities)) / weight_total

            if self.regularization == 'L2':
                penalty = self.theta.copy()
            elif self.regularization == 'L1':
                penalty = np.sign(self.theta)
            else:
                penalty = None
            if penalty is not None:
                penalty[0] = 0  # the bias row is never regularised
                gradient += self.reg_coeff * penalty

            self.theta -= self.learning_rate * gradient

            message = None
            if self.verbose:
                train_pred = np.argmax(X_train_bias.dot(self.theta), axis=1)
                train_accuracy = np.mean(train_pred == y_index)
                message = f"Iteration {i}: Training accuracy: {train_accuracy:.4f}"

            if use_validation:
                val_pred = self.classes_[np.argmax(X_val_bias.dot(self.theta), axis=1)]
                val_accuracy = np.mean(val_pred == y_val_labels)
                if message is not None:
                    message += f", Validation accuracy: {val_accuracy:.4f}"

                if val_accuracy > best_val_accuracy:
                    best_val_accuracy = val_accuracy
                    best_theta = self.theta.copy()
                    no_improvement_count = 0
                else:
                    no_improvement_count += 1

            if message is not None:
                print(message)

            if use_validation and no_improvement_count >= self.patience:
                if self.verbose:
                    # i is zero-based, so i + 1 updates have been performed.
                    print(f"Early stopping after {i + 1} iterations")
                    print(f"Best validation accuracy: {best_val_accuracy:.4f}")
                break

        # Keep the best validation weights whether the loop stopped early or
        # simply ran out of iterations.
        if best_theta is not None:
            self.theta = best_theta

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.theta is None:
            raise RuntimeError("SoftmaxRegression must be fitted before calling predict()")

        scores = self._add_bias(X).dot(self.theta)
        winners = np.argmax(scores, axis=1)
        # classes_ is only None if theta was assigned by hand; fall back to indices.
        if self.classes_ is None:
            return winners
        return self.classes_[winners]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the labels ``y``."""
        predictions = self.predict(X)
        accuracy = np.mean(predictions == np.asarray(y).ravel())
        return accuracy

In [79]:
# Hyper-parameters for the softmax classifier used in the rest of the notebook.
# NOTE(review): the training log recorded in this notebook shows accuracy
# jumping between ~0.05 and ~0.68 from one iteration to the next, and the
# feature columns span very different magnitudes (e.g. PS ~1e5 vs QREFHT ~1e-2).
# This may mean the updates are unstable on unscaled inputs; consider
# standardising the features. TODO confirm.
logisticRegression = SoftmaxRegression(
    learning_rate=0.00001,
    n_iterations=100000,
    regularization='L1',
    reg_coeff=0.05,
    weights=True,          # re-weight samples by inverse class frequency
    early_stopping=True,   # only takes effect when validation data is passed to fit()
    patience=100,
)

In [80]:
# Separate the feature matrix from the target column.
X = train_data.drop(columns=["Label"])
y = train_data["Label"]

# Seeded 70/30 hold-out split, stratified on the label so both parts keep the
# same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [81]:
# Train on the 70% split; the held-out 30% is passed as validation data for early stopping.
logisticRegression.fit(X_train, y_train, X_val=X_val, y_val=y_val)

Iteration 0: Training accuracy: 0.4262, Validation accuracy: 0.4363
Iteration 1: Training accuracy: 0.2086, Validation accuracy: 0.2098
Iteration 2: Training accuracy: 0.6086, Validation accuracy: 0.6111
Iteration 3: Training accuracy: 0.2230, Validation accuracy: 0.2243
Iteration 4: Training accuracy: 0.1117, Validation accuracy: 0.1136
Iteration 5: Training accuracy: 0.2165, Validation accuracy: 0.2184
Iteration 6: Training accuracy: 0.5320, Validation accuracy: 0.5387
Iteration 7: Training accuracy: 0.2345, Validation accuracy: 0.2345
Iteration 8: Training accuracy: 0.0480, Validation accuracy: 0.0483
Iteration 9: Training accuracy: 0.5794, Validation accuracy: 0.5848
Iteration 10: Training accuracy: 0.1777, Validation accuracy: 0.1718
Iteration 11: Training accuracy: 0.6829, Validation accuracy: 0.6840
Iteration 12: Training accuracy: 0.1705, Validation accuracy: 0.1635
Iteration 13: Training accuracy: 0.2305, Validation accuracy: 0.2279
Iteration 14: Training accuracy: 0.0487, Val

In [83]:
import pandas as pd

# Requires train_data from the training section: its columns define the
# feature layout the fitted model expects.
test_data = pd.read_csv("./data/test.csv")

# Run the test set through the same formatting helper as the training set.
test_data = format_data(test_data, is_test=True)

# Feature columns in training order (everything except the target).
columns_without_label = [col for col in train_data.columns if col != "Label"]

# Training features that the formatted test set does not contain
# (such as SEASON_Spring or SEASON_Winter). Computed rather than hard-coded so
# that any missing training column is handled, not just a fixed pair.
missing_columns = [col for col in columns_without_label if col not in test_data.columns]

# Align the test frame with the training layout in one step: missing columns
# are created and filled with 0, extra columns are dropped, and the order
# matches train_data.
test_data = test_data.reindex(columns=columns_without_label, fill_value=0)

test_data.head()

Unnamed: 0,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,T200,T500,...,TMQ_WIND850_INTERACTION,TMQ_WINDBOT_INTERACTION,QREFHT_TREFHT_INTERACTION,TMQ_SQUARE,WIND850_MAGNITUDE_SQUARE,WINDBOT_MAGNITUDE_SQUARE,TREFHT_SQUARE,MONTH_SIN,MONTH_COS,LAPSE_RATE
0,25.907482,6.66207,-17.510447,-7.432653,-3.93603,0.010624,101532.5391,101532.5391,213.092209,256.032043,...,485.375771,217.895122,3.081385,671.197631,350.998912,70.736668,84123.171146,-0.866025,0.5,0.143133
1,25.907482,6.66207,-17.510447,-7.432653,-3.93603,0.010624,101532.5391,101532.5391,213.092209,256.032043,...,485.375771,217.895122,3.081385,671.197631,350.998912,70.736668,84123.171146,-0.866025,0.5,0.143133
2,27.019733,4.951319,-17.341263,-7.286631,-3.150316,0.01089,101513.0234,101513.0234,213.161011,255.616837,...,487.281212,214.495643,3.157487,730.065995,325.234953,63.019482,84065.558951,-0.866025,0.5,0.141519
3,27.019733,4.951319,-17.341263,-7.286631,-3.150316,0.01089,101513.0234,101513.0234,213.161011,255.616837,...,487.281212,214.495643,3.157487,730.065995,325.234953,63.019482,84065.558951,-0.866025,0.5,0.141519
4,26.516499,5.362008,-17.227922,-7.257047,-2.907396,0.010821,101505.1484,101505.1484,213.188248,255.49881,...,478.439089,207.300143,3.138607,703.124696,325.552442,61.117679,84123.985464,-0.866025,0.5,0.141035


In [84]:
# Predicted class for every test row (despite its name, y_test holds
# predictions, not ground-truth labels).
y_test = logisticRegression.predict(test_data)

# Tally how many rows were assigned to each predicted class.
labels, counts = np.unique(y_test, return_counts=True)

for idx in range(len(labels)):
    label, count = labels[idx], counts[idx]
    print(f"Label {label}: {count} occurrences")

Label 0: 9645 occurrences
Label 2: 675 occurrences


In [86]:
# Build the two-column prediction table: a 1-based row number followed by the
# predicted label.
df = pd.DataFrame({'Label': y_test})
df.insert(0, 'SNo', range(1, len(y_test) + 1))

# Write the table to disk without the DataFrame index.
df.to_csv("logistic_prev.csv", index=False)