In [None]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
import os
import s3fs
import matplotlib.pyplot as plt 

from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.layers import Dense, GRU, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.activations import linear, relu, sigmoid
from tensorflow.keras.regularizers import l2

from modules.utils import pre_process_data, encoded_categorical_features

In [None]:
# Build the S3 client from the endpoint exposed through the environment.
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": S3_ENDPOINT_URL})

# Location of the raw training CSV inside the bucket.
BUCKET = "ebahri-ensae"
FILE_KEY_S3 = "X_train_Hi5.csv"
FILE_PATH_S3 = f"{BUCKET}/{FILE_KEY_S3}"

# Open the remote object in binary mode and let pandas parse it as CSV.
with fs.open(FILE_PATH_S3, mode="rb") as remote_csv:
    x_train = pd.read_csv(remote_csv, sep=",")

In [None]:
# Run the project-local helper `pre_process_data` (imported from modules.utils) on the raw frame
# and unpack its two return values into the feature table and the target.
# NOTE(review): the name suggests NaNs were removed, yet a later cell still measures NaN
# percentages on this table — confirm what the helper actually does.
x_trained_without_nan, y_train = pre_process_data(x_train)

In [None]:

# Preview the first rows of the target returned by pre_process_data.
y_train.head()

In [None]:
# Preview the first rows of the feature table returned by pre_process_data.
x_trained_without_nan.head()

In [None]:
# Lift every pandas display limit (columns, rows, line width, cell width)
# so the per-column report below is printed without truncation.
for display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Share of missing values per column, expressed in percent.
nan_percentage_5 = x_trained_without_nan.isna().mean().mul(100)

# Most incomplete columns first.
print(nan_percentage_5.sort_values(ascending=False))

In [None]:

# Textual "not available" markers found in the INSEE columns, keyed by column name.
# Each marker is replaced by 0 and the column is then cast to float.
insee_sentinels = {
    "insee_%_agri": 'N/A - division par 0',
    "insee_med_living_level": 'N/A - résultat non disponible',
    "insee_%_ind": 'N/A - division par 0',
    "insee_%_const": 'N/A - division par 0',
}

for insee_column, sentinel in insee_sentinels.items():
    cleaned_column = x_trained_without_nan[insee_column].replace({sentinel: 0})
    x_trained_without_nan[insee_column] = cleaned_column.astype(float)

In [None]:
# Encode the feature table with the project-local helper `encoded_categorical_features`
# (imported from modules.utils).
# NOTE(review): the encoding scheme and the return type are not visible here; the next
# cell wraps the result in a DataFrame.
X_final = encoded_categorical_features(x_trained_without_nan)

In [None]:
# Wrap the encoded features in a DataFrame, then print its structural summary
# (columns, non-null counts, dtypes) to check the encoding output.
X_final = pd.DataFrame(X_final)
X_final.info()


In [None]:
# Impute the remaining NaNs with the mean of their own column.
# NOTE(review): the means are computed on the full table, before the train/validation/test
# splits performed in later cells, so held-out rows influence the imputed values —
# consider computing them on the training split only.
X_final = X_final.fillna(X_final.mean())

In [None]:
import seaborn as sns

# Pairwise feature correlations drawn as a heatmap on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(X_final.corr(), annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Build a tf.data pipeline whose elements are (feature row, label) pairs sliced
# along the first axis of X_final and y_train.
dataset = tf.data.Dataset.from_tensor_slices((X_final,y_train ))

In [None]:
# Fixed seed so the train/validation/test partition is reproducible across runs.
SPLIT_SEED = 42

# Split sizes: 60% train, 20% validation, the remainder (~20%) test.
dataset_size = len(X_final)
train_size = int(0.6 * dataset_size)
val_size = int(0.2 * dataset_size)
test_size = dataset_size - train_size - val_size

# Shuffle ONCE. By default `shuffle` draws a new order on every iteration, so the
# take/skip splits below would hold different samples on each epoch and on each
# evaluation pass, letting validation/test rows leak into training.
# `reshuffle_each_iteration=False` freezes the order and keeps the splits disjoint.
dataset = dataset.shuffle(
    buffer_size=dataset_size,
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)

# Carve the frozen order into three disjoint, contiguous slices.
train_dataset = dataset.take(train_size)                # first 60%
val_dataset = dataset.skip(train_size).take(val_size)   # next 20%
test_dataset = dataset.skip(train_size + val_size)      # remaining ~20%

# Batch the splits. Only the training split is re-shuffled (within itself) on each
# epoch, so batch composition still varies without mixing the splits.
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=max(train_size, 1)).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [None]:
# Input width is read from the feature table instead of being hard-coded, so the
# network keeps matching the data if the encoding step changes the column count.
n_features = X_final.shape[1]

# Number of target classes predicted by the softmax output.
NUM_CLASSES = 5

model = Sequential([
    # Single hidden layer of 64 ReLU units fed by the feature vector.
    Dense(64, activation='relu', input_shape=(n_features,)),

    # Output layer: one softmax probability per class.
    Dense(NUM_CLASSES, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the network defined in the previous cell.
model.summary()

In [None]:
def f1_score(y_true, y_pred):
    """Support-weighted F1 score usable as a Keras metric for sparse labels.

    The previous implementation referenced an un-imported backend alias and
    returned ``tp / (tp + eps)``, which is ~1 whatever the predictions are.
    This version computes a per-class F1 and averages it with class-support
    weights, i.e. the same averaging as scikit-learn's ``average='weighted'``.

    Args:
        y_true: integer class labels, shape ``(batch,)`` or ``(batch, 1)``.
        y_pred: class scores/probabilities, shape ``(batch, num_classes)``.

    Returns:
        A scalar float32 tensor holding the weighted F1 of the batch. Keras
        averages metric values over batches, so the epoch figure is an
        approximation of the dataset-level F1.
    """
    eps = tf.keras.backend.epsilon()
    num_classes = tf.shape(y_pred)[-1]

    # Flatten both label vectors so (batch, 1) targets line up with argmax output.
    true_labels = tf.reshape(tf.cast(y_true, tf.int64), [-1])
    pred_labels = tf.reshape(tf.argmax(y_pred, axis=-1), [-1])

    true_one_hot = tf.one_hot(true_labels, depth=num_classes, dtype=tf.float32)
    pred_one_hot = tf.one_hot(pred_labels, depth=num_classes, dtype=tf.float32)

    # Per-class counts over the batch.
    true_positives = tf.reduce_sum(true_one_hot * pred_one_hot, axis=0)
    support = tf.reduce_sum(true_one_hot, axis=0)        # TP + FN
    predicted = tf.reduce_sum(pred_one_hot, axis=0)      # TP + FP

    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (support + predicted).
    f1_per_class = 2.0 * true_positives / (support + predicted + eps)

    # Weight each class by its share of the true labels.
    class_weights = support / (tf.reduce_sum(support) + eps)
    return tf.reduce_sum(f1_per_class * class_weights)

In [None]:
# Training setup: integer labels scored against softmax probabilities, Adam optimiser.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

# Halve the learning rate after 3 epochs without validation-loss improvement,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Write the model to disk only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath='../models/first_model_3.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Train for 10 epochs, validating after each one; `history` feeds the plots below.
training_callbacks = [reduce_lr, checkpoint]
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=training_callbacks,
)

In [None]:
# Training and validation loss per epoch, one curve per history key.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name], label=curve_name)

plt.xlabel('Epoch')
plt.ylabel('Error')
plt.legend()
plt.grid(True)

In [None]:
from sklearn.metrics import f1_score

# Collect predictions and labels in ONE pass over the validation set.
# Running `model.predict(val_dataset)` and then iterating `val_dataset` a second
# time for the labels is only correct if both passes yield the same order; with a
# reshuffling upstream `shuffle` they do not, and the score is computed on
# mismatched (prediction, label) pairs.
y_pred_batches = []
y_true_batches = []
for batch_features, batch_labels in val_dataset:
    y_pred_batches.append(model.predict_on_batch(batch_features))
    y_true_batches.append(batch_labels.numpy())

y_pred = np.concatenate(y_pred_batches, axis=0)
y_true = np.concatenate(y_true_batches, axis=0)

# The softmax output holds one probability per class; keep the most likely one.
y_pred_classes = np.argmax(y_pred, axis=1)

# Support-weighted F1 across classes.
f1 = f1_score(y_true, y_pred_classes, average='weighted')

print(f'F1 Score: {f1}')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer, mean_squared_error, accuracy_score
from xgboost import XGBClassifier

# Hold out 20% of the rows for testing, with a fixed seed.
mon_X_train, mon_X_test, mon_y_train, mon_y_test = train_test_split(
    X_final,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are learned on the training part only
# and then applied to both parts.
scaler = StandardScaler().fit(mon_X_train)
X_train_scaled = scaler.transform(mon_X_train)
X_test_scaled = scaler.transform(mon_X_test)

# Hyper-parameters of the XGBoost classifier (5-class softmax objective).
xgb_params = {
    "objective": "multi:softmax",
    "num_class": 5,
    "random_state": 42,
    "eval_metric": "mlogloss",
    "colsample_bytree": 0.7,
    "learning_rate": 0.2,
    "max_depth": 7,
    "n_estimators": 500,
    "reg_alpha": 0.5,
    "reg_lambda": 0.0,
    "subsample": 1.0,
}

# Note: this rebinds `model`, which held the Keras network in earlier cells.
model = XGBClassifier(**xgb_params)

# Train on the standardised training part.
model.fit(X_train_scaled, mon_y_train)

# Predict the held-out part.
y_pred = model.predict(X_test_scaled)

# Support-weighted F1 on the held-out part.
f1 = f1_score(mon_y_test, y_pred, average='weighted')
print(f"F1-Score pondéré sur le jeu de test : {f1:.2f}")

In [None]:
# Persist the trained classifier to a JSON file (relative path, so it lands in the
# current working directory).
# NOTE: `model` here is the XGBClassifier from the previous cell, which replaced the
# Keras model bound to the same name earlier in the notebook.
model.save_model("xgboost_model_2.json")