<a href="https://colab.research.google.com/github/BrouthenKamel/HAICK-2023/blob/main/Sonatrach_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
from tensorflow import keras as keras
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

In [2]:
# Mount Google Drive inside the Colab VM at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# IPython magic: switch the working directory to the mount point.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [3]:
# Folder on the mounted Drive from which the challenge CSV files are read.
data_path = '/gdrive/MyDrive/Sonatrach_Dataset'

In [4]:
# Read the sample, training and test CSV files (in that order) from the data folder.
data_sample, data_train, data_test = [
    pd.read_csv(data_path + suffix)
    for suffix in ('/sample.csv', '/train_dataset.csv', '/test_dataset.csv')
]

In [5]:
def make_mi_scores(X, y, random_state=None):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column labels become the index of the result.
    y : array-like
        Regression target, one value per row of ``X``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_regression``. The estimator is
        stochastic, so pass an int to get the same scores on every run.
        The default keeps the previous (unseeded) behaviour.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by feature name and sorted from
        the highest score to the lowest.
    """
    raw_scores = mutual_info_regression(X, y, random_state=random_state)
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

In [None]:
# Score every predictor column against the "ROP (Time)" target and display the ranking.
mi_scores = make_mi_scores(
    X=data_train.drop(columns=["ROP (Time)"]),
    y=data_train["ROP (Time)"],
)
mi_scores

In [10]:
# Separate the regression target from the predictor columns.
y = data_train["ROP (Time)"]
X = data_train.drop("ROP (Time)", axis="columns")

In [12]:
# Standardise the features and wrap the resulting array back into a labelled DataFrame.
# The fitted `scaler` is kept so the same transformation can be applied to other data.
scaler = StandardScaler()
X_standardized = pd.DataFrame(
    scaler.fit_transform(X),
    columns=data_train.drop(columns = ["ROP (Time)"]).columns,
)

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 99%) of the variance.
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X_standardized)

# Label the projected columns PC1, PC2, ... and keep them as a DataFrame.
component_names = ["PC" + str(k) for k in range(1, X_pca.shape[1] + 1)]
X_pca = pd.DataFrame(X_pca, columns=component_names)

X_pca.head()

In [23]:
# (rows, number of principal components kept) of the projected training features.
X_pca.shape

(72000, 25)

In [17]:
# Baseline: a linear model trained with SGD, tuned by a small grid search.

# `Model` is imported from the public tf.keras path; `keras.engine` is a private
# module that is not available in newer Keras releases.
from tensorflow.keras.models import Model
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Regularisation strengths and penalty types to try.
param_grid = {'alpha' : [0.02, 0.01, 0.05],
              'penalty' : ['l1', 'l2'],
              }

# Seeded so that re-running the cell selects the same hyperparameters.
model = SGDRegressor(random_state=42)

# 5-fold cross-validated search over the grid, fitted on the training split only.
grid_search = GridSearchCV(model, param_grid, cv=5)

grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Root mean squared error on the hold-out split. Stored as `sgd_rmse` (not `rmse`)
# so it cannot overwrite the `rmse` metric function passed to the Keras model.
sgd_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root mean squared error: ", sgd_rmse)

Best hyperparameters:  {'alpha': 0.01, 'penalty': 'l1'}
Mean squared error:  13.779540547405892


In [22]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Built from Keras backend ops so it can be passed to ``model.compile`` as a
    metric. The mean is taken over every element of the tensors it receives.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Stop training once the validation loss has not improved for 5 consecutive epochs,
# and roll the model back to the weights of its best epoch instead of keeping the
# weights from the last (worse) epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [34]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling give the same result on every run.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor on the PCA components: four ReLU layers
# (128, 64, 32, 16 units) with light dropout, ending in one linear output.
model = Sequential(
    [
        Dense(128, activation='relu', input_shape=(X_pca.shape[1],)),
        Dropout(0.1),
        Dense(64, activation='relu'),
        Dropout(0.1),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='linear'),
    ]
)

# Optimise MSE and report the custom RMSE metric alongside it.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[rmse])

# 30% of the training split is set aside as validation data for early stopping.
model.fit(X_train.to_numpy(), y_train.to_numpy(), epochs = 50, validation_split=0.3, callbacks=[early_stop])

# Evaluate on the hold-out split with the same NumPy input type used for
# training, and print the result so the cell shows the final scores.
loss, metric = model.evaluate(X_test.to_numpy(), y_test.to_numpy())
print(f"Test MSE: {loss:.4f} | Test RMSE metric: {metric:.4f}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


In [35]:
# Apply the already-fitted scaler and PCA to the test features (transform only, no refit).
data_test_standard = scaler.transform(data_test)

data_test_pca = pca.transform(data_test_standard)
print(data_test_pca.shape)
component_names = [f"PC{i+1}" for i in range(data_test_pca.shape[1])]
data_test_pca = pd.DataFrame(data_test_pca, columns=component_names)

# The network has a single output unit, so predictions come back with one column.
data_predict = model.predict(data_test_pca.to_numpy())

# Build the submission table in one vectorised step instead of appending row by row.
# NOTE(review): ids are the 0-based row positions of data_test; confirm this
# matches the id column expected by the challenge (see sample.csv).
submission = {
    "id": np.arange(len(data_predict)),
    # Upcast to float64 so the column is written as plain double-precision values.
    "ROP (Time)": data_predict[:, 0].astype("float64"),
}

submit = pd.DataFrame(submission)
submit.to_csv('/content/submission_deep2.csv', index=False)



(14394, 25)
