In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/oily-mlgo-hackathon-data/sample_submission.csv
/kaggle/input/oily-mlgo-hackathon-data/train.csv
/kaggle/input/oily-mlgo-hackathon-data/test.csv


Problem Statement of hackathon: Compressional travel-time (DTC) and shear travel-time (DTS) logs are not acquired in all the wells drilled in a field due to operational constraints. Under such circumstances, machine learning techniques can be used to predict DTC and DTS logs to improve subsurface characterization. The goal is to develop data-driven models by processing  conventional logs from a Well  and use the data-driven models to generate synthetic compressional and shear travel-time logs (DTC and DTS, respectively) in another Well. A robust data-driven model for the desired sonic-log synthesis will result in low prediction errors, which can be quantified in terms of Root Mean Squared Error(RMSE) by comparing the synthesized and the original DTC and DTS logs.


importing the train data

In [2]:
train_data=pd.read_csv('/kaggle/input/oily-mlgo-hackathon-data/train.csv')
train_data.head()

Unnamed: 0,CAL,CNC,GR,HRD,HRM,PE,ZDEN,DTC,DTS
0,-999.0,-999.0,41.4699,-999.0,-999.0,-999.0,-999.0,128.0737,319.0654
1,-999.0,-999.0,42.5053,-999.0,-999.0,-999.0,-999.0,127.8347,318.7825
2,-999.0,-999.0,43.1548,-999.0,-999.0,-999.0,-999.0,127.2307,317.3323
3,-999.0,-999.0,43.241,-999.0,-999.0,-999.0,-999.0,126.2917,313.6486
4,-999.0,-999.0,40.3218,-999.0,-999.0,-999.0,-999.0,125.3985,307.8903


importing test data

In [3]:
test_data=pd.read_csv('/kaggle/input/oily-mlgo-hackathon-data/test.csv')
test_data.head()

Unnamed: 0,CAL,CNC,GR,HRD,HRM,PE,ZDEN
0,8.5781,0.3521,55.1824,0.8121,0.781,6.8291,2.3256
1,8.5781,0.3639,57.0114,0.8038,0.7723,6.81,2.3255
2,8.5781,0.3703,58.9263,0.7444,0.7048,6.7766,2.3212
3,8.5625,0.3667,57.3308,0.7169,0.6542,6.7219,2.3119
4,8.5781,0.35,53.0624,0.6845,0.6109,6.6384,2.2982


In [4]:
X_train=train_data.iloc[:,0:7]
X_train.head()

Unnamed: 0,CAL,CNC,GR,HRD,HRM,PE,ZDEN
0,-999.0,-999.0,41.4699,-999.0,-999.0,-999.0,-999.0
1,-999.0,-999.0,42.5053,-999.0,-999.0,-999.0,-999.0
2,-999.0,-999.0,43.1548,-999.0,-999.0,-999.0,-999.0
3,-999.0,-999.0,43.241,-999.0,-999.0,-999.0,-999.0
4,-999.0,-999.0,40.3218,-999.0,-999.0,-999.0,-999.0


In [5]:
y1_train=train_data['DTC']
y2_train=train_data['DTS']
y1_train.head()

0    128.0737
1    127.8347
2    127.2307
3    126.2917
4    125.3985
Name: DTC, dtype: float64

In [6]:
y2_train.head()

0    319.0654
1    318.7825
2    317.3323
3    313.6486
4    307.8903
Name: DTS, dtype: float64

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30143 entries, 0 to 30142
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CAL     30143 non-null  float64
 1   CNC     30143 non-null  float64
 2   GR      30143 non-null  float64
 3   HRD     30143 non-null  float64
 4   HRM     30143 non-null  float64
 5   PE      30143 non-null  float64
 6   ZDEN    30143 non-null  float64
dtypes: float64(7)
memory usage: 1.6 MB


check for null values to prevent any errors in the future modelling of the code

In [8]:
X_train.isnull().sum()

CAL     0
CNC     0
GR      0
HRD     0
HRM     0
PE      0
ZDEN    0
dtype: int64

**MUTUAL INFORMATION:**

In feature engineering, mutual information (MI) is used as a measure of the relationship between a feature and the target variable. Mutual information is a non-negative value that measures the degree of association between two variables. It can be used to identify features that are highly correlated with the target variable.

One way to calculate mutual information is by using the concept of entropy, which is a measure of the impurity or randomness of a variable. The mutual information between two variables is calculated by comparing the joint entropy of the two variables to the entropy of each variable individually. A higher mutual information value indicates a stronger association between the two variables.

In practice, mutual information is used to select the most informative features for a machine learning model. The features with the highest mutual information values are typically considered the most important for predicting the target variable.

using mutual information to rank the features for 1st target variable according to target importance

In [9]:
from sklearn.feature_selection import mutual_info_regression
mi1=mutual_info_regression(X_train,y1_train)
mi1_scores=pd.Series(mi1,name="MI Scores",index=X_train.columns)
mi1_scores

CAL     1.063050
CNC     1.178901
GR      0.688997
HRD     0.886866
HRM     0.814120
PE      0.491797
ZDEN    0.729608
Name: MI Scores, dtype: float64

using mutual information to rank the features for 2nd target variable according to target importance

In [10]:
from sklearn.feature_selection import mutual_info_regression
mi2=mutual_info_regression(X_train,y2_train)
mi2_scores=pd.Series(mi2,name="MI Scores",index=X_train.columns)
mi2_scores

CAL     1.014854
CNC     1.017684
GR      0.679854
HRD     0.807763
HRM     0.753592
PE      0.491455
ZDEN    0.626192
Name: MI Scores, dtype: float64

In [11]:
features1=mi1_scores[mi1_scores>0.7].index.tolist()
features2=mi2_scores[mi2_scores>0.7].index.tolist()

set of important features for the 1st target variable

In [12]:
print(features1)

['CAL', 'CNC', 'HRD', 'HRM', 'ZDEN']


set of important features for the 2nd target variable

In [13]:
print(features2)

['CAL', 'CNC', 'HRD', 'HRM']


In [14]:
X_train.shape

(30143, 7)

In [15]:
y1_train.shape

(30143,)

In [16]:
X_train=X_train[features1]

creating a test data as per the features we've selected using information gain

In [17]:
X_test=test_data[features1]

viewing the train and test datasets

In [18]:
X_train.head()

Unnamed: 0,CAL,CNC,HRD,HRM,ZDEN
0,-999.0,-999.0,-999.0,-999.0,-999.0
1,-999.0,-999.0,-999.0,-999.0,-999.0
2,-999.0,-999.0,-999.0,-999.0,-999.0
3,-999.0,-999.0,-999.0,-999.0,-999.0
4,-999.0,-999.0,-999.0,-999.0,-999.0


In [19]:
X_test.head()

Unnamed: 0,CAL,CNC,HRD,HRM,ZDEN
0,8.5781,0.3521,0.8121,0.781,2.3256
1,8.5781,0.3639,0.8038,0.7723,2.3255
2,8.5781,0.3703,0.7444,0.7048,2.3212
3,8.5625,0.3667,0.7169,0.6542,2.3119
4,8.5781,0.35,0.6845,0.6109,2.2982


Scaling to the dataset for more accurate predictions. StandardScaler will be used for the above example

In [20]:
from sklearn.preprocessing import StandardScaler

# create the scaler
scaler = StandardScaler()

# fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# convert the scaled data back to a pandas DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# transform the test data
X_test_scaled = scaler.transform(X_test)

# convert the scaled data back to a pandas DataFrame
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


viewing the final scaled train and test datasets

In [21]:
X_train_scaled.head()

Unnamed: 0,CAL,CNC,HRD,HRM,ZDEN
0,-7.621914,-6.206612,-2.747081,-2.189854,-6.577444
1,-7.621914,-6.206612,-2.747081,-2.189854,-6.577444
2,-7.621914,-6.206612,-2.747081,-2.189854,-6.577444
3,-7.621914,-6.206612,-2.747081,-2.189854,-6.577444
4,-7.621914,-6.206612,-2.747081,-2.189854,-6.577444


In [22]:
X_test_scaled.head()

Unnamed: 0,CAL,CNC,HRD,HRM,ZDEN
0,0.130591,0.153015,-0.00867,-0.001677,0.151576
1,0.130591,0.15309,-0.008693,-0.001696,0.151575
2,0.130591,0.15313,-0.008856,-0.001844,0.151546
3,0.130471,0.153107,-0.008931,-0.001955,0.151484
4,0.130591,0.153001,-0.00902,-0.002049,0.151392


**XGBoost:**

XGBoost (eXtreme Gradient Boosting) is a popular and powerful open-source library for gradient boosting. It is designed to be efficient and scalable for both small and large datasets, and can handle a variety of problem types, including classification and regression.

XGBoost is built on top of the traditional gradient boosting algorithm, but includes several key enhancements

**GridSearchCV**

GridSearchCV is a scikit-learn function that performs an exhaustive search over a specified parameter grid to find the best combination of hyperparameters for a given model. It is commonly used in machine learning to fine-tune the parameters of a model and improve its performance.

First and foremost, we will use GridSearchCV to find the best set of parameters for our xgboost model as per the dataset. This is in accordance to the first target variable. Considering the identical information gain, there shouldn't be a big difference in the ideal hyperparameters for both target features.

In [23]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
model=XGBRegressor()
param_grid = {'learning_rate':[0.05, 0.75, 0.1], 'n_estimators':[200,400,600,800,1000]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
#scoring hyperparameter ensures we're using RMSE as the necessary factor
grid_search.fit(X_train_scaled, y1_train)
print(grid_search.best_params_)

{'learning_rate': 0.75, 'n_estimators': 200}


fitting and predicting the first target variable using the ideal hyperparameters

In [24]:
model1=XGBRegressor(learning_rate=0.75,n_estimators=200)
model1.fit(X_train_scaled,y1_train)
pred1=model1.predict(X_test_scaled)

fitting and predicting the second target variable with the same hyperparameters

In [25]:
model2=XGBRegressor(learning_rate=0.75,n_estimators=200)
model2.fit(X_train_scaled,y2_train)
pred2=model2.predict(X_test_scaled)

Displaying the predicted values and saving it in the form of a csv

In [26]:
output=pd.DataFrame({'DTC':pred1,'DTS':pred2})
output.to_csv('submission.csv',index=False)
print("Successful Save")

Successful Save


viewing the final submission

In [27]:
output.head()

Unnamed: 0,DTC,DTS
0,92.024918,-309.390106
1,95.081619,-337.87381
2,104.579163,226.597305
3,96.983925,66.70903
4,100.533195,203.336578
