In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
import pandas as pd




In [None]:
# Read the curated two-composite table into `data`.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-type inference within a column.
csv_path = '../Curated_data/two_composite_filtered.csv'
data = pd.read_csv(csv_path, low_memory=False)

# 1. Column filtering

In [None]:
# List every column label, one per line, to pick the ones worth keeping
for column_name in data.columns:
    print(column_name)

## 1.1 Filter useless columns

First, drop the columns that have severe problems, such as being entirely NaN or holding the same value in every row.

Core_Diameter: all NaNs <br>
Cumulate: all NaNs <br>
DataSet: all 0s <br>
Recovery_m: all NaNs <br>
Recovery_Pct: all NaNs <br>
Re_ppm: all NaNs <br>

Col numbers: 17, 19, 20, 86, 87, 143 <br>
Core_Diameter, Cumulate, DataSet, Recovery_m, Recovery_Pct, Re_ppm

In [None]:
# Columns flagged as unusable: entirely NaN, or the same value in every row
columns_to_exclude = [
    "Core_Diameter",
    "Cumulate",
    "DataSet",
    "Recovery_m",
    "Recovery_Pct",
    "Re_ppm",
]

# Rebind `data` to a frame without those columns
# (raises KeyError if any of them is absent, e.g. when this cell is run twice)
data = data.drop(columns_to_exclude, axis=1)



## 1.2 Filter uninteresting variables
Second, get rid of the columns that are not physical variables of interest

Sample Number <br>
Length <br>
CoreLoss_m <br>
Date_Logged <br>
Density <br>
Density_kgm3 <br>
Interval_Length <br>
Logged_By <br>
Ori_Confidence <br> 
samp_id <br>
SampleID <br>

Col numbers: 0, 7, 15, 16, 17, 19, 22, 27, 31, 65, 66  <br>
Sample Number, Length, CoreLoss_m, Date_Logged, Density, Density_kgm3, Interval_Length, Logged_By, Ori_Confidence, samp_id, SampleID


In [None]:
# Bookkeeping / logging columns that are not physical variables of interest
additional_columns_to_exclude = [
    "Sample Number",
    "Length",
    "CoreLoss_m",
    "Date_Logged",
    "Density",
    "Density_kgm3",
    "Interval_Length",
    "Logged_By",
    "Ori_Confidence",
    "samp_id",
    "SampleID",
]

# Rebind `data` to a frame without those columns
# (raises KeyError if any of them is absent, e.g. when this cell is run twice)
data = data.drop(additional_columns_to_exclude, axis=1)


Drop every column whose name contains the word "Proportion", as these are not useful either.

In [None]:
# Negative lookahead: a column is kept only if "Proportion" appears nowhere in its name
without_proportion = '^(?!.*Proportion).*$'
data = data.filter(regex=without_proportion)


In [None]:
# Display the frame as it stands after the column drops performed so far
data

##  1.3 Filter big proportion of missing values

The following code drops columns in which more than 90% of the values are missing.

In [None]:
# Share of missing entries per column, as a percentage (0-100)
nan_percentage = data.isna().sum().div(len(data)).mul(100)

# Highest percentage of NaNs a column may have and still be kept
threshold = 90

# Labels of the columns whose NaN share does not exceed the threshold
within_threshold = nan_percentage.le(threshold)
columns_to_keep = nan_percentage.loc[within_threshold].index

# Rebind `data` to those columns only; columns with more than 90% NaNs are gone
data = data[columns_to_keep]


In [None]:
# Number of columns that survived the filters so far
data.shape[1]

In [None]:

# One-hot encode the low-cardinality text columns of `data` into `encoded_data`.
# A column qualifies when its dtype is object and it has at most `threshold`
# distinct non-null values; every other column passes through unchanged.
threshold = 10

low_cardinality_columns = [
    name
    for name in data.columns
    if data[name].dtype == 'object' and data[name].nunique() <= threshold
]

# One block of 0/1 integer indicator columns per qualifying column,
# each indicator named with the source column as prefix
indicator_blocks = [
    pd.get_dummies(data[name], prefix=name).astype(int)
    for name in low_cardinality_columns
]

# Remove the encoded originals and append the indicator blocks on the right,
# in the order the source columns appear; `data` itself is left untouched
encoded_data = pd.concat(
    [data.drop(columns=low_cardinality_columns), *indicator_blocks],
    axis=1,
)


In [None]:
# From here on `data` refers to the one-hot encoded frame
data = encoded_data

## 1.3 Filter for correlation

In [None]:

def filter_numeric_columns(data):
    """Return a new DataFrame holding only the numeric-dtype columns of ``data``.

    Column order is preserved and ``data`` itself is not modified.
    """
    return data.select_dtypes(include='number')



# Keep only the numeric columns of the encoded frame, so the correlation
# steps that follow work on numbers only
filtered_data = filter_numeric_columns(data)




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all numeric columns
corr_matrix = filtered_data.corr()

# Colour-only heatmap (no numbers inside the cells) on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(24, 16))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


## 1.4 Filtering variables with small correlation to variables of interest

Our variables of interest are the element concentrations (columns reported in pct, ppm or ppb) plus density — not physical measurements such as magnetic susceptibility — restricted to those with at most 80% missing values.

In [None]:
import re

# Match any column name that contains "pct", "ppm" or "ppb"
pattern = re.compile(r'.*(pct|ppm|ppb).*')

# Names of the numeric columns that carry one of those tokens, in column order
variables_matching_pattern = list(filter(pattern.search, filtered_data.columns))

# `variables_of_interest` is the sub-frame made of exactly those columns
variables_of_interest = filtered_data[variables_matching_pattern]


In [None]:
# Highest percentage of missing values a variable of interest may have
threshold = 80

# Share of missing entries per column, as a percentage (0-100)
missing_percentage = variables_of_interest.isnull().sum().div(len(variables_of_interest)).mul(100)

# Labels of the columns whose missing share is at most the threshold
within_threshold = missing_percentage.le(threshold)
columns_to_keep = missing_percentage.loc[within_threshold].index

# Rebind `variables_of_interest` to those columns only (at most 80% missing)
variables_of_interest = variables_of_interest[columns_to_keep]


In [None]:
# Show which columns survived as variables of interest
variables_of_interest.columns

In [None]:
# Pairwise correlation between all numeric columns
corr_matrix = filtered_data.corr()

# A column is dropped when its strongest absolute correlation with the
# variables of interest stays below this value
threshold = 0.2

# Variables of interest, written out explicitly (this rebinds the name from a
# DataFrame to a plain list of column labels)
variables_of_interest = ['Au_ppb', 'Pd_ppb', 'Pt_ppb', 'Co_ppm', 'Cr_ppm', 'Cu_pct', 'Fe_pct',
       'Mn_ppm', 'Ni_pct', 'Pb_ppm', 'S_pct', 'SulphTot_pct', 'Zn_ppm', 'Density_gcm3']

# Columns that are restored at the end regardless of their correlations
variables_to_keep = ["X", "Y", "Z"]


def _max_abs_corr(column):
    """Largest |correlation| between `column` and the variables of interest other than itself."""
    # Excluding `column` removes its self-correlation when it is a variable of
    # interest and changes nothing otherwise
    peers = [name for name in variables_of_interest if name != column]
    return corr_matrix.loc[peers, column].abs().max()


# Columns whose maximum absolute correlation is below the threshold.
# A NaN maximum compares False, so such a column is not listed here.
variables_to_remove = [
    name for name in corr_matrix.columns if _max_abs_corr(name) < threshold
]

# Additionally dropped by hand (the reason is not recorded in this notebook)
variables_to_remove.append("Alt1_Int_tr")

# Drop the weakly correlated columns
remaining_data = filtered_data.drop(variables_to_remove, axis=1)

# Restore X, Y and Z; any of them that was dropped above comes back as a trailing column
remaining_data[variables_to_keep] = filtered_data[variables_to_keep]


In [None]:
# Pairwise correlation between the columns that survived the correlation filter
corr_matrix = remaining_data.corr()

# Colour-only heatmap (no numbers inside the cells) on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Split density off as `y`; every other surviving column goes into `X`
target_column = 'Density_gcm3'
X = remaining_data.drop(columns=[target_column])
y = data[target_column]

In [None]:
# Zero-fill: every NaN in X becomes 0 in a new frame, X itself is unchanged
X_filled = X.fillna(value=0)

# Show the filled frame
X_filled

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from the zero-filled features
scaler = StandardScaler().fit(X_filled)

# Apply the fitted scaling; X_normalized holds the standardized features
X_normalized = scaler.transform(X_filled)


In [None]:
import tensorflow as tf
from tensorflow import keras

# Autoencoder on the zero-filled, standardized features (`X_normalized`):
# compress each row to `encoding_dim` values and reconstruct it.

# Define the architecture of the autoencoder
input_dim = X_normalized.shape[1]  # Number of input features
encoding_dim = 10  # Width of the bottleneck; adjust for more or less compression

autoencoder = keras.models.Sequential([
    keras.layers.Input(shape=(input_dim,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    # Named so it can be looked up later without relying on a positional index
    keras.layers.Dense(encoding_dim, activation='relu', name='bottleneck'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(input_dim)  # no activation: standardized targets can be negative
])

# Compile the autoencoder
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder to reproduce its own input.
# Use X_normalized, not the raw X: X has not been zero-filled or scaled, so any
# NaN in it would turn the loss into NaN.
autoencoder.fit(X_normalized, X_normalized, epochs=50, batch_size=32, validation_split=0.2)

# Extract the bottleneck representation.
# The Input layer is not listed in `autoencoder.layers`, so index 3 would be the
# first decoder layer; fetch the bottleneck by name instead.
encoder_layer = autoencoder.get_layer('bottleneck')

# A single Dense layer has no `predict`; wrap input -> bottleneck in a model
# that shares the trained weights
encoder = keras.Model(inputs=autoencoder.inputs, outputs=encoder_layer.output)
encoded_X = encoder.predict(X_normalized)

# encoded_X has one row per sample and `encoding_dim` columns; it is a compressed
# representation of the features, not a per-feature importance score
