In [2]:
import pandas as pd

# Load the dataset
file_path = "train.csv"
df = pd.read_csv(file_path)

# Display basic information about the dataset
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 41 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ID                                   1600 non-null   int64  
 1   Alloy formula                        1600 non-null   object 
 2   Alloy class                          1600 non-null   object 
 3   Cu                                   1600 non-null   float64
 4   Al                                   1600 non-null   float64
 5   Ag                                   1600 non-null   float64
 6   B                                    1600 non-null   float64
 7   Be                                   1600 non-null   float64
 8   Ca                                   1600 non-null   int64  
 9   Co                                   1600 non-null   float64
 10  Ce                                   1600 non-null   float64
 11  Cr                            

(None,
     ID                  Alloy formula      Alloy class         Cu        Al  \
 0  969  Cu-6Ni-1Si-0.5Al-0.15Mg-0.1Cr   Cu low alloyed  95.557137  0.246150   
 1  241      Cu-4.5Ni-1Si-1.2Co-0.15Mg  Cu-Ni-Si alloys  95.083982  0.001252   
 2  820      Cu-4.5Ni-1Si-1.2Co-0.15Mg  Cu-Ni-Si alloys  91.894209  0.022183   
 3  693                 Cu-4.0Ni-2.0Si  Cu-Ni-Si alloys  92.624741  0.000000   
 4  421               Cu-0.28Cr-0.19Mg  Cu-Ni-Si alloys  95.575242  0.009056   
 
          Ag        B        Be  Ca        Co  ...    tss (h)  \
 0  0.000075  0.00000  0.000000   0  0.000000  ...   3.972130   
 1  0.000000  0.00000  0.000947   0  0.000000  ...        NaN   
 2  0.000000  0.00326  0.000000   0  0.008681  ...   4.064446   
 3  0.000000  0.00000  0.015469   0  0.000438  ...   7.041734   
 4  0.000000  0.00000  0.008676   0  0.000000  ...  27.770000   
 
    CR reduction (%)  Aging     Tag (K)   tag (h)  \
 0          4.366903      Y  778.447643  4.447236   
 1         90

In [3]:
# Drop rows where target variable is missing
df = df.dropna(subset=["Electrical conductivity (%IACS)"])

# Impute missing processing parameters with median values
for col in ["Tss (K)", "tss (h)", "Tag (K)", "tag (h)"]:
    df[col] = df[col].fillna(df[col].median())

# Impute Hardness (HV) with median
df["Hardness (HV)"] = df["Hardness (HV)"].fillna(df["Hardness (HV)"].median())

# Drop columns with excessive missing values
df = df.drop(columns=["Yield strength (MPa)", "Ultimate tensile strength (MPa)"])

# Fill missing categorical values in 'Secondary thermo-mechanical process' with the most frequent value
df["Secondary thermo-mechanical process"] = df["Secondary thermo-mechanical process"].fillna(df["Secondary thermo-mechanical process"].mode()[0])

# Convert categorical variables to numerical
df["Aging"] = df["Aging"].map({"Y": 1, "N": 0})
df = pd.get_dummies(df, columns=["Alloy class", "Secondary thermo-mechanical process"], drop_first=True)

# Verify no missing values remain
df.isnull().sum().sum()


0

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define features and target variable
X = df.drop(columns=["ID", "Alloy formula", "Electrical conductivity (%IACS)"])
y = df["Electrical conductivity (%IACS)"]

# Split data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check the shape of the datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1278, 38), (320, 38), (1278,), (320,))

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)


Mean Absolute Error: 13.96960159723486


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

model = RandomForestRegressor(random_state=42)
grid_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5, scoring="neg_mean_absolute_error", random_state=42)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)


Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10}
Best MAE: 13.696605538518645


In [7]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print("XGBoost MAE:", mae_xgb)


XGBoost MAE: 14.369808997840419


In [8]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_grid, 
                                 n_iter=10, cv=5, scoring="neg_mean_absolute_error", random_state=42)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
print("Tuned Random Forest MAE:", mean_absolute_error(y_test, y_pred_best_rf))


Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 20}
Tuned Random Forest MAE: 13.938618283144475


In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the Neural Network Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  
    Dropout(0.2),  
    Dense(64, activation='relu'),  
    Dropout(0.1),  
    Dense(32, activation='relu'),  
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss='mae')

# Train the model
history = model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test), epochs=100, batch_size=32, verbose=1)

# Predict
y_pred_nn = model.predict(X_test_scaled)

# Calculate MAE
mae_nn = mean_absolute_error(y_test, y_pred_nn)
print("Neural Network MAE:", mae_nn)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 23.6642 - val_loss: 15.7852
Epoch 2/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 14.3617 - val_loss: 15.1391
Epoch 3/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 14.5895 - val_loss: 14.9654
Epoch 4/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 13.5462 - val_loss: 15.1028
Epoch 5/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 13.5230 - val_loss: 15.1575
Epoch 6/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 13.1466 - val_loss: 15.1515
Epoch 7/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12.5902 - val_loss: 15.1676
Epoch 8/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12.7901 - val_loss: 16.7629
Epoch 9/100
[1m40/40[0m [32m━━━━━━━━━━━━━

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)
gbr_model.fit(X_train, y_train)

y_pred_gbr = gbr_model.predict(X_test)
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
print("Gradient Boosting MAE:", mae_gbr)


Gradient Boosting MAE: 14.102287849422405


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define Hyperparameter Grid
param_grid = {
    "n_estimators": [100, 300, 500, 700],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ['sqrt', 'log2']
}

# Initialize RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Perform Randomized Search
rf_random_search = RandomizedSearchCV(
    estimator=rf_model, 
    param_distributions=param_grid, 
    n_iter=20,  # Number of random combinations to try
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_absolute_error",
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Fit the model
rf_random_search.fit(X_train, y_train)

# Best parameters & best MAE score
print("Best Parameters:", rf_random_search.best_params_)
print("Best MAE from tuning:", -rf_random_search.best_score_)

best_rf = rf_random_search.best_estimator_
best_rf.fit(X_train, y_train)

# Make predictions
y_pred_best_rf = best_rf.predict(X_test)

# Calculate MAE
from sklearn.metrics import mean_absolute_error
mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)

print("Tuned Random Forest MAE:", mae_best_rf)


Best Parameters: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 30}
Best MAE from tuning: 13.63495375320486
Tuned Random Forest MAE: 13.917937807348247


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Train Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict
y_pred_lr = lr_model.predict(X_test)

# Calculate MAE
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print("Linear Regression MAE:", mae_lr)


Linear Regression MAE: 14.502349675717705
