# **Importing Libraries**

**Basic Libraries**

In [55]:
import pandas as pd
import numpy as np

**Libraries for visualisation and analysis**

In [56]:
#libraries used for statistical graphics in python
import seaborn as sb
import matplotlib.pyplot as plt

**Data Preprocessing Libraries**

In [57]:
#Libraries used for data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# **Importing Models**

In [58]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, SGDRegressor,MultiTaskLasso,MultiTaskElasticNet
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,HistGradientBoostingRegressor


#for model evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# **Data Analysis**

**Reading the Data**

In [59]:
Alloy_data_path = r"https://raw.githubusercontent.com/DarshanGoodGuy/IITISOC25/refs/heads/main/final_dataset.csv"
Alloy_data = pd.read_csv(Alloy_data_path)

**Analysing the Data**

In [60]:
Alloy_data.head()

Unnamed: 0,FORMULA,Co,Cr,Fe,Ni,Mn,Nb,Ti,Al,C,...,PROPERTY: Microstructure,PROPERTY: Processing method,PROPERTY: BCC/FCC/other,PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: Type of test,PROPERTY: Test temperature ($^\circ$C),PROPERTY: YS (MPa),PROPERTY: UTS (MPa),PROPERTY: Elongation (%),PROPERTY: Calculated Young modulus (GPa)
0,Co1 Cr1 Fe1 Ni1,26.138193,23.061468,24.768591,26.031747,,,,,,...,FCC,WROUGHT,FCC,8.2,T,25.0,274.0,708.0,39.0,226.0
1,Co1 Cr1 Mn1 Ni1,26.243766,23.154613,,26.13689,24.464731,,,,,...,FCC,WROUGHT,FCC,8.1,T,25.0,282.0,694.0,44.0,222.0
2,Co1 Cr1 Ni1,34.743724,30.654043,,34.602233,,,,,,...,FCC,WROUGHT,FCC,8.3,T,25.0,300.0,860.0,60.0,231.0
3,Co1 Fe1 Mn1 Ni1,25.801523,,24.449562,25.696448,24.052467,,,,,...,FCC,WROUGHT,FCC,8.2,T,25.0,170.0,550.0,41.0,204.0
4,Co1 Fe1 Ni1,33.972825,,32.192701,33.834474,,,,,,...,FCC,WROUGHT,FCC,8.5,T,25.0,211.0,513.0,31.0,207.0


In [61]:
Alloy_data.shape

(887, 37)

In [62]:
Alloy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 37 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   FORMULA                                   887 non-null    object 
 1   Co                                        343 non-null    float64
 2   Cr                                        432 non-null    float64
 3   Fe                                        387 non-null    float64
 4   Ni                                        399 non-null    float64
 5   Mn                                        126 non-null    float64
 6   Nb                                        443 non-null    float64
 7   Ti                                        477 non-null    float64
 8   Al                                        304 non-null    float64
 9   C                                         19 non-null     float64
 10  Mo                                    

**Filling the missing values with mean**

In [63]:
Alloy_data.isna().sum()

Unnamed: 0,0
FORMULA,0
Co,544
Cr,455
Fe,500
Ni,488
Mn,761
Nb,444
Ti,410
Al,583
C,868


In [64]:
Alloy_data.iloc[:, -4:] = Alloy_data.iloc[:, -4:].fillna(Alloy_data.iloc[:, -4:].mean())
Alloy_data.head()

Unnamed: 0,FORMULA,Co,Cr,Fe,Ni,Mn,Nb,Ti,Al,C,...,PROPERTY: Microstructure,PROPERTY: Processing method,PROPERTY: BCC/FCC/other,PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: Type of test,PROPERTY: Test temperature ($^\circ$C),PROPERTY: YS (MPa),PROPERTY: UTS (MPa),PROPERTY: Elongation (%),PROPERTY: Calculated Young modulus (GPa)
0,Co1 Cr1 Fe1 Ni1,26.138193,23.061468,24.768591,26.031747,,,,,,...,FCC,WROUGHT,FCC,8.2,T,25.0,274.0,708.0,39.0,226.0
1,Co1 Cr1 Mn1 Ni1,26.243766,23.154613,,26.13689,24.464731,,,,,...,FCC,WROUGHT,FCC,8.1,T,25.0,282.0,694.0,44.0,222.0
2,Co1 Cr1 Ni1,34.743724,30.654043,,34.602233,,,,,,...,FCC,WROUGHT,FCC,8.3,T,25.0,300.0,860.0,60.0,231.0
3,Co1 Fe1 Mn1 Ni1,25.801523,,24.449562,25.696448,24.052467,,,,,...,FCC,WROUGHT,FCC,8.2,T,25.0,170.0,550.0,41.0,204.0
4,Co1 Fe1 Ni1,33.972825,,32.192701,33.834474,,,,,,...,FCC,WROUGHT,FCC,8.5,T,25.0,211.0,513.0,31.0,207.0


In [65]:
Alloy_data.isna().sum()

Unnamed: 0,0
FORMULA,0
Co,544
Cr,455
Fe,500
Ni,488
Mn,761
Nb,444
Ti,410
Al,583
C,868


# **Data Standardisation**

In [66]:
Alloy_data.shape

(887, 37)

In [67]:
#Assigning Feature and Target variables
X = Alloy_data.iloc[:,1:27] #Feature Columns
y = Alloy_data.iloc[:,27:38]  #Target Columns

In [68]:
X.head()

Unnamed: 0,Co,Cr,Fe,Ni,Mn,Nb,Ti,Al,C,Mo,...,Zn,Ta,Zr,Hf,W,Re,Ca,Y,Pd,Sc
0,26.138193,23.061468,24.768591,26.031747,,,,,,,...,,,,,,,,,,
1,26.243766,23.154613,,26.13689,24.464731,,,,,,...,,,,,,,,,,
2,34.743724,30.654043,,34.602233,,,,,,,...,,,,,,,,,,
3,25.801523,,24.449562,25.696448,24.052467,,,,,,...,,,,,,,,,,
4,33.972825,,32.192701,33.834474,,,,,,,...,,,,,,,,,,


In [69]:
y.head()

Unnamed: 0,PROPERTY: Microstructure,PROPERTY: Processing method,PROPERTY: BCC/FCC/other,PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: Type of test,PROPERTY: Test temperature ($^\circ$C),PROPERTY: YS (MPa),PROPERTY: UTS (MPa),PROPERTY: Elongation (%),PROPERTY: Calculated Young modulus (GPa)
0,FCC,WROUGHT,FCC,8.2,T,25.0,274.0,708.0,39.0,226.0
1,FCC,WROUGHT,FCC,8.1,T,25.0,282.0,694.0,44.0,222.0
2,FCC,WROUGHT,FCC,8.3,T,25.0,300.0,860.0,60.0,231.0
3,FCC,WROUGHT,FCC,8.2,T,25.0,170.0,550.0,41.0,204.0
4,FCC,WROUGHT,FCC,8.5,T,25.0,211.0,513.0,31.0,207.0


**Scaling of data**

In [70]:
sc = MinMaxScaler()

X_scaled = pd.DataFrame(sc.fit_transform(X),columns=X.columns);
columns_to_scale = ['PROPERTY: Calculated Density (g/cm$^3$)', 'PROPERTY: Test temperature ($^\circ$C)', 'PROPERTY: YS (MPa)', 'PROPERTY: UTS (MPa)','PROPERTY: Elongation (%)','PROPERTY: Calculated Young modulus (GPa)']
y_scaled = y.copy()
y_scaled[columns_to_scale] = sc.fit_transform(y[columns_to_scale])

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


In [71]:
X_scaled.head()

Unnamed: 0,Co,Cr,Fe,Ni,Mn,Nb,Ti,Al,C,Mo,...,Zn,Ta,Zr,Hf,W,Re,Ca,Y,Pd,Sc
0,0.392247,0.530344,0.432264,0.452454,,,,,,,...,,,,,,,,,,
1,0.394086,0.532722,,0.454576,0.58315,,,,,,...,,,,,,,,,,
2,0.542211,0.724192,,0.625437,,,,,,,...,,,,,,,,,,
3,0.38638,,0.425798,0.445687,0.571841,,,,,,...,,,,,,,,,,
4,0.528777,,0.582723,0.609941,,,,,,,...,,,,,,,,,,


In [72]:
y_scaled.head()

Unnamed: 0,PROPERTY: Microstructure,PROPERTY: Processing method,PROPERTY: BCC/FCC/other,PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: Type of test,PROPERTY: Test temperature ($^\circ$C),PROPERTY: YS (MPa),PROPERTY: UTS (MPa),PROPERTY: Elongation (%),PROPERTY: Calculated Young modulus (GPa)
0,FCC,WROUGHT,FCC,0.490741,T,0.157213,0.086124,0.159245,0.371429,0.681416
1,FCC,WROUGHT,FCC,0.481481,T,0.157213,0.089069,0.155695,0.419048,0.663717
2,FCC,WROUGHT,FCC,0.5,T,0.157213,0.095694,0.197789,0.571429,0.70354
3,FCC,WROUGHT,FCC,0.490741,T,0.157213,0.047847,0.11918,0.390476,0.584071
4,FCC,WROUGHT,FCC,0.518519,T,0.157213,0.062937,0.109798,0.295238,0.597345


In [73]:
from sklearn.model_selection import train_test_split

X_new,X_test,y_new,y_test = train_test_split(X_scaled,y_scaled,test_size=0.1,random_state=1);
X_train,X_valid,y_train,y_valid = train_test_split(X_new,y_new,test_size=0.2,random_state=1)

[X_train.shape,y_train.shape],[X_valid.shape,y_valid.shape],[X_test.shape,y_test.shape]

([(638, 26), (638, 10)], [(160, 26), (160, 10)], [(89, 26), (89, 10)])

In [74]:
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_valid.shape, y_valid.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (638, 26) (638, 10)
Validation set shape: (160, 26) (160, 10)
Test set shape: (89, 26) (89, 10)


# **Modelling**

In [75]:
import xgboost as xgb

def model_results(XTrain,XValid,yTrain,yValid):
  models =[('Random Forest',RandomForestRegressor()),
           ('MLP',MLPRegressor(max_iter = 1000)),
           ('RidgeCV',MultiOutputRegressor(RidgeCV())),
           ('SGD',MultiOutputRegressor(SGDRegressor())),
           ('KNN', KNeighborsRegressor(n_neighbors=10)),
           ('Support Vector',MultiOutputRegressor(SVR())),
           ('Decision Tree',MultiOutputRegressor(DecisionTreeRegressor())),
           ('AdaBoost',MultiOutputRegressor(AdaBoostRegressor())),
           ('X_gb',MultiOutputRegressor(xgb.XGBRegressor())),
           ('GradientBoost',MultiOutputRegressor(GradientBoostingRegressor())),
          ]
  finalResults = []

  for name,model in models:
      model.fit(XTrain, yTrain)
      model_results = model.predict(XValid)
      r2score = r2_score(yValid, model_results)
      RMSE = mean_squared_error(yValid, model_results,squared = False)
      MAE = mean_absolute_error(yValid, model_results)
      finalResults.append((name, r2score, RMSE, MAE))

  finalResults.sort(key=lambda k:k[2])
  return finalResults



In [77]:
np.random.seed(42)
np_results = model_results(X_train,X_valid,y_train,y_valid)
results = pd.DataFrame(np_results,columns=['Model Name', 'R2 Score', 'RMSE', 'MAE'])
results

ValueError: could not convert string to float: 'FCC+Sec.'