Linear Regression

In [24]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
# r2_score is called at the end of this cell; it must be imported here so the
# cell runs on a fresh kernel instead of relying on an import from another cell.
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset
df = pd.read_csv('syn_steroid_data.csv')

# Define features and target variable
X = df[['Product Type', 'Animal Source', 'Steroid Name', 'Steroid Class']]  # Features
y = df['Steroid Amount (mg/kg)']  # Target (Steroid Amount)

# One-Hot Encoding for categorical variables.
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets (80% train, 20% test).
# random_state pins the shuffle so the split is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize and fit Multiple Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

# Coefficient of determination on the same held-out predictions
r2 = r2_score(y_test, y_pred)
print(f'R2 Score: {r2}')

# Optional: Display model coefficients to understand feature contributions
# coefficients = model.coef_
# print(f'Coefficients: {coefficients}')


Mean Squared Error (MSE): 8.744369282466173e-06
R2 Score: -0.016801786731131196


Random Forest Regressor

In [17]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Read the dataset from the CSV file in the working directory.
df = pd.read_csv('syn_steroid_data.csv')

# Target column first, then the four predictor columns.
y = df['Steroid Amount (mg/kg)']
X = df[['Product Type', 'Animal Source', 'Steroid Name', 'Steroid Class']]

# One-hot encode the predictors; sparse_output=False yields a dense array.
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a seeded 100-estimator random forest and train it on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Compute both metrics, then report them (MSE first, R2 second).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')



Mean Squared Error: 8.886076144478985e-06
R2 Score: -0.03327956641223473


Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor: 100 estimators, learning rate 0.1, fixed seed.
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Fit on the training split. X_train / y_train are not created in this cell;
# they must already exist in the kernel from an earlier cell.
model.fit(X_train, y_train)

# Predict on the held-out rows, compute both metrics, then report them.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')


Mean Squared Error: 8.88774935510445e-06
R2 Score: -0.033474128592604346
