In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load the dataset (path is relative to the notebook's working directory)
startup_data = pd.read_csv('50_Startups.csv')

# Separate features and target variable: "Profit" is the regression target,
# every other column is used as a feature.
X = startup_data.drop(columns=["Profit"])
y = startup_data["Profit"]

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is identical on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer to handle categorical variables.
# 'State' is one-hot encoded; all remaining columns are passed through as-is.
categorical_features = ['State']
# Preserve the DataFrame's own column order. A set difference would yield an
# arbitrary order that can change between kernel sessions, making the
# transformed matrix layout (and coefficient order) non-reproducible.
numeric_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        # handle_unknown='ignore' encodes a category that was not present in
        # the training split as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Define the pipeline with preprocessing and the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LinearRegression())])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = mean_squared_error(y_train, pipeline.predict(X_train)) ** 0.5
test_rmse = mean_squared_error(y_test, pipeline.predict(X_test)) ** 0.5

print("Training RMSE:", train_rmse)
print("Testing RMSE:", test_rmse)


Training RMSE: 8927.489013300052
Testing RMSE: 9055.95732349782


In [9]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, mean_absolute_percentage_error ,r2_score

# Report a set of regression metrics on the held-out test split.
# Relies on `pipeline`, `X_test` and `y_test` created by the training cell.
Y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_true=y_test,
                          y_pred=Y_pred)
print("Mean Absolute Error", mae)

mse = mean_squared_error(y_true=y_test,
                         y_pred=Y_pred)
print("Mean Square Error", mse)

r2 = r2_score(y_true=y_test,
              y_pred=Y_pred)
print("R2", r2)

# RMSE is the square root of the MSE computed above. The `squared=False`
# argument previously used here was deprecated in scikit-learn 1.4 and
# removed in 1.6, so deriving it from `mse` works on every version.
rmse = mse ** 0.5
print("Root Mean Square Error", rmse)

# MAPE is returned as a fraction (e.g. 0.10 means 10%), not a percentage.
mape = mean_absolute_percentage_error(y_test, Y_pred)
print("Mean Absolute Percentage Error", mape)

Mean Absolute Error 6961.477813275563
Mean Square Error 82010363.04501379
R2 0.8987266414319836
Root Mean Square Error 9055.95732349782
Mean Absolute Percentage Error 0.10278189896425191
