In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as mtp
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Read the startups CSV from the Kaggle input directory into a DataFrame
data_set = pd.read_csv('/kaggle/input/revising-onehotencoding/50_Startups.csv')

# Column-wise split: the last column is the target (Profit, per the original note),
# every column before it is a predictor. Both are handed on as plain NumPy arrays.
predictor_frame = data_set.iloc[:, :-1]
target_series = data_set.iloc[:, -1]
x = predictor_frame.to_numpy()
y = target_series.to_numpy()

# One-hot encode the categorical column at index 3 (State, per the original note).
# drop='first' makes the encoder itself omit the first category of that feature, which avoids
# the dummy variable trap without slicing off "column 0" by hand afterwards (manual slicing
# silently relies on the encoded columns being emitted first).
# sparse_threshold=0 forces ColumnTransformer to return a dense array, so the float64
# conversion below can never receive a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Cast the encoded matrix to float64; the original author notes this avoids an 'isfinite'
# error further down (presumably because the un-encoded matrix is an object array - verify).
x = np.array(ct.fit_transform(x), dtype=np.float64)

# Hold out 20% of the rows for evaluation; random_state=0 keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit the multiple linear regression on the training rows (fit() returns the estimator)
regressor = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out rows
y_pred = regressor.predict(x_test)

# Score the fitted model on both partitions, then report them
train_score = regressor.score(x_train, y_train)
test_score = regressor.score(x_test, y_test)
print('Train Score: ', train_score)
print('Test Score: ', test_score)

# sm.OLS is given no intercept by this script, so prepend a constant column of ones to x.
# has_constant='add' forces the column to be added even if x already contains a constant
# column, matching the unconditional append this replaces.
x = sm.add_constant(x, prepend=True, has_constant='add')

# Backward elimination is NOT iterated here: only one hand-picked subset is fitted, namely
# the intercept (column 0) plus column 3 of the augmented matrix.
# NOTE(review): column 3 is presumably the first numeric predictor after the dummy columns
# (R&D Spend in the usual 50_Startups layout) - confirm against the CSV header.
# The model is fitted on the full data set (x, y), not on the train split.
x_opt = x[:, [0, 3]]
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()

# Displaying the OLS regression results (the summary is built once, where it is printed)
print(regressor_OLS.summary())



Train Score:  0.9501847627493607
Test Score:  0.9347068473282987
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Sun, 13 Oct 2024   Prob (F-statistic):           3.50e-32
Time:                        06:47:05   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------