# Multiple linear regression

## Importing the libraries

In [160]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading data

In [161]:
data = pd.read_csv("50_Startups.csv")

In [162]:
X = data.iloc[: , :-1].values
y = data.iloc[:, -1].values

## Encoding Categorical data

**Note**:
1. No need to apply feature scaling in Multiple linear regression.
2. No need to check assumptions of linear regression.

In [163]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

## train test split

In [164]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Train model
**Note**:
1. we don't have to avoid dummy variable trap because LinearRegression model class automatically avoid this trap.
2. we don't have to select features (**feature selection**) because LinearRegression model class automatically identify the best features (**high p-values**).

In [165]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [166]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Making a single prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')


In [167]:
print(regressor.predict([[1, 0, 0, 160000, 130000, 300000]]))

[181566.92]


Therefore, our model predicts that the profit of a Californian startup which spent 160000 in R&D, 130000 in Administration and 300000 in Marketing is $ 181566,92.

**Important note 1:** Notice that the values of the features were all input in a double pair of square brackets. That's because the "predict" method always expects a 2D array as the format of its inputs. And putting our values into a double pair of square brackets makes the input exactly a 2D array. Simply put:

$1, 0, 0, 160000, 130000, 300000 \rightarrow \textrm{scalars}$

$[1, 0, 0, 160000, 130000, 300000] \rightarrow \textrm{1D array}$

$[[1, 0, 0, 160000, 130000, 300000]] \rightarrow \textrm{2D array}$

**Important note 2:** Notice also that the "California" state was not input as a string in the last column but as "1, 0, 0" in the first three columns. That's because of course the predict method expects the one-hot-encoded values of the state, and as we see in the second row of the matrix of features X, "California" was encoded as "1, 0, 0". And be careful to include these values in the first three columns, not the last three ones, because the dummy variables are always created in the first columns.

## Getting the final linear regression equation with the values of the coefficients

In [168]:
print(regressor.coef_)
print(regressor.intercept_)

[ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]
42467.52924853278


Therefore, the equation of our multiple linear regression model is:

$$\textrm{Profit} = 86.6 \times \textrm{Dummy State 1} - 873 \times \textrm{Dummy State 2} + 786 \times \textrm{Dummy State 3} + 0.773 \times \textrm{R&D Spend} + 0.0329 \times \textrm{Administration} + 0.0366 \times \textrm{Marketing Spend} + 42467.53$$

**Important Note:** To get these coefficients we called the "coef_" and "intercept_" attributes from our regressor object. Attributes in Python are different than methods and usually return a simple value or an array of values.

# Dummy Variable and Backward Elimination

In [170]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(X)

# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

# Avoiding the Dummy Variable Trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Building the optimal model using Backward Elimination
import statsmodels.api as sm
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()X_opt = X[:, [0, 1, 3, 4, 5]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()X_opt = X[:, [0, 3, 4, 5]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()X_opt = X[:, [0, 3, 5]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()X_opt = X[:, [0, 3]]
X_opt = X_opt.astype(np.float64)regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

SyntaxError: invalid syntax (<ipython-input-170-a26f98af290d>, line 44)

# PyTorch

In [190]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Load your dataset
dataset = pd.read_csv('50_Startups.csv')

# Separate features and target variable
X = dataset.drop(columns=['Profit'])
y = dataset['Profit']

# Separate numerical and categorical features
numerical_features = X.select_dtypes(include=[np.number])
categorical_features = X.select_dtypes(include=[np.object])

# Apply StandardScaler to numerical features
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(numerical_features)

# Concatenate the scaled numerical features with the encoded categorical features
X_encoded = pd.concat([pd.DataFrame(X_numerical_scaled, columns=numerical_features.columns), categorical_features], axis=1)

# Perform one-hot encoding on categorical features
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), categorical_features.columns)
    ],
    remainder='passthrough'
)

X_encoded = column_transformer.fit_transform(X_encoded)

# Convert the NumPy array to a PyTorch tensor
X_tensor = torch.tensor(X_encoded, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Define the multiple linear regression model
class MultipleLinearRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(MultipleLinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)

# Instantiate the model
input_size = X_train.shape[1]
model = MultipleLinearRegressionModel(input_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Train the model
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Make predictions on the test set
with torch.no_grad():
    predictions = model(X_test)

# Calculate the test set mean squared error
mse = mean_squared_error(y_test.numpy(), predictions.numpy())
print(f'Test Set Mean Squared Error: {mse:.4f}')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features = X.select_dtypes(include=[np.object])


Epoch [100/1000], Loss: 8238990336.0000
Epoch [200/1000], Loss: 4584051712.0000
Epoch [300/1000], Loss: 2628645376.0000
Epoch [400/1000], Loss: 1563130496.0000
Epoch [500/1000], Loss: 971373952.0000
Epoch [600/1000], Loss: 636184000.0000
Epoch [700/1000], Loss: 442343584.0000
Epoch [800/1000], Loss: 327704608.0000
Epoch [900/1000], Loss: 258189744.0000
Epoch [1000/1000], Loss: 214804992.0000
Test Set Mean Squared Error: 529498112.0000


In [189]:
import torch

# Assuming you have already trained a model named 'model'

# Input values for prediction
input_data = [[1, 0, 0, 160000, 130000, 300000]]

# Convert input data to PyTorch tensor
input_tensor = torch.tensor(input_data, dtype=torch.float32)

# Make prediction
with torch.no_grad():
    prediction = model(input_tensor)

# Convert the prediction tensor to a NumPy array and print the result
print(prediction.numpy())

# Assuming 'prediction' is the PyTorch tensor
numpy_array = prediction.numpy().squeeze()

# Round the value
rounded_value = np.round(numpy_array, 2)

# Print the result in the desired format
print(f'[{rounded_value}]')

[[6.69e+09]]
[6693758976.0]
