1. Linear Regression

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [6]:
class CustomLinearRegression:
    """Single-output linear regression trained with full-batch gradient descent.

    The model minimises the mean squared error between ``X @ theta`` and the
    target, where ``X`` is the training matrix with a leading column of ones
    (so ``theta[0]`` is the intercept).

    Args:
        X_data: 2-D array of shape (num_samples, num_features), without a
            bias column.
        y_target: Target values, either 1-D of length num_samples or a
            (num_samples, 1) column.
        learning_rate: Step size of each gradient-descent update.
        num_epochs: Number of full passes over the training data.

    Attributes:
        X_data: Training matrix with the bias column prepended.
        y_target: Targets stored as a (num_samples, 1) float column.
        theta: Weight column of shape (num_features + 1, 1).
        lossess: Per-epoch MSE history (attribute name kept as-is so existing
            callers keep working).
    """

    def __init__(self, X_data, y_target, learning_rate = 0.01, num_epochs = 10000):
        self.num_samples = X_data.shape[0]
        # Prepend a column of ones so the first weight acts as the intercept.
        self.X_data = np.c_[np.ones((self.num_samples, 1)), X_data]
        # Force a column vector: a 1-D target would broadcast against the
        # (n, 1) predictions into an (n, n) matrix and corrupt loss/gradient.
        self.y_target = np.asarray(y_target, dtype=float).reshape(-1, 1)
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        # Initial weights
        self.theta = np.random.randn(self.X_data.shape[1], 1)
        self.lossess = []

    def compute_loss(self, y_pred, y_target):
        """Return the mean squared error between predictions and targets."""
        loss = np.mean((y_pred - y_target) ** 2)
        return loss

    def predict(self, X_data):
        """Return ``X_data @ theta``.

        ``X_data`` may already contain the leading bias column (as the stored
        training matrix does) or be a raw feature matrix with one column fewer
        than ``theta`` has rows; in the latter case the bias column is added
        here.
        """
        if np.ndim(X_data) == 2 and np.shape(X_data)[1] == self.theta.shape[0] - 1:
            X_data = np.c_[np.ones((np.shape(X_data)[0], 1)), X_data]
        y_pred = X_data.dot(self.theta)
        return y_pred

    def fit(self):
        """Run gradient descent for ``num_epochs`` epochs.

        Returns:
            dict with ``'loss'`` (mean of all recorded epoch losses, or None
            when nothing has been recorded) and ``'weight'`` (final theta).
        """
        for epoch in range(self.num_epochs):

            # Predict
            y_pred = self.predict(self.X_data)

            # Compute loss
            loss = self.compute_loss(y_pred, self.y_target)
            self.lossess.append(loss)

            # Compute gradient of the MSE with respect to theta
            loss_grad = 2*(y_pred - self.y_target)/self.num_samples
            gradients = self.X_data.T.dot(loss_grad)

            # Update weights: step against the gradient
            self.theta = self.theta - self.learning_rate*gradients
            if epoch% 50 == 0:
                print(f'Epoch:{epoch} - Loss: {loss}')

        # Return only after every epoch has run.
        return{
            'loss': sum(self.lossess)/len(self.lossess) if self.lossess else None,
            'weight': self.theta
        }
    

In [7]:
def r2score(y_pred, y):
    """Compute the coefficient of determination, R^2 = 1 - RSS / TSS.

    Args:
        y_pred: Array of predicted values.
        y: Array of ground-truth values (same shape as ``y_pred``).

    Returns:
        The R^2 score; 1.0 for a perfect fit, negative when the predictions
        are worse than always predicting the mean of ``y``.
    """
    residual_ss = np.sum(np.square(y_pred - y))
    total_ss = np.sum(np.square(y - y.mean()))
    return 1 - (residual_ss / total_ss)

In [8]:
# Case 1: predictions equal the targets exactly, so the score is 1.0.
y_pred = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 3, 4, 5])
r2score(y_pred, y)

1.0

In [9]:
# Case 2: predictions fit worse than the mean of y, so the score is negative.
y_pred = np.array([1, 2, 3, 4, 5])
y = np.array([3, 5, 5, 2, 4])
r2score(y_pred, y)

-2.235294117647059

2. Polynomial Regression

In [10]:
def create_polynomial_features_1(X, degree=2):
    """Append element-wise powers of X as extra columns.

    The output is laid out power-by-power: all original columns first, then
    every column squared, then every column cubed, and so on up to ``degree``.

    Args:
        X: An array holding the data.
        degree: An integer, the highest power to generate.

    Returns:
        ``X`` itself when ``degree`` is below 2, otherwise the column-wise
        concatenation ``[X, X**2, ..., X**degree]``.
    """
    if degree < 2:
        return X
    blocks = [X]
    blocks.extend(np.power(X, exponent) for exponent in range(2, degree + 1))
    # Indexing np.c_ with a tuple is the same as np.c_[a, b, c, ...].
    return np.c_[tuple(blocks)]

In [11]:
def create_polynomial_features_2(X, degree=2):
    """Create polynomial features grouped feature-by-feature.

    For every column ``x`` of ``X`` the output holds ``x, x**2, ..., x**degree``
    side by side, followed by the same expansion of the next column.

    Args:
        X: Array-like of shape (num_samples, num_features). A 1-D input is
            treated as a single sample (one row).
        degree: An integer, the highest power to generate. Values below 1 are
            treated as 1, i.e. only the original features are returned.

    Returns:
        Array of shape (num_samples, num_features * max(degree, 1)).
    """
    X = np.atleast_2d(np.asarray(X))
    highest_power = max(int(degree), 1)
    columns = [
        np.power(feature, power)
        for feature in X.T
        for power in range(1, highest_power + 1)
    ]
    # column_stack keeps every generated vector as one output column, which
    # also gives the right shape when only the first power is requested.
    return np.column_stack(columns)

3. Sales Prediction

a. Load dataset

In [12]:
# Load the sales dataset from a CSV file in the working directory.
df = pd.read_csv("SalesPrediction.csv")
df

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.913410,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.246340
4,15.0,8.437408,1.405998,Micro,56.594181
...,...,...,...,...,...
4567,26.0,4.472360,0.717090,Micro,94.685866
4568,71.0,20.610685,6.545573,Nano,249.101915
4569,44.0,19.800072,5.096192,Micro,163.631457
4570,71.0,17.534640,1.940873,Macro,253.610411


In [13]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.91341,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.24634
4,15.0,8.437408,1.405998,Micro,56.594181


In [14]:
# Count the missing values in each column.
df.isnull().sum()

TV              10
Radio            4
Social Media     6
Influencer       0
Sales            6
dtype: int64

In [15]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

b. Preprocessing

We need to handle the missing values and encode the 'Influencer' column.

In [16]:
# Step 1: one-hot encode the 'Influencer' column. It is the only non-numeric
# column, so a bare get_dummies call expands just that one.
df = pd.get_dummies(df)

# Step 2: replace null values with the mean of their column. This runs after
# the encoding step, so no string column is left when the means are taken.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, and also fill missing 'Sales' targets — confirm intended.
df = df.fillna(df.mean())

Train model

In [17]:
# Dimensions of the frame after encoding and imputation.
df.shape

(4572, 8)

In [18]:
# Display the preprocessed frame.
df

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,54.732757,False,True,False,False
1,13.0,9.237765,2.409567,46.677897,False,True,False,False
2,41.0,15.886446,2.913410,150.177829,False,True,False,False
3,83.0,30.020028,6.922304,298.246340,False,True,False,False
4,15.0,8.437408,1.405998,56.594181,False,False,True,False
...,...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,94.685866,False,False,True,False
4568,71.0,20.610685,6.545573,249.101915,False,False,False,True
4569,44.0,19.800072,5.096192,163.631457,False,False,True,False
4570,71.0,17.534640,1.940873,253.610411,True,False,False,False


In [19]:
# Separate the target column from the features (every column except 'Sales').
y = df['Sales']
X = df.drop(columns='Sales')

In [20]:
# Display the feature matrix.
X

Unnamed: 0,TV,Radio,Social Media,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,False,True,False,False
1,13.0,9.237765,2.409567,False,True,False,False
2,41.0,15.886446,2.913410,False,True,False,False
3,83.0,30.020028,6.922304,False,True,False,False
4,15.0,8.437408,1.405998,False,False,True,False
...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,False,False,True,False
4568,71.0,20.610685,6.545573,False,False,False,True
4569,44.0,19.800072,5.096192,False,False,True,False
4570,71.0,17.534640,1.940873,True,False,False,False


In [21]:
# Display the target series.
y

0        54.732757
1        46.677897
2       150.177829
3       298.246340
4        56.594181
           ...    
4567     94.685866
4568    249.101915
4569    163.631457
4570    253.610411
4571    148.202414
Name: Sales, Length: 4572, dtype: float64

In [22]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0
)

c. Feature Scaling

In [23]:
# StandardScaler is already imported in the notebook's import cell.
# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(X_train)
X_test_processed = scaler.transform(X_test)

In [24]:
# Mean of the first feature column as learned from the training split.
scaler.mean_[0]

54.173577723283785

d. Polynomial features

In [25]:
# Expand the scaled features into degree-2 polynomial features. The
# transformer is fitted on the training split and reused for the test split.
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train_processed)
X_test_poly = poly_features.transform(X_test_processed)

e. Training & Evaluation


In [26]:
# Fit an ordinary linear regression on the polynomial training features.
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

In [27]:
# Predict on the held-out test features.
preds = poly_model.predict(X_test_poly)


# Score the predictions against the true test targets (y_true comes first).
r2 = r2_score(y_test, preds)
print('R^2 Score:', r2)

R^2 Score: 0.9951491934519345
