# Çoklu Doğrusal Regresyon

In [6]:
import pandas as pd
ad = pd.read_csv("Advertising.csv", usecols = [1,2,3,4])
df = ad.copy()
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [9]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [10]:
# sales değişkenini dışarıda bırakıp tüm bağımsız değişkenleri seçme işlemi
X = df.drop("sales", axis = 1)
y = df["sales"]

# Test ve train ayırma işlemleri
# Her seferinde farklı ifadeler gelmesin diye random_state kullandık
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [15]:
# Test ve train gözlemlerinin boyutlarına bakalım
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(160, 3)
(40, 3)
(160,)
(40,)


In [16]:
# Veri setinin tüm hâlini oluşturan traning isimli değişken oluşturuyoruz
training = df.copy()
training.shape

(200, 4)

## Statsmodels

In [22]:
import statsmodels.api as sm

# Model oluşturma
lm = sm.OLS(y_train, X_train)

In [23]:
# Model uygulama
model = lm.fit()

# Modelin çıktılarına ulaşma
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared (uncentered):,0.982
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Thu, 13 Apr 2023",Prob (F-statistic):,1.28e-137
Time:,22:54:28,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040

0,1,2,3
Omnibus:,11.405,Durbin-Watson:,1.895
Prob(Omnibus):,0.003,Jarque-Bera (JB):,15.574
Skew:,-0.432,Prob(JB):,0.000415
Kurtosis:,4.261,Cond. No.,13.5


In [28]:
# coefli tabloya erişmek istersek
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040


## skicit-learn ile model oluşturma 

In [29]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
model = lm.fit(X_train, y_train)
model.intercept_

2.9790673381226274

In [30]:
model.coef_

array([0.04472952, 0.18919505, 0.00276111])

## Tahmin

Model denklemi:

Sales = 2.97 + TV*0.04 + radio*0.18 % newspaper^0.002

Örneğin 30 birim Tv harcaması, 10 birim radio harcaması, 40 birim de gazete harcaması olduğunda satışların tahmini değeri ne olur?

In [38]:
new_data = [[30], [10], [40]]
new_data = pd.DataFrame(new_data).T

In [39]:
model.predict(new_data)



array([6.32334798])

In [42]:
# Eğitim setinin hatasına ulaşma
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
rmse

1.6447277656443373

In [43]:
# Test setinin hatasına ulaşma

rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
rmse

1.7815996615334506

## Model Tuning / Model Doğrulama

In [46]:
X = df.drop("sales", axis = 1)
y = df["sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    random_state = 42)
lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [54]:
np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

1.6447277656443373

In [47]:
np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

1.7815996615334506

In [48]:
model.score(X_train, y_train)

0.8957008271017817

In [51]:
from sklearn.model_selection import cross_val_score

# Modelimizin 10 tane rastgele r2 değerinin ortalaması ile doğruluğu
cross_val_score(model, X, y, cv = 10, scoring = "r2").mean()

0.8853562237979616

In [53]:
# Eğitim hatası için model doğruluğu yukarıdaki train sqrt değeri ile karşılaştırabiliriz
# Başında eksi olması +'ya çevirme
np.sqrt(-cross_val_score(model, 
                X_train, 
                y_train, 
                cv = 10, 
                scoring = "neg_mean_squared_error")).mean()

1.6513523730313335

In [57]:
# Test hatası için model doğruluğu yukarıdaki train sqrt değeri ile karşılaştırabiliriz 
np.sqrt(-cross_val_score(model,
                X_test, 
                y_test, 
                cv = 10, 
                scoring = "neg_mean_squared_error")).mean()

1.8462778823997088