# Multiple Linear Regression 

##### This notebook builds statistical models on a small dataset: it encodes the categorical columns, assembles pandas DataFrames, splits the data into training and test sets, trains linear regression models, and makes predictions. First, the country and gender columns are converted to numerical data using LabelEncoder and OneHotEncoder. The encoded columns are placed in pandas DataFrames and merged. The merged data is split into training and test sets, and a LinearRegression model is trained on the training set and used to predict on the test set. The height column is then separated out as the target variable, the remaining features are re-split into training and test sets, and a second regression model is trained and used for prediction. Finally, OLS (Ordinary Least Squares) models are fitted with the statsmodels library; explanatory variables are removed from the model one at a time, and the summary of each fit is printed.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk

# Load the raw dataset. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that file exists.
data = pd.read_csv("C:\\Users\\Arif Furkan\\OneDrive\\Belgeler\\Python_kullanirken\\veriler.csv")
print(data)

   ulke  boy  kilo  yas cinsiyet
0    tr  130    30   10        e
1    tr  125    36   11        e
2    tr  135    34   10        k
3    tr  133    30    9        k
4    tr  129    38   12        e
5    tr  180    90   30        e
6    tr  190    80   25        e
7    tr  175    90   35        e
8    tr  177    60   22        k
9    us  185   105   33        e
10   us  165    55   27        k
11   us  155    50   44        k
12   us  160    58   39        k
13   us  162    59   41        k
14   us  167    62   55        k
15   fr  174    70   47        e
16   fr  193    90   23        e
17   fr  187    80   27        e
18   fr  183    88   28        e
19   fr  159    40   29        k
20   fr  164    66   32        k
21   fr  166    56   42        k


## Encoding Categorical Data for Country

In [2]:
from sklearn import preprocessing

# Take the first column (country codes) as a 2-D array. Copy it explicitly:
# the array returned by `.values` may share memory with `data` (or be
# read-only), and the next step writes into it.
country = data.iloc[:, 0:1].values.copy()

# Replace the country strings with integer labels.
le = preprocessing.LabelEncoder()
country[:, 0] = le.fit_transform(data.iloc[:, 0])

# Expand the integer labels into one indicator column per country.
ohe = preprocessing.OneHotEncoder()
country = ohe.fit_transform(country).toarray()

## Encoding Categorical Data for Gender

In [3]:
from sklearn import preprocessing

# Encode the last column (gender) as integer labels.
gender = data.iloc[:, -1]
le = preprocessing.LabelEncoder()
# Replace the whole column instead of writing into it positionally, so the
# column takes the integer dtype of the encoded labels rather than keeping
# the dtype of the original string column.
data[data.columns[-1]] = le.fit_transform(gender)

## Creating DataFrames

In [4]:
# Row index shared by every frame built below; derived from the data so the
# cell is not tied to a fixed number of rows.
n_rows = len(data)
row_index = range(n_rows)
print(list(row_index))

# One-hot country columns.
# NOTE(review): the labels assume the encoder ordered the categories as
# fr, tr, us - confirm if the set of countries changes.
result = pd.DataFrame(data=country, index=row_index, columns=['fr', 'tr', 'us'])

# Columns 1-3 of the raw data hold height, weight and age (the variable is
# still called `age` so existing references keep working).
age = data.iloc[:, 1:4].values
result2 = pd.DataFrame(data=age, index=row_index, columns=['height', 'weight', 'age'])

# Last column of the raw data: gender.
gender = data.iloc[:, -1].values
result3 = pd.DataFrame(data=gender, index=row_index, columns=['gender'])

# Country indicators + height/weight/age.
s = pd.concat([result, result2], axis=1)
print(s)
# Same frame with gender appended as the last column.
s2 = pd.concat([s, result3], axis=1)
print(s2)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
     fr   tr   us  height  weight  age
0   0.0  1.0  0.0     130      30   10
1   0.0  1.0  0.0     125      36   11
2   0.0  1.0  0.0     135      34   10
3   0.0  1.0  0.0     133      30    9
4   0.0  1.0  0.0     129      38   12
5   0.0  1.0  0.0     180      90   30
6   0.0  1.0  0.0     190      80   25
7   0.0  1.0  0.0     175      90   35
8   0.0  1.0  0.0     177      60   22
9   0.0  0.0  1.0     185     105   33
10  0.0  0.0  1.0     165      55   27
11  0.0  0.0  1.0     155      50   44
12  0.0  0.0  1.0     160      58   39
13  0.0  0.0  1.0     162      59   41
14  0.0  0.0  1.0     167      62   55
15  1.0  0.0  0.0     174      70   47
16  1.0  0.0  0.0     193      90   23
17  1.0  0.0  0.0     187      80   27
18  1.0  0.0  0.0     183      88   28
19  1.0  0.0  0.0     159      40   29
20  1.0  0.0  0.0     164      66   32
21  1.0  0.0  0.0     166      56   42
     fr   tr   us  heigh

## Splitting Data into Training and Test Sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. Features are the country/height/
# weight/age frame `s`; the target is the gender frame `result3`.
x_train, x_test, y_train, y_test = train_test_split(
    s,
    result3,
    test_size=0.33,
    random_state=0,
)

## Training and Prediction with Regression Model (Gender as Target)

In [6]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator
# itself), then predict the target for the held-out rows.
regressor = LinearRegression().fit(x_train, y_train)
y_pred = regressor.predict(x_test)

## Separating the Height Column

In [7]:
# Pull the height column out of the combined frame, keeping it as a 2-D
# single-column array.
height = s2[["height"]].to_numpy()
print(height)

[[130]
 [125]
 [135]
 [133]
 [129]
 [180]
 [190]
 [175]
 [177]
 [185]
 [165]
 [155]
 [160]
 [162]
 [167]
 [174]
 [193]
 [187]
 [183]
 [159]
 [164]
 [166]]


## Separating Features and Splitting Data

In [9]:
# Everything except the height column (position 3) becomes the feature frame:
# the three country indicators on the left, weight/age/gender on the right.
left, right = s2.iloc[:, :3], s2.iloc[:, 4:]
data = pd.concat([left, right], axis="columns")

# Height is the target here; 33% of the rows are held out with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    height,
    test_size=0.33,
    random_state=0,
)

## Training and Prediction with Regression Model

In [10]:
# Second regression: fit on the training split of the height problem and
# predict height for the held-out rows.
r2 = LinearRegression().fit(x_train, y_train)
y_pred = r2.predict(x_test)

## Creating Statistical Model

### Model with All Features

In [11]:
import statsmodels.api as sm

# Feature matrix with a leading column of ones, sized from the data rather
# than a fixed row count.
# NOTE(review): X is not passed to the OLS fit below - confirm whether it is
# still needed.
X = np.append(arr=np.ones((len(data), 1)).astype(int), values=data, axis=1)

# Fit OLS of height on all six feature columns
# (fr, tr, us, weight, age, gender), converted to a float array first.
X_l = data.iloc[:, [0, 1, 2, 3, 4, 5]].values
X_l = np.array(X_l, dtype=float)
model = sm.OLS(height, X_l).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.885
Model:                            OLS   Adj. R-squared:                  0.849
Method:                 Least Squares   F-statistic:                     24.69
Date:                Thu, 25 Jul 2024   Prob (F-statistic):           5.41e-07
Time:                        15:57:19   Log-Likelihood:                -73.950
No. Observations:                  22   AIC:                             159.9
Df Residuals:                      16   BIC:                             166.4
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1           103.4708      9.878     10.475      0.0

### Removing One Feature and Updating Model

In [12]:
# Refit without column 4 (age): keep fr, tr, us, weight and gender.
kept_columns = [0, 1, 2, 3, 5]
X_l = data.iloc[:, kept_columns].to_numpy(dtype=float)
model = sm.OLS(height, X_l).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.884
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                     32.47
Date:                Thu, 25 Jul 2024   Prob (F-statistic):           9.32e-08
Time:                        15:57:42   Log-Likelihood:                -74.043
No. Observations:                  22   AIC:                             158.1
Df Residuals:                      17   BIC:                             163.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1           104.5490      9.193     11.373      0.0

### Removing Another Feature and Updating Model

In [13]:
# Refit once more, now also without gender: only the first four columns
# (fr, tr, us, weight) remain as explanatory variables.
X_l = data.iloc[:, :4].to_numpy(dtype=float)
model = sm.OLS(height, X_l).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.847
Model:                            OLS   Adj. R-squared:                  0.821
Method:                 Least Squares   F-statistic:                     33.16
Date:                Thu, 25 Jul 2024   Prob (F-statistic):           1.52e-07
Time:                        15:58:03   Log-Likelihood:                -77.131
No. Observations:                  22   AIC:                             162.3
Df Residuals:                      18   BIC:                             166.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1           119.8136      7.265     16.491      0.0