# Régression linéaire multiple

In [1]:
# Importer les librairies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

---
## Les entreprises

### Import des données

In [2]:
# Load the companies dataset from the CSV file
data = pd.read_csv(filepath_or_buffer='entreprises.csv')
# Preview the first five rows
data.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [8]:
# Feature matrix X: every column of the dataset except the target 'Profit'
X = data.drop(columns='Profit')
X.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [9]:
# Target vector y: the 'Profit' column
y = data.loc[:, 'Profit']
y.head(5)

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

### Gestion des variables catégoriques

In [10]:
# One-hot encode the categorical 'State' column with pd.get_dummies.
# The features are rebuilt from `data` instead of mutating X in place, so this
# cell can be re-run safely (the in-place drop of 'State' failed on a second run).
features = data.drop('Profit', axis=1)
# dtype=int keeps the indicator columns numeric (0/1); recent pandas versions
# return bool indicators by default, which yields an object-typed design matrix
# once mixed with the float columns.
dummy = pd.get_dummies(features['State'], dtype=int)
# Drop 'State' (now encoded) and 'California', which becomes the reference
# level: keeping all indicators would be collinear with the intercept.
X = pd.concat([features, dummy], axis=1).drop(['State', 'California'], axis=1)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [None]:
# Alternative encoding with scikit-learn's OneHotEncoder.
# The code is wrapped in a string literal, so it is not executed.
# It rebuilds X and y as NumPy arrays from `data`, encodes column 3 of X,
# and stacks the first three columns with all encoded columns but the first.
'''
X = data.iloc[:, :-1].values
y = data.iloc[:, -1:].values

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categories ='auto')
enc = enc.fit_transform(X[:,3].reshape(-1,1)).toarray()
X = np.hstack((X[:,:3],enc[:,1:]))
X[:5,:]
'''

### Création d'un échantillon d'entraînement et d'un échantillon de test

In [None]:
# Manual ("by hand") train/test split.
# The code is wrapped in a string literal, so it is not executed.
# One uniform random number is drawn per row; rows whose draw is below the
# 80th percentile of the draws form the training set, the others the test set.
'''
tirage = np.random.rand(len(data))
msk =  tirage < np.percentile(tirage,80)
X_train = X[msk]
X_test = X[~msk]
y_train = y[msk]
y_test = y[~msk]
'''
# NOTE(review): `train` and `test` are not defined in the visible cells.
#print(len(train),len(test))

In [11]:
# Split with scikit-learn: 10% of the rows are held out for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same fitted coefficients in the cells below.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

### Construction du modèle et prédictions

#### Avec Scikit-learn

In [None]:
# Fit a linear regression with scikit-learn on the training sample
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
results = model  # fit() returns the estimator itself

# Table comparing the observed test profits with the model's predictions
test_pred = pd.DataFrame({'profit': y_test})
test_pred = test_pred.assign(profit_pred=model.predict(X_test))
test_pred

In [None]:
# Report the training R² together with the fitted intercept and coefficients
report = [
    ('score : ', model.score(X_train, y_train)),
    ('\nconstante : ', results.intercept_),
    ('\ncoefficients : ', results.coef_),
]
# Flatten the (label, value) pairs so print() receives the same six arguments
print(*(piece for pair in report for piece in pair))

#### Avec statsmodels

In [13]:
# Build the model with statsmodels
from statsmodels.api import OLS, add_constant
# The intercept column is added explicitly to the training features
X_train_const = add_constant(X_train)
model = OLS(endog=y_train, exog=X_train_const)
results = model.fit()

In [14]:
# Model evaluation: display the summary table of the fitted statsmodels results
results.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.949
Model:,OLS,Adj. R-squared:,0.942
Method:,Least Squares,F-statistic:,143.8
Date:,"Wed, 18 Dec 2019",Prob (F-statistic):,4.89e-24
Time:,16:53:54,Log-Likelihood:,-473.68
No. Observations:,45,AIC:,959.4
Df Residuals:,39,BIC:,970.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.031e+04,7533.849,6.678,0.000,3.51e+04,6.55e+04
R&D Spend,0.7900,0.055,14.390,0.000,0.679,0.901
Administration,-0.0250,0.056,-0.445,0.659,-0.139,0.089
Marketing Spend,0.0332,0.019,1.725,0.092,-0.006,0.072
Florida,-986.4818,3794.280,-0.260,0.796,-8661.138,6688.174
New York,-522.3850,3470.586,-0.151,0.881,-7542.307,6497.537

0,1,2,3
Omnibus:,16.553,Durbin-Watson:,2.396
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.164
Skew:,-1.083,Prob(JB):,3.43e-06
Kurtosis:,5.954,Cond. No.,1480000.0


In [None]:
# New predictions on the held-out test rows.
# has_constant='add' forces the intercept column to be added: with the default
# ('skip'), add_constant leaves the data untouched when a column of the small
# test set happens to be constant and non-zero (e.g. every row is 'New York'),
# and predict() then fails because the column count no longer matches the fit.
results.predict(add_constant(X_test, has_constant='add'))

---
## Les appartements

### Import des données

In [None]:
# Load the apartments dataset from the CSV file
data = pd.read_csv(filepath_or_buffer='apparts.csv')
# Preview the first five rows
data.head(5)

In [None]:
# Conversion to m2: scale the 'taille_en_pieds_carre' column by a fixed factor
SQFT_TO_M2 = 0.092903
data['taille_m2'] = data['taille_en_pieds_carre'] * SQFT_TO_M2
data.head(5)

In [None]:
# Feature matrix X: size in m2 and number of bedrooms
feature_columns = ['taille_m2', 'nb_chambres']
X = data.loc[:, feature_columns]
X.head(5)

In [None]:
# Target vector y: the 'prix' column
y = data.loc[:, 'prix']
y.head(5)

### Construction du modèle et prédictions

In [None]:
# Build the model
from statsmodels.api import OLS, add_constant
# statsmodels' OLS does not include an intercept by default, so the constant
# column is added explicitly, as in the companies model earlier in the notebook.
# Without it the regression is forced through the origin.
model = OLS(y, add_constant(X))
res = model.fit()
res.summary()

### Exo

In [None]:
# Try to redo the analysis after standardizing the explanatory variables:
# for each variable (taille, nb_chambres),
# subtract the mean and divide by the standard deviation

---
## Les salaires

### Import des données

In [None]:
# Load the salaries dataset from the CSV file
data = pd.read_csv(filepath_or_buffer='salaires.csv')
# Display the whole table
data

In [None]:
# Scatter plot of salary against level, drawn on an explicit Axes with labelled
# axes and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data['Level'], data['Salary'])
ax.set_xlabel('Level')
ax.set_ylabel('Salary')
ax.set_title('Salary vs. Level');  # trailing ';' hides the text repr of the return value