In [1]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import pandas as pd


In [4]:
# Load the advertising dataset from the repository's relative datasets folder.
ads_csv_path = '../datasets/ads/Advertising.csv'
data = pd.read_csv(ads_csv_path)

In [5]:
# Candidate predictor columns for the advertising model.
features_cols = [
    'TV',
    'Radio',
    'Newspaper',
]

In [7]:
# Predictor matrix (candidate columns) and target (Sales) for the ads data.
x, y = data[features_cols], data['Sales']

In [8]:
# Recursive feature elimination: repeatedly fit a linear-kernel SVR and drop
# one feature per iteration (step=1) until only two features remain.
estimator = SVR(kernel='linear')
# n_features_to_select is passed by keyword: recent scikit-learn releases made
# it keyword-only, so the positional form `RFE(estimator, 2, ...)` raises a
# TypeError there. The keyword form works on old and new versions alike.
selector = RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x, y)

In [9]:
# Boolean mask aligned with the columns of x: True for features kept by RFE.
selector.support_

array([ True,  True, False])

In [11]:
# Feature ranking aligned with the columns of x; rank 1 marks a selected feature.
selector.ranking_

array([1, 1, 2])

In [13]:
# Fit an ordinary least-squares model on the two columns RFE kept above.
selected_cols = ['TV', 'Radio']
x_pred = x[selected_cols]
lm = LinearRegression().fit(x_pred, y)
lm  # display the fitted estimator (fit() returns the estimator itself)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
# Intercept of the TV + Radio linear model fitted above.
lm.intercept_

2.9210999124051362

In [15]:
# Slope coefficients, in the column order of x_pred (TV, Radio).
lm.coef_

array([0.04575482, 0.18799423])

In [16]:
# Coefficient of determination (R^2), evaluated on the same data used for fitting.
lm.score(x_pred, y)  # R^2

0.8971942610828956

<div class='alert alert-info'>
    <h3>Variables Categóricas</h3>
</div>

In [2]:
import numpy as np

In [3]:
# Load the e-commerce expense dataset and preview the first rows.
ecom_csv_path = '../datasets/ecom-expense/Ecom Expense.csv'
df = pd.read_csv(ecom_csv_path)
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [4]:
# One-hot encode the two categorical columns: one indicator column per level.
gender_col = df['Gender']
city_col = df['City Tier']
dummy_gender = pd.get_dummies(gender_col, prefix='G')
dummy_city = pd.get_dummies(city_col, prefix='City')
dummy_city.head()

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


Para trabajar con variables categóricas creamos tantas variables nuevas como categorías tenga la columna, y ponemos un 1 si la observación pertenece a esa categoría y un 0 si no pertenece, como vemos en los dos ejemplos anteriores. Por ejemplo, para el género, si la persona es hombre tendrá un 1 en hombre y un 0 en mujer.

In [5]:
# Plain Python list with the original column labels of df.
columns_names = df.columns.tolist()
columns_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [6]:
# Append the indicator columns to the original columns (joined on the row index).
base_cols = df[columns_names]
with_gender = base_cols.join(dummy_gender)
df_new = with_gender.join(dummy_city)
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,G_Female,G_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [7]:
# Remove the raw categorical columns now that their indicator columns exist.
# Re-running this cell alone fails, because the columns are already gone.
df_new = df_new.drop(columns=['Gender', 'City Tier'])
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Total Spend,G_Female,G_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,3254.160485,1,0,0,1,0


In [8]:
# Predictors for the first e-commerce model. Every level of both categorical
# variables is included here (no reference level is dropped).
features_cols = [
    'Monthly Income',
    'Transaction Time',
    'G_Female',
    'G_Male',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
    'Record',
]

In [9]:
# Predictor matrix and target (Total Spend) for the e-commerce model.
x, y = df_new[features_cols], df_new['Total Spend']

In [10]:
# Ordinary least-squares fit of Total Spend on all predictors in x.
lm = LinearRegression()
lm.fit(x, y);

In [11]:
# Show the intercept and then the coefficient vector, one per line.
print(lm.intercept_, lm.coef_, sep='\n')

-79.4171303013718
[ 1.47538980e-01  1.54946125e-01 -1.31025013e+02  1.31025013e+02
  7.67643260e+01  5.51389743e+01 -1.31903300e+02  7.72233446e+02]


In [12]:
# Pair each predictor name with its fitted coefficient.
[(name, coef) for name, coef in zip(features_cols, lm.coef_)]

[('Monthly Income', 0.14753898049205738),
 ('Transaction Time', 0.15494612549589634),
 ('G_Female', -131.02501325554624),
 ('G_Male', 131.02501325554607),
 ('City_Tier 1', 76.76432601049513),
 ('City_Tier 2', 55.1389743092325),
 ('City_Tier 3', -131.9033003197277),
 ('Record', 772.2334457445645)]

Vemos que, si sumamos los coeficientes de los géneros o los coeficientes de las ciudades, el resultado es cero, ya que son variables complementarias: en cada fila, las variables dummy de una misma categoría suman siempre 1.

In [47]:
# R^2 of the model, evaluated on the same data it was fitted on.
lm.score(x, y)

0.9179923586131016

In [62]:
# Store the in-sample predictions next to the observed values.
predictors = df_new[features_cols]
df_new['Predict'] = lm.predict(predictors)

In [63]:
# Preview the frame, now including the 'Predict' column.
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Total Spend,G_Female,G_Male,City_Tier 1,City_Tier 2,City_Tier 3,Predict
0,TXN001,42,10,7313,627.668127,5,4198.385084,1,0,1,0,0,4903.69672
1,TXN002,24,8,17747,126.904567,3,4134.976648,1,0,0,1,0,4799.434826
2,TXN003,47,11,22845,873.469701,2,5166.614455,0,1,0,1,0,5157.082504
3,TXN004,50,11,18552,380.219428,7,7784.447676,1,0,1,0,0,8068.012996
4,TXN005,60,2,14439,403.374223,2,3254.160485,1,0,0,1,0,3581.980335


In [64]:
# Sum of squared deviations between predicted and observed Total Spend.
residuals = df_new['Predict'] - df_new['Total Spend']
SSD = (residuals ** 2).sum()

In [65]:
# Residual standard error: sqrt(SSD / (n - p - 1)), with n rows and p predictors.
n_obs = len(df_new)
n_predictors = len(features_cols)
# NOTE(review): features_cols lists every dummy level of Gender and City Tier,
# so p here may overstate the number of independent predictors; confirm that
# this is the intended degrees-of-freedom correction.
RSE = np.sqrt(SSD / (n_obs - n_predictors - 1))
RSE  # standard deviation of the residuals

803.1318809818165

In [66]:
# Relative error: RSE as a fraction of the mean of the target (Total Spend).
# The name `sales_mean` is kept as-is even though the target here is Total Spend.
sales_mean = df_new['Total Spend'].mean()
error = RSE / sales_mean
error

0.13031135680294162

# Enmascarar variables categóricas

In [67]:
# Use one indicator fewer than the number of levels of the categorical
# variable: the first level becomes the reference category.
# drop_first=True states that intent directly, replacing the positional
# slice .iloc[:, 1:] over the full set of dummies.
dummy_G = pd.get_dummies(df['Gender'], prefix='G', drop_first=True)
dummy_G.head()

Unnamed: 0,G_Male
0,0
1,0
2,1
3,0
4,0


In [68]:
# One indicator per City Tier level except the first, which becomes the
# reference category. drop_first=True replaces the manual .iloc[:, 1:] slice.
dummy_C = pd.get_dummies(df['City Tier'], prefix='C', drop_first=True)
dummy_C.head()

Unnamed: 0,C_Tier 2,C_Tier 3
0,0,0
1,1,0
2,1,0
3,0,0
4,1,0


In [72]:
# Rebuild the modelling frame from the original data: append the reduced
# dummy sets, then drop the raw categorical columns.
columns_names = df.columns.tolist()
df_new = (
    df[columns_names]
    .join(dummy_G)
    .join(dummy_C)
    .drop(columns=['Gender', 'City Tier'])
)
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Total Spend,G_Male,C_Tier 2,C_Tier 3
0,TXN001,42,10,7313,627.668127,5,4198.385084,0,0,0
1,TXN002,24,8,17747,126.904567,3,4134.976648,0,1,0
2,TXN003,47,11,22845,873.469701,2,5166.614455,1,1,0
3,TXN004,50,11,18552,380.219428,7,7784.447676,0,0,0
4,TXN005,60,2,14439,403.374223,2,3254.160485,0,1,0


In [75]:
# Predictors for the second model: one dummy level per categorical variable
# is left out (G_Female and City Tier 1 are absent from this list).
features_cols = [
    'Monthly Income',
    'Transaction Time',
    'G_Male',
    'C_Tier 2',
    'C_Tier 3',
    'Record',
]

In [77]:
# Refit the linear model on the reduced dummy set, then show the intercept
# and the (name, coefficient) pairs.
x, y = df_new[features_cols], df_new['Total Spend']
lm = LinearRegression()
lm.fit(x, y)
print(lm.intercept_)
[(name, coef) for name, coef in zip(features_cols, lm.coef_)]

-133.67781754642238


[('Monthly Income', 0.14753898049205744),
 ('Transaction Time', 0.1549461254959002),
 ('G_Male', 262.0500265110948),
 ('C_Tier 2', -21.62535170126276),
 ('C_Tier 3', -208.66762633022296),
 ('Record', 772.2334457445636)]

In [78]:
# R^2 of the reduced-dummy model, evaluated on the same data it was fitted on.
lm.score(x, y)

0.9179923586131016

### Veamos los cambios:
```
* G_male:    antes  ->  131.02,   despues ->  262.05 ≈ (131.02 - (-131.02))
* G_female:  antes  ->  -131.02,  despues ->  0
* CT1:       antes  ->  76.76,    despues ->  0
* CT2:       antes  ->  55.13,    despues ->  -21.62 ≈ (55.13 - 76.76)
* CT3:       antes  ->  -131.90,  despues ->  -208.66 ≈ (-131.90 - 76.76)
```