# El tratamiento de las variables categóricas

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# Load the e-commerce expense data set; the path is relative to the current working directory.
df = pd.read_csv("../datasets/ecom-expense/Ecom Expense.csv")

In [6]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [86]:
# One indicator (dummy) column per Gender value, named "Gender_<value>".
dummy_gender = pd.get_dummies(data=df.loc[:, "Gender"], prefix="Gender")
dummy_gender.head(5)

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [87]:
# One indicator (dummy) column per City Tier value; the "City" prefix produces
# labels such as "City_Tier 1".
dummy_city_tier = pd.get_dummies(data=df.loc[:, "City Tier"], prefix="City")
dummy_city_tier.head(5)

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [14]:
# Snapshot of the original column labels, reused below when joining the dummies.
# Note: some labels carry stray whitespace (e.g. 'Age ', ' Items '), as the
# displayed list shows.
column_names = df.columns.tolist()
column_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [15]:
# Place the Gender dummies next to the original columns (both frames share
# df's index), then refresh the label snapshot so it also lists the new columns.
df_new = pd.concat([df[column_names], dummy_gender], axis=1)
column_names = df_new.columns.tolist()
df_new.head(5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0


In [16]:
# Selecting column_names first discards any City_* columns left by an earlier
# run of this cell, so the join can be repeated without overlapping columns.
df_new = df_new.loc[:, column_names].join(dummy_city_tier)
df_new.head(5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [43]:
# Predictors for the regression; this version keeps every dummy level
# (both genders, all three city tiers). The order here is the order used when
# pairing names with fitted coefficients.
feature_cols = [
    "Monthly Income",
    "Transaction Time",
    "Gender_Female",
    "Gender_Male",
    "City_Tier 1",
    "City_Tier 2",
    "City_Tier 3",
    "Record",
]

In [44]:
# Predictor matrix and target column for the fit.
X = df_new.loc[:, feature_cols]
Y = df_new.loc[:, "Total Spend"]

In [45]:
# Least-squares fit. fit() returns the estimator itself, so the trailing
# expression displays the fitted model as the cell output.
lm = LinearRegression().fit(X, Y)
lm

LinearRegression()

In [46]:
# Intercept on the first line, coefficient array (same order as feature_cols) after it.
print(lm.intercept_, lm.coef_, sep="\n")

-79.41713030137362
[ 1.47538980e-01  1.54946125e-01 -1.31025013e+02  1.31025013e+02
  7.67643260e+01  5.51389743e+01 -1.31903300e+02  7.72233446e+02]


In [47]:
# Pair each predictor name with its fitted coefficient.
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

[('Monthly Income', 0.1475389804920575),
 ('Transaction Time', 0.15494612549589393),
 ('Gender_Female', -131.02501325554573),
 ('Gender_Male', 131.02501325554596),
 ('City_Tier 1', 76.76432601049554),
 ('City_Tier 2', 55.13897430923221),
 ('City_Tier 3', -131.9033003197279),
 ('Record', 772.2334457445646)]

In [48]:
# R² of the model, evaluated on the same X and Y that were used for fitting.
lm.score(X,Y)

0.9179923586131016

El modelo puede ser escrito como:
    
    * Total_Spend = -79.41713030137362 + 'Monthly Income' * 0.1475389804920575 + 
     'Transaction Time' * 0.15494612549589393 + 'Gender_Female' * -131.02501325554573 +
     'Gender_Male' * 131.02501325554596 + 'City_Tier 1' * 76.76432601049554 + 
     'City_Tier 2' * 55.13897430923221 + 'City_Tier 3' * -131.9033003197279 +
     'Record' * 772.2334457445646

#### Otra forma más simple de calcular las predicciones
* df_new["prediction"] = lm.predict(pd.DataFrame(df_new[feature_cols]))

In [53]:
# Manual prediction: intercept + sum(coefficient_i * feature_i).
# The intercept and coefficients are read from the fitted model instead of being
# pasted as literals, so they cannot go stale when the data or feature_cols
# change. astype(float) keeps the product numeric in case the dummy columns are
# boolean (as in newer pandas versions).
df_new["prediction"] = lm.intercept_ + df_new[feature_cols].astype(float).dot(lm.coef_)

In [55]:
# Preview the frame with the new "prediction" column next to "Total Spend".
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335


In [67]:
# Sum of squared differences between predicted and observed total spend.
SSD = ((df_new["prediction"] - df_new["Total Spend"]) ** 2).sum()

In [68]:
# Display the sum of squared differences.
SSD

1517733985.3408165

In [69]:
# Residual standard error: sqrt(SSD / (n - p - 1)), with n rows and p predictors.
# NOTE(review): p counts every entry of feature_cols, including the complete set
# of dummies per category — confirm this is the intended degrees of freedom.
RSE = np.sqrt(SSD / (df_new.shape[0] - len(feature_cols) - 1))

In [70]:
# Display the residual standard error.
RSE

803.1318809818166

In [71]:
# Mean of the observed target, used to put the RSE in relative terms.
sales_mean = df_new["Total Spend"].mean()

In [72]:
# Display the mean of "Total Spend".
sales_mean

6163.176415976715

In [74]:
# RSE as a fraction of the mean total spend; the cell output shows it as a percentage.
error = RSE / sales_mean
100 * error

13.03113568029416

## Eliminar variables dummy redundantes

In [88]:
# Encode Gender leaving out the first level, so only "Gender_Male" remains and
# the omitted level acts as the reference category.
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender", drop_first=True)
dummy_gender.head(5)

Unnamed: 0,Gender_Male
0,0
1,0
2,1
3,0
4,0


In [89]:
# Encode City Tier leaving out the first level, so only the "City_Tier 2" and
# "City_Tier 3" indicators remain.
dummy_city_tier = pd.get_dummies(df["City Tier"], prefix="City", drop_first=True)
dummy_city_tier.head(5)

Unnamed: 0,City_Tier 2,City_Tier 3
0,0,0
1,1,0
2,1,0
3,0,0
4,1,0


In [90]:
# Rebuild df_new from the original frame plus the reduced dummy sets (all three
# frames share df's index). column_names ends up listing the original columns
# followed by the gender dummies, without the city dummies.
df_new = pd.concat([df, dummy_gender, dummy_city_tier], axis=1)
column_names = [*df.columns, *dummy_gender.columns]
df_new.head(5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Male,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,0,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,1,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,0,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,0,1,0


In [92]:
# Refit using only the non-redundant dummies; the order of feature_cols is the
# order used when pairing names with fitted coefficients.
feature_cols = [
    "Monthly Income",
    "Transaction Time",
    "Gender_Male",
    "City_Tier 2",
    "City_Tier 3",
    "Record",
]
X = df_new.loc[:, feature_cols]
Y = df_new.loc[:, "Total Spend"]
# fit() returns the estimator, so the trailing expression shows the fitted model.
lm = LinearRegression().fit(X, Y)
lm

LinearRegression()

In [93]:
# Intercept of the most recently fitted model.
print(lm.intercept_)

-133.67781754642056


In [94]:
# Pair each remaining predictor with its fitted coefficient.
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

[('Monthly Income', 0.14753898049205752),
 ('Transaction Time', 0.1549461254958957),
 ('Gender_Male', 262.0500265110946),
 ('City_Tier 2', -21.62535170126243),
 ('City_Tier 3', -208.66762633022333),
 ('Record', 772.2334457445633)]

In [95]:
# R² of the refitted model, evaluated on the same X and Y that were used for fitting.
lm.score(X,Y)

0.9179923586131016

Coeficientes con todas las variables en el modelo
* ('Monthly Income', 0.1475389804920575),
* ('Transaction Time', 0.15494612549589393),
* ('Gender_Female', -131.02501325554573),
* ('Gender_Male', 131.02501325554596),
* ('City_Tier 1', 76.76432601049554),
* ('City_Tier 2', 55.13897430923221),
* ('City_Tier 3', -131.9033003197279),
* ('Record', 772.2334457445646)
    
Coeficientes tras enmascarar las variables dummy pertinentes
* ('Monthly Income', 0.14753898049205752),
* ('Transaction Time', 0.1549461254958957),
* ('Gender_Male', 262.0500265110946),
* ('City_Tier 2', -21.62535170126243),
* ('City_Tier 3', -208.66762633022333),
* ('Record', 772.2334457445633)

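Los cambios se reflejan en:
* Intercepto (absorbe los coeficientes de las categorías de referencia eliminadas, Gender_Female y City_Tier 1): 
    * antes -> -79.42, 
    * después -> -133.68 = (-79.42 + (-131.02) + 76.76)
* Gender_Male: 
    * antes -> 131.02, 
    * después -> 262.05 = (131.02 - (-131.02))
* Gender_Female: 
    * antes -> -131.02, 
    * después -> 0 (categoría de referencia)
* CT1: 
    * antes -> 76.76, 
    * después -> 0 (categoría de referencia)
* CT2: 
    * antes -> 55.13, 
    * después -> -21.62 = (55.13 - 76.76)
* CT3: 
    * antes -> -131.90, 
    * después -> -208.66 = (-131.90 - 76.76)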