In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the bike-buyers CSV into `df` and work on a separate deep copy so the
# freshly loaded frame stays untouched by the cleaning steps.
path = 'bike_buyers.csv'
df = pd.read_csv(filepath_or_buffer=path)
df_copy = df.copy(deep=True)

In [3]:
# Preview the first five rows of the working copy.
df_copy.head(n=5)

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000.0,1.0,Bachelors,Skilled Manual,Yes,0.0,0-1 Miles,Europe,42.0,No
1,24107,Married,Male,30000.0,3.0,Partial College,Clerical,Yes,1.0,0-1 Miles,Europe,43.0,No
2,14177,Married,Male,80000.0,5.0,Partial College,Professional,No,2.0,2-5 Miles,Europe,60.0,No
3,24381,Single,,70000.0,0.0,Bachelors,Professional,Yes,1.0,5-10 Miles,Pacific,41.0,Yes
4,25597,Single,Male,30000.0,0.0,Bachelors,Clerical,No,0.0,0-1 Miles,Europe,36.0,Yes


# Limpieza de los datos

Vamos a limpiar nuestro dataset para seleccionar nuestras variables independientes X y nuestra variable dependiente y, además de procesar los datos para transformar los valores categóricos a su representación numérica.

Revisaremos primero si podemos sustituir valores nulos de algunas columnas para no remover filas de nuestro dataset

In [4]:
# Count the missing values in each column.
df_copy.isnull().sum()

ID                   0
Marital Status       7
Gender              11
Income               6
Children             8
Education            0
Occupation           0
Home Owner           4
Cars                 9
Commute Distance     0
Region               0
Age                  8
Purchased Bike       0
dtype: int64

In [5]:
# Remove every row that lacks a value in any of the listed columns
# (Gender is not in the list, so rows missing only Gender are kept),
# then recount the remaining missing values per column.
df_copy = df_copy.dropna(
    subset=['Income', 'Children', 'Home Owner', 'Cars', 'Age', 'Marital Status'],
)
df_copy.isnull().sum()

ID                   0
Marital Status       0
Gender              10
Income               0
Children             0
Education            0
Occupation           0
Home Owner           0
Cars                 0
Commute Distance     0
Region               0
Age                  0
Purchased Bike       0
dtype: int64

In [6]:
# List the distinct Gender values, including NaN if present.
df_copy['Gender'].unique()

array(['Female', 'Male', nan], dtype=object)

In [7]:
# Label missing genders as 'Other' instead of dropping those rows.
# The result is assigned back to the column: calling an inplace method on the
# `df_copy['Gender']` selection is chained assignment, which newer pandas
# warns about and which leaves `df_copy` unchanged under Copy-on-Write.
df_copy['Gender'] = df_copy['Gender'].fillna('Other')

In [8]:
# Re-check the distinct Gender values after the replacement.
df_copy['Gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [9]:
# Confirm how many missing values remain in each column.
df_copy.isnull().sum()

ID                  0
Marital Status      0
Gender              0
Income              0
Children            0
Education           0
Occupation          0
Home Owner          0
Cars                0
Commute Distance    0
Region              0
Age                 0
Purchased Bike      0
dtype: int64

In [10]:
# Renumber the rows 0..n-1 after the drops, discarding the old index.
df_copy = df_copy.reset_index(drop=True)

# Transformación de los datos

Vamos a obtener la representación numérica de las variables categóricas del conjunto de datos

In [11]:
# One-hot encode every categorical column, keeping all levels (drop_first=False).
# The indicator dtype is pinned to uint8 so the columns are 0/1 integers on
# every pandas version; pandas >= 2.0 defaults to bool, which combined with the
# float columns would turn `np.asarray(df_dummies)` into an object array.
df_dummies = pd.get_dummies(df_copy, drop_first=False, dtype=np.uint8)

In [12]:
# 'Purchased Bike_Yes' alone carries the target, so the complementary
# 'Purchased Bike_No' indicator is removed.
df_dummies = df_dummies.drop(columns=['Purchased Bike_No'])

In [13]:
# Preview the first five rows of the encoded frame.
df_dummies.head(n=5)

Unnamed: 0,ID,Income,Children,Cars,Age,Marital Status_Married,Marital Status_Single,Gender_Female,Gender_Male,Gender_Other,...,Home Owner_Yes,Commute Distance_0-1 Miles,Commute Distance_1-2 Miles,Commute Distance_10+ Miles,Commute Distance_2-5 Miles,Commute Distance_5-10 Miles,Region_Europe,Region_North America,Region_Pacific,Purchased Bike_Yes
0,12496,40000.0,1.0,0.0,42.0,1,0,1,0,0,...,1,1,0,0,0,0,1,0,0,0
1,24107,30000.0,3.0,1.0,43.0,1,0,0,1,0,...,1,1,0,0,0,0,1,0,0,0
2,14177,80000.0,5.0,2.0,60.0,1,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
3,24381,70000.0,0.0,1.0,41.0,0,1,0,0,1,...,1,0,0,0,0,1,0,0,1,1
4,25597,30000.0,0.0,0.0,36.0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,1


# Selección de atributos y modelo logístico

In [14]:
# Feature matrix: every column except the target, as a NumPy array.
# NOTE(review): this keeps the 'ID' column as a feature — confirm that is intended.
X = df_dummies.drop(columns=['Purchased Bike_Yes']).to_numpy()
# Target vector: 1 when the customer purchased a bike, 0 otherwise.
y = df_dummies['Purchased Bike_Yes'].astype('int').to_numpy()

## Entrenamiento/Pruebas del modelo

Dividiremos los datos en un 80% para entrenamiento y el 20% restante para probar el modelo

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (769, 30) (769,)
Test set: (193, 30) (193,)


# Modelos de clasificación

## Random Forest

In [16]:
# Random forest with a small cost-complexity pruning penalty (ccp_alpha).
# random_state is fixed so the fitted forest, and therefore the metrics
# reported from it, are the same on every run.
RF = RandomForestClassifier(ccp_alpha=0.001, random_state=4)
RF.fit(X_train, y_train)

Ahora vamos a predecir usando nuestros datos de pruebas

In [17]:
# Predicted class labels from the random forest for the held-out rows.
yhat_rf = RF.predict(X=X_test)

## SVM

In [18]:
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder 
from sklearn import svm

In [19]:
# Class balance of the target: how many rows fall in each class.
df_dummies.loc[:, 'Purchased Bike_Yes'].value_counts()

0    503
1    459
Name: Purchased Bike_Yes, dtype: int64

In [20]:
# Split the encoded frame into the SVM's feature frame and target series.
inputs = df_dummies.drop(columns=['Purchased Bike_Yes'])
target = df_dummies['Purchased Bike_Yes']

In [21]:
# Stratified 80/20 split for the SVM so both classes keep their proportions.
# random_state pins the shuffle; without it each run trains and evaluates on
# a different partition and the reported metrics cannot be reproduced.
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    inputs,
    target,
    test_size=0.2,
    stratify=target,
    random_state=4,
)

In [22]:
# Fit a [-1, 1] min-max scaler on the training features only and rescale them;
# the fitted `scaling` object is reused later for the test features.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train_svm_rescaled = scaling.fit_transform(X_train_svm)

In [23]:
# Regularisation strength passed to the support-vector classifier.
C = 1.0
# Polynomial-kernel SVC trained on the rescaled training features.
svc = svm.SVC(C=C, kernel='poly')
svc = svc.fit(X_train_svm_rescaled, y_train_svm)

In [24]:
# Rescale the test features with the already-fitted scaler, then predict.
yhat_svm = svc.predict(X=scaling.transform(X=X_test_svm))

## Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB

In [31]:
# Multinomial Naive Bayes trained on the same split as the random forest.
# NOTE(review): X_train holds raw, unscaled columns (e.g. ID, Income, Age)
# next to the 0/1 indicators — confirm this model suits such features.
NB = MultinomialNB(alpha=1.0)
NB.fit(X_train, y_train)

In [32]:
# Predicted class labels from Naive Bayes for the held-out rows.
yhat_nb = NB.predict(X=X_test)

# Evaluación

## Resultados Random Forest

In [28]:
# Per-class precision/recall/F1 for the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=yhat_rf))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77       114
           1       0.67      0.71      0.69        79

    accuracy                           0.74       193
   macro avg       0.73      0.73      0.73       193
weighted avg       0.74      0.74      0.74       193



## Resultados SVM

In [29]:
# Per-class precision/recall/F1 for the SVM on its own stratified test split.
print(classification_report(y_true=y_test_svm, y_pred=yhat_svm))

              precision    recall  f1-score   support

           0       0.63      0.65      0.64       101
           1       0.61      0.59      0.60        92

    accuracy                           0.62       193
   macro avg       0.62      0.62      0.62       193
weighted avg       0.62      0.62      0.62       193



## Resultados NB

In [33]:
# Per-class precision/recall/F1 for Naive Bayes on the test split.
print(classification_report(y_true=y_test, y_pred=yhat_nb))

              precision    recall  f1-score   support

           0       0.62      0.61      0.62       114
           1       0.46      0.47      0.46        79

    accuracy                           0.55       193
   macro avg       0.54      0.54      0.54       193
weighted avg       0.56      0.55      0.56       193

