In [None]:
import pandas as pd

# Load the purchases CSV; header='infer' lets pandas take the column names from the file.
data = pd.read_csv('BlackFriday.csv', header='infer')

# Keep an untouched copy of the loaded frame; `data` itself is rebound by later cells.
data_copy = data.copy()

# Last expression of the cell: show the loaded frame as the cell output.
data

In [None]:
# Narrow the frame to the modelling columns: three predictors plus the 'Gender' target.
# Selecting from the pristine `data_copy` keeps this cell re-runnable after later cells
# reshape `data`, and the explicit .copy() makes `data` an independent frame so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
data = data_copy[['Age', 'Marital_Status', 'Purchase', 'Gender']].copy()

data

In [None]:
# Bin 'Purchase' into three equal-frequency buckets labelled 0, 1 and 2.
# The raw amounts are read from `data_copy` (same index as `data`) so re-running the cell
# never tries to re-bin the already binned column, and .assign() builds a new frame
# instead of writing into a frame that may be flagged as a slice.
data = data.assign(Purchase=pd.qcut(data_copy['Purchase'], q=3, labels=[0, 1, 2]))

data

In [None]:
# Mark the two label-like columns as categorical. astype() returns a new frame, so nothing
# is written in place into a frame that may still be flagged as a slice of the raw data.
data = data.astype({'Marital_Status': 'category', 'Gender': 'category'})

print(data)
print(data.dtypes)

In [None]:
# One-hot encode the 'Age' column and swap the indicator columns in for the original,
# keeping them at the front of the frame.
df = pd.get_dummies(data['Age'])
data = pd.concat([df, data.drop(columns=['Age'])], axis=1)

data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the 'Gender' target, then hold out 40% of the rows
# for evaluation; the fixed random_state makes the split repeatable.
X = data.drop(columns=['Gender'])
Y = data['Gender']

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

# Entropy-based decision tree limited to depth 4.
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
# Fit on the training split only: fitting on the full X/Y would let the model see the
# held-out rows and make the test accuracy below meaningless.
clf = clf.fit(x_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(x_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(x_test, y_test)))

plt.figure(figsize=(25, 10))
tree.plot_tree(
    clf,
    # Take the names from the fitted data: the target column is not a feature, and
    # plot_tree expects class names in the same ascending order as clf.classes_.
    feature_names=[str(column) for column in x_train.columns],
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    fontsize=12,
)
plt.show()

# Test-set predictions, kept for the confusion-matrix cell.
test_pred_decision_tree = clf.predict(x_test)

In [None]:
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Pin the label order explicitly so matrix rows/columns line up with the tick labels;
# left to its default, scikit-learn orders the labels by sorting them.
# NOTE(review): assumes 'Gender' is coded as 'M' / 'F', as the original tick labels did.
class_labels = ['M', 'F']
confusion_matrix = metrics.confusion_matrix(y_test, test_pred_decision_tree, labels=class_labels)

matrix_df = pd.DataFrame(confusion_matrix, index=class_labels, columns=class_labels)

# Create the figure and its axes together so figsize applies to the heatmap and no
# stray empty figure is produced.
sns.set(font_scale=1.3)
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(matrix_df, annot=True, fmt='g', ax=ax, cmap='magma')

ax.set_title('Confusion Matrix - Decision Tree')
ax.set_xlabel('Predicted label', fontsize=15)
ax.set_xticklabels(class_labels)
ax.set_ylabel('True Label', fontsize=15)
ax.set_yticklabels(class_labels, rotation=0)

plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using the Minkowski metric with p=2 (Euclidean distance).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(x_train, y_train)

# Message typo fixed ("Accuracty" -> "Accuracy") so it matches the other model reports.
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(x_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(x_test, y_test)))

# Test-set predictions, kept for the confusion-matrix cell.
test_pred_knn = knn.predict(x_test)

In [None]:
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Pin the label order explicitly so matrix rows/columns line up with the tick labels;
# left to its default, scikit-learn orders the labels by sorting them.
# NOTE(review): assumes 'Gender' is coded as 'M' / 'F', as the other confusion-matrix cells do.
class_labels = ['M', 'F']
confusion_matrix = metrics.confusion_matrix(y_test, test_pred_knn, labels=class_labels)

matrix_df = pd.DataFrame(confusion_matrix, index=class_labels, columns=class_labels)

# Create the figure and its axes together so figsize applies to the heatmap and no
# stray empty figure is produced.
sns.set(font_scale=1.3)
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(matrix_df, annot=True, fmt='g', ax=ax, cmap='magma')

ax.set_title('Confusion Matrix - KNN')
# Axis labels and an explicit show(), matching the other confusion-matrix cells.
ax.set_xlabel('Predicted label', fontsize=15)
ax.set_xticklabels(class_labels, fontsize=15)
ax.set_ylabel('True Label', fontsize=15)
ax.set_yticklabels(class_labels, rotation=0)

plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline; fit() returns the estimator itself, so construction
# and training can be chained.
gnb = GaussianNB().fit(x_train, y_train)

print('Accuracy of GNB classifier on training set: {:.2f}'.format(
    gnb.score(x_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'.format(
    gnb.score(x_test, y_test)))

# Test-set predictions, kept for the confusion-matrix cell.
test_pred_gnb = gnb.predict(x_test)

In [None]:
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Pin the label order explicitly so matrix rows/columns line up with the tick labels;
# left to its default, scikit-learn orders the labels by sorting them.
# NOTE(review): assumes 'Gender' is coded as 'M' / 'F', as the original tick labels did.
class_labels = ['M', 'F']
confusion_matrix = metrics.confusion_matrix(y_test, test_pred_gnb, labels=class_labels)

matrix_df = pd.DataFrame(confusion_matrix, index=class_labels, columns=class_labels)

# Create the figure and its axes together so figsize applies to the heatmap and no
# stray empty figure is produced.
sns.set(font_scale=1.3)
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(matrix_df, annot=True, fmt='g', ax=ax, cmap='magma')

# This matrix is built from the Gaussian naive Bayes predictions, so title it as such
# (it was previously labelled "KNN").
ax.set_title('Confusion Matrix - GNB')
ax.set_xlabel('Predicted label', fontsize=15)
ax.set_xticklabels(class_labels, fontsize=15)
ax.set_ylabel('True Label', fontsize=15)
ax.set_yticklabels(class_labels, rotation=0)

plt.show()

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings; fit() returns the
# estimator itself, so construction and training can be chained.
svm = SVC().fit(x_train, y_train)

print('Accuracy of SVM classifier on training set: {:.2f}'.format(
    svm.score(x_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(
    svm.score(x_test, y_test)))

# Test-set predictions, kept for the confusion-matrix cell.
test_pred_svm = svm.predict(x_test)

In [None]:
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Pin the label order explicitly so matrix rows/columns line up with the tick labels;
# left to its default, scikit-learn orders the labels by sorting them.
# NOTE(review): assumes 'Gender' is coded as 'M' / 'F', as the original tick labels did.
class_labels = ['M', 'F']
confusion_matrix = metrics.confusion_matrix(y_test, test_pred_svm, labels=class_labels)

matrix_df = pd.DataFrame(confusion_matrix, index=class_labels, columns=class_labels)

# Create the figure and its axes together so figsize applies to the heatmap and no
# stray empty figure is produced.
sns.set(font_scale=1.3)
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(matrix_df, annot=True, fmt='g', ax=ax, cmap='magma')

ax.set_title('Confusion Matrix - SVC')
ax.set_xlabel('Predicted label', fontsize=15)
ax.set_xticklabels(class_labels, fontsize=15)
ax.set_ylabel('True Label', fontsize=15)
ax.set_yticklabels(class_labels, rotation=0)

plt.show()

In [None]:
class OneR(object):
  """One-rule (OneR) classifier: predict the label from a single feature.

  For every feature, each distinct value is mapped to the label it most often
  co-occurs with; the feature whose rules reproduce the most labels is kept.

  Attributes:
    ideal_variable: column label of the best feature found by the last fit(),
      or None before fitting.
    max_accuracy: fraction of the fitted rows that the best feature's rules
      classify correctly (0 before fitting).
    rules_: {feature value: predicted label} mapping for ideal_variable.
    default_label_: most frequent label seen by fit(); predict() falls back to
      it for feature values that were not seen during fitting.
  """

  def __init__(self):
    self.ideal_variable = None
    self.max_accuracy = 0
    self.rules_ = dict()
    self.default_label_ = None

  def fit(self, X, y):
    """Derive one rule set per feature and remember the most accurate one.

    Args:
      X: feature data, anything pd.DataFrame() accepts (one column per feature).
      y: labels, one per row of X (index-aligned with X when both are pandas objects).

    Returns:
      A list with one dict per feature, in column order:
      {'variable': str(column), 'accuracy': float, 'rules': {value: label}}.

    Raises:
      ValueError: if y is empty.
    """
    # Reset the fitted state so a refit never keeps the winner of an earlier fit.
    self.ideal_variable = None
    self.max_accuracy = 0
    self.rules_ = dict()
    self.default_label_ = None

    n_samples = len(y)
    if n_samples == 0:
      raise ValueError('OneR.fit needs at least one labelled sample')

    response = list()
    dfx = pd.DataFrame(X)
    self.default_label_ = pd.Series(y).value_counts().idxmax()

    for i in dfx:
      join_data = pd.DataFrame({'variable': dfx[i], 'label': y})
      cross_table = pd.crosstab(join_data.variable, join_data.label)

      # Rule: each feature value predicts the label it co-occurs with most often.
      rules = cross_table.idxmax(axis=1).to_dict()

      # Predicting the majority label of every value gets exactly the row-wise
      # maximum of the contingency table right, so no per-row loop is needed.
      correct = int(cross_table.max(axis=1).sum())
      accuracy = correct / n_samples

      if accuracy > self.max_accuracy:
        self.max_accuracy = accuracy
        self.ideal_variable = i
        self.rules_ = rules

      response.append({'variable': str(i), 'accuracy': accuracy, 'rules': rules})

    return response

  def predict(self, X=None):
    """Predict labels for X using the rules of the best feature.

    Args:
      X: feature data containing the column stored in ideal_variable.

    Returns:
      A numpy array with one predicted label per row of X. Values of the
      feature that were not seen during fit() get default_label_.

    Raises:
      ValueError: if the model has not been fitted or X is None.
    """
    if self.ideal_variable is None:
      raise ValueError('OneR has not been fitted yet, run the fit method first')
    if X is None:
      raise ValueError('OneR.predict needs the feature data X')

    column = pd.DataFrame(X)[self.ideal_variable]
    predictions = [self.rules_.get(value, self.default_label_) for value in column]
    return pd.Series(predictions).to_numpy()

  def __repr__(self):
    if self.ideal_variable is not None:
      txt = 'The best variable for this data is ' + str(self.ideal_variable)
    else:
      txt = 'The best variable has not been found yet, try running the fit method first'

    return txt

In [None]:
# Derive the single-feature rules and report them per variable.
# NOTE(review): the rules are fitted on the held-out split (x_test / y_test), unlike the
# other models, and `clf` no longer refers to the decision tree after this cell — confirm
# both are intended.
clf = OneR()
test_results = clf.fit(x_test, y_test)

print(test_results, clf, sep='\n')