In [1]:
#===================================================================#
# Created by Filip Bunta for Lundegaard data science interview task #
#===================================================================#
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from get_data import convert_to_dataframe
from utilities import count_unique_words
from transformers import CategorizeTransformer, VectorizeTransformer
# Load data
filename = os.path.join("datasets","finefoods","finefoods.txt")
df = convert_to_dataframe(filename)
df = df.astype({"Score": float})

# Split into train/test data    
df_train_set_copy, df_test_set_copy = train_test_split(df, test_size=0.2, random_state=42)
df_train_set = df_train_set_copy.reset_index()
df_test_set = df_test_set_copy.reset_index()

# Initialize transformers
cat_transf = CategorizeTransformer()
vect_transf = VectorizeTransformer()

# TRAIN SECTION - TRAIN MODEL
y_train = cat_transf.fit_transform(df_train_set)
x_train = vect_transf.fit_transform(df_train_set)

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(x_train,y_train)

SGDClassifier(random_state=42)

In [2]:
# TRAIN SECTION - GET CROSS-VALIDATION RESULTS
y_train_pred = cross_val_predict(sgd_clf, x_train, y_train, cv=3)
print("TRAIN SECTION:\nCross-validation score results for train set:")
cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring="accuracy")

TRAIN SECTION:
Cross-validation score results for train set:


array([0.8712678 , 0.87183513, 0.87097668])

In [3]:
# TRAIN SECTION - GET CONFUSION MATRIX RESULTS
precision_score_train = precision_score(y_train, y_train_pred)
recall_score_train = recall_score(y_train, y_train_pred)
print(f"TRAIN SECTION:\nPrecision score: {precision_score_train} vs Recall score: {recall_score_train}")
print("\nConfusion matrix results for train set:")
confusion_matrix(y_train, y_train_pred)

TRAIN SECTION:
Precision score: 0.8751409954677192 vs Recall score: 0.9910431343118634

Confusion matrix results for train set:


array([[ 10655,  55015],
       [  3485, 385602]], dtype=int64)

In [4]:
# TEST SECTION - GET CROSS-VALIDATION RESULTS
y_test = cat_transf.fit_transform(df_test_set)
x_test = vect_transf.fit_transform(df_test_set)
y_test_pred = cross_val_predict(sgd_clf, x_test, y_test, cv=3)
print("TEST SECTION:\nCross-validation score results for test set:")
cross_val_score(sgd_clf, x_test, y_test, cv=3, scoring="accuracy")

TEST SECTION:
Cross-validation score results for test set:


array([0.8712563 , 0.8718896 , 0.87288896])

In [5]:
# TEST SECTION - GET CONFUSION MATRIX RESULTS
precision_score_test = precision_score(y_test, y_test_pred)
recall_score_test = recall_score(y_test, y_test_pred)
print(f"TEST SECTION:\nPrecision score: {precision_score_test} vs Recall score: {recall_score_test}")
print("\nConfusion matrix results for test set:")
confusion_matrix(y_test, y_test_pred)

TEST SECTION:
Precision score: 0.8754388352156756 vs Recall score: 0.9915745345444084

Confusion matrix results for test set:


array([[ 2635, 13731],
       [  820, 96504]], dtype=int64)