In [None]:
# ------------------------------------------ Imports ------------------------------------------
import pandas as pd
import numpy as np

In [None]:

# ------------------------------------------ Load Data ------------------------------------------
# Read the raw CSV (path is relative to the notebook's working directory).
full_pumpkins = pd.read_csv(filepath_or_buffer='US-pumpkins.csv')

# Preview the first five rows as the cell output.
full_pumpkins.head(n=5)

In [None]:
# ------------------------------------------ Clean Data ------------------------------------------
# Columns kept for the rest of the notebook; every other column of the raw frame is discarded.
columns_to_select = ['City Name','Package','Variety', 'Origin','Item Size', 'Color']

# Select the columns and drop rows with any missing value in one chained expression.
# This builds `pumpkins` as a fresh frame instead of mutating a `.loc` selection in place,
# so re-running the cell always starts again from `full_pumpkins`.
pumpkins = full_pumpkins.loc[:, columns_to_select].dropna()

# `info` is a method: it must be called to print the column/dtype/non-null summary.
# A bare `pumpkins.info` only displays the bound-method repr.
pumpkins.info()

In [None]:
# ----------------------------------------- Visualize -----------------------------------------
import seaborn as sns

# Drawing colour for each raw `Color` label.
palette = dict(ORANGE='orange', WHITE='wheat')

# Count plot with one bar group per variety (on the y axis), split by colour.
sns.catplot(
    data=pumpkins,
    y="Variety",
    hue="Color",
    kind="count",
    palette=palette,
)

In [None]:
# ----------------------------------------- Feature Encoding -----------------------------------------
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Ordinal: 'Item Size' is encoded with the explicit category order given below
# (codes are assigned by position in this list).
ordinal_features = ['Item Size']
item_size_categories = [['sml', 'med', 'med-lge', 'lge', 'xlge', 'jbo', 'exjbo']]
ordinal_encoder = OrdinalEncoder(categories=item_size_categories)

# One hot: the remaining descriptive columns, returned as a dense array.
categorical_features = ['City Name', 'Package', 'Variety', 'Origin']
categorical_encoder = OneHotEncoder(sparse_output=False)

# Route each column group to its encoder; 'Color' is not listed in either group.
transformers = [
    ('ord', ordinal_encoder, ordinal_features),
    ('cat', categorical_encoder, categorical_features),
]
ct = ColumnTransformer(transformers=transformers)

# Ask for a DataFrame result; later cells select the 'ord__Item Size' column by name.
ct.set_output(transform='pandas')
encoded_features = ct.fit_transform(pumpkins)

In [None]:
# ----------------------------------------- Label Encoding -----------------------------------------
from sklearn.preprocessing import LabelEncoder

# The target column that the model will learn to predict.
color_column = pumpkins['Color']

# Fit the encoder on the target and convert it to encoded labels in one step.
label_encoder = LabelEncoder()
encoded_label = label_encoder.fit_transform(color_column)

In [None]:
# Combine the encoded features and the encoded target into a single frame,
# with the target stored under the column name 'Color'.
encoded_pumpkins = encoded_features.assign(**{'Color': encoded_label})

# Text dump of the combined frame.
print(encoded_pumpkins)

In [None]:
# ----------------------------- Analyze Relationship Between Variables -----------------------------
# Drawing colour for each raw `Color` label.
palette = {
    'ORANGE': 'orange',
    'WHITE': 'wheat',
}

# Plot from a derived frame rather than overwriting `pumpkins['Item Size']`.
# The feature-encoding cell fits its OrdinalEncoder on the original string sizes in
# `pumpkins`, so replacing that column with numeric codes would break that cell on re-run.
# The assignment is index-aligned, exactly like the column assignment it replaces.
pumpkins_plot = pumpkins.assign(**{'Item Size': encoded_pumpkins['ord__Item Size']})

# One horizontal box plot per variety (rows): encoded item size on x, colour on y.
g = sns.catplot(
    data=pumpkins_plot,
    x="Item Size", y="Color", row='Variety',
    kind="box", orient="h",
    sharex=False, margin_titles=True,
    height=1.8, aspect=4, palette=palette,
)
# Seven size categories are encoded by list position, so the x axis is fixed to 0-6.
g.set(xlabel="Item Size", ylabel="").set(xlim=(0,6))
# Title each row with just the variety name.
g.set_titles(row_template="{row_name}")

In [None]:
# ---------------------------- Swarm Plot ----------------------------
# Drawing colour per encoded `Color` value.
# NOTE(review): the keys are strings, while `Color` in `encoded_pumpkins` comes from
# LabelEncoder; whether seaborn matches them depends on the seaborn version -- confirm.
palette = {'0': 'orange', '1': 'wheat'}

# Encoded item size (y) for each encoded colour class (x).
sns.swarmplot(
    data=encoded_pumpkins,
    x="Color",
    y="ord__Item Size",
    palette=palette,
)

In [None]:
# ---------------------------------------------- Build Model ----------------------------------------------
from sklearn.model_selection import train_test_split

# Every column except the target is a feature.
feature_columns = encoded_pumpkins.columns.difference(['Color'])
X = encoded_pumpkins[feature_columns]
y = encoded_pumpkins['Color']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# ---------------------------------------------- Train Model ----------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

# Logistic regression with the library defaults, fitted on the training split.
model = LogisticRegression()
# `fit` is left as the last expression so the fitted estimator is shown as the cell output.
model.fit(X=X_train, y=y_train)

In [None]:
# ---------------------------------------------- Predict ----------------------------------------------
# Predicted class labels for the held-out rows.
predictions = model.predict(X=X_test)

In [None]:
# ---------------------------------------------- Evaluate ----------------------------------------------
# Compute both summaries first, then print them in the original order.
report = classification_report(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(report)
print('Predicted labels: ', predictions)
print('F1-score: ', f1)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels on the test split, shown as the cell output.
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# ------------------------------------------------ ROC CURVE ------------------------------------------------

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

y_scores = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])

fig = plt.figure(figsize=(6, 6))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

In [None]:
# ---- Area Under Curve ----
# ROC AUC computed from the same column-1 probabilities used for the ROC curve.
positive_class_scores = y_scores[:, 1]
auc = roc_auc_score(y_test, positive_class_scores)
print(auc)