In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsRestClassifier


In [5]:
pd.set_option('display.max_columns', None)

In [7]:
csv_file_path = '~/code/ArjanAngenent/VinoDine/raw_data/XWines_Full_100K_wines.csv'
df_new = pd.read_csv(csv_file_path)
df_new.head()

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName,Website,Vintages
0,100001,Espumante Moscatel,Sparkling,Varietal/100%,['Muscat/Moscato'],"['Pork', 'Rich Fish', 'Shellfish']",7.5,Medium-bodied,High,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,http://www.vinicolaperini.com.br,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201..."
1,100002,Ancellotta,Red,Varietal/100%,['Ancellotta'],"['Beef', 'Barbecue', 'Codfish', 'Pasta', 'Pizz...",12.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,http://www.vinicolaperini.com.br,"[2016, 2015, 2014, 2013, 2012, 2011, 2010, 200..."
2,100003,Cabernet Sauvignon,Red,Varietal/100%,['Cabernet Sauvignon'],"['Beef', 'Lamb', 'Poultry']",12.0,Full-bodied,High,BR,Brazil,1001,Serra Gaúcha,10002,Castellamare,https://www.emporiocastellamare.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
3,100004,Virtus Moscato,White,Varietal/100%,['Muscat/Moscato'],['Sweet Dessert'],12.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10003,Monte Paschoal,http://www.montepaschoal.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
4,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,"['Cabernet Sauvignon', 'Merlot']","['Beef', 'Lamb', 'Game Meat', 'Poultry']",11.0,Full-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10000,Aurora,http://www.vinicolaaurora.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."


In [56]:
df_new['Harmonize'] = df_new['Harmonize'].astype(str).str.replace(r'[\[\]\'\s]', '').str.split(',')
df_new['Grapes'] = df_new['Grapes'].astype(str).str.replace(r'[\[\]\'\s]', '').str.split(',')
df_new

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName,Website,Vintages
0,100001,Espumante Moscatel,Sparkling,Varietal/100%,[['Muscat/Moscato']],"[['Pork', 'Rich Fish', 'Shellfish']]",7.5,Medium-bodied,High,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,http://www.vinicolaperini.com.br,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201..."
1,100002,Ancellotta,Red,Varietal/100%,[['Ancellotta']],"[['Beef', 'Barbecue', 'Codfish', 'Pasta', ...",12.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,http://www.vinicolaperini.com.br,"[2016, 2015, 2014, 2013, 2012, 2011, 2010, 200..."
2,100003,Cabernet Sauvignon,Red,Varietal/100%,[['Cabernet Sauvignon']],"[['Beef', 'Lamb', 'Poultry']]",12.0,Full-bodied,High,BR,Brazil,1001,Serra Gaúcha,10002,Castellamare,https://www.emporiocastellamare.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
3,100004,Virtus Moscato,White,Varietal/100%,[['Muscat/Moscato']],[['Sweet Dessert']],12.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10003,Monte Paschoal,http://www.montepaschoal.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
4,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,"[['Cabernet Sauvignon', 'Merlot']]","[['Beef', 'Lamb', 'Game Meat', 'Poultry']]",11.0,Full-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10000,Aurora,http://www.vinicolaaurora.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100641,200791,Rulandské Šedé Výběr z Hroznů,White,Varietal/100%,[['Pinot Gris']],"[['Rich Fish', 'Shellfish', 'Maturated Chees...",13.0,Medium-bodied,Medium,CZ,Czech Republic,2295,Morava,67056,Rodinné Vinařství Pavel Binder,http://www.pavelbinder.cz,"[2019, 2018, 2017, 2016, 2015, 2014, 2013, 201..."
100642,200792,Top Collection Merlot Pozdní Sběr,Dessert,Varietal/100%,[['Merlot']],"[['Beef', 'Game Meat']]",13.5,Medium-bodied,High,CZ,Czech Republic,2295,Morava,66978,Vinarstvi Mutěnice,http://www.vinarstvimutenice.cz,"[2020, 2019, 2018, 2017, 2016, 2015, 2013, 2010]"
100643,200793,Falter Ego Gelber Muskateller,White,Varietal/100%,[['Gelber Muskateller']],"[['Spicy Food', 'Sweet Dessert']]",12.5,Medium-bodied,High,AT,Austria,2086,Südsteiermark,62547,Hannes Sabathi,http://www.sabathi-weine.at,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201..."
100644,200794,Reisberg Riesling,White,Varietal/100%,[['Riesling']],"[['Pork', 'Shellfish', 'Spicy Food', 'Poult...",12.5,Medium-bodied,High,AT,Austria,2093,Wien,62795,Zahel,http://www.zahel.at,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201..."


In [8]:
df_new['Harmonize'].nunique()

900

In [9]:
df_new['Grapes'].nunique()

7107

In [59]:

df_filtered_new_exploded = df_filtered_new.explode('Grapes')

In [60]:
#count 
grape_counts_new = df_filtered_new_exploded['Grapes'].value_counts()

selected_values_grapes_new = grape_counts_new[grape_counts_new >= 2000].index

# filtering the data frame
df_grape_filtered_new = df_filtered_new_exploded[df_filtered_new_exploded['Grapes'].isin(selected_values_grapes_new)]

In [61]:

cleaned_data = df_grape_filtered_new

In [62]:
cleaned_data = cleaned_data.drop(['Vintages', 'Website'], axis=1)

In [63]:
cleaned_data.to_csv('~/code/ArjanAngenent/VinoDine/raw_data/cleaned_data.csv', index=False)

In [64]:
cleaned_data

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName
2,100003,Cabernet Sauvignon,Red,Varietal/100%,['Cabernet Sauvignon'],['Beef',12.0,Full-bodied,High,BR,Brazil,1001,Serra Gaúcha,10002,Castellamare
2,100003,Cabernet Sauvignon,Red,Varietal/100%,['Cabernet Sauvignon'],'Lamb',12.0,Full-bodied,High,BR,Brazil,1001,Serra Gaúcha,10002,Castellamare
2,100003,Cabernet Sauvignon,Red,Varietal/100%,['Cabernet Sauvignon'],'Poultry'],12.0,Full-bodied,High,BR,Brazil,1001,Serra Gaúcha,10002,Castellamare
4,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,['Cabernet Sauvignon',['Beef',11.0,Full-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10000,Aurora
4,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,'Merlot'],['Beef',11.0,Full-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10000,Aurora
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100645,200795,Rotleiten,Red,Assemblage/Blend,['Cabernet Sauvignon','Spicy Food',13.5,Medium-bodied,High,AT,Austria,2093,Wien,62795,Zahel
100645,200795,Rotleiten,Red,Assemblage/Blend,['Cabernet Sauvignon','Maturated Cheese',13.5,Medium-bodied,High,AT,Austria,2093,Wien,62795,Zahel
100645,200795,Rotleiten,Red,Assemblage/Blend,['Cabernet Sauvignon','Hard Cheese',13.5,Medium-bodied,High,AT,Austria,2093,Wien,62795,Zahel
100645,200795,Rotleiten,Red,Assemblage/Blend,['Cabernet Sauvignon','Poultry',13.5,Medium-bodied,High,AT,Austria,2093,Wien,62795,Zahel


In [65]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df_new['Harmonize'])


X = df_new.drop('Harmonize', axis=1)  
y = binary_labels 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_train, y_train)
predicted_labels = dummy_clf.predict(X_test)

report = classification_report(y_test, predicted_labels)

print(report)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00        11
           2       0.07      0.07      0.07      1526
           3       0.02      0.02      0.02       290
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00        22
           8       0.00      0.00      0.00         8
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         1
          12       0.01      0.00      0.00       247
          13       0.00      0.00      0.00         5
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         4
          16       0.00      0.00      0.00        22
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
wines = pd.read_csv('~/code/ArjanAngenent/VinoDine/raw_data/XWines_Full_100K_wines.csv')
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(sparse_output=True)
df = wines.join(pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform([eval(element) for element in wines.Harmonize]),
                index=wines.index,
                columns=mlb.classes_))
X = wines.copy()
y = df.iloc[:, 17:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
# Assuming df_new is your original DataFrame containing the data
# Assuming 'Harmonize' is the target variable

# Convert the list of harmonization labels into binary array representation
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df_new['Harmonize'])

# Choose a single label from the multi-label data
# For example, you can select the first label for each sample
single_label = binary_labels[:, 0]  # Choose the first label

# Let's first split the data into features (X) and target variable (y)
X = df_new.drop('Harmonize', axis=1)  # Features
y = single_label  # Target variable (single label)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [73]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Define which columns need to be encoded
categorical_cols = ['Type', 'Body', 'Acidity', 'Code']
numeric_cols = ['ABV']
cat_pre = Pipeline([
    ('Encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('Scaler', MinMaxScaler())])
cat_num = Pipeline([
    ('Scaler', MinMaxScaler())])
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pre, categorical_cols),
        ('num', cat_num, numeric_cols)  # 'passt hrough' passes through the numeric columns without transformation
    ])


In [74]:
X_train.shape, y_train.shape

((80516, 16), (80516,))

In [70]:
from sklearn.linear_model import LogisticRegression
# Create a binary classifier (Logistic Regression in this case)
binary_classifier = LogisticRegression(max_iter=500)
# Create a OneVsRestClassifier with the binary classifier
ova_classifier = OneVsRestClassifier(binary_classifier)
# Create pipeline including preprocessor and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', ova_classifier)])
# Example usage: fitting the pipeline
pipeline.fit(X_train, y_train)  # df is your DataFrame, y is your target variable

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['list']

In [71]:
from sklearn.linear_model import LogisticRegression
# Create a binary classifier (Logistic Regression in this case)
binary_classifier = LogisticRegression(max_iter=500)
# Create a OneVsRestClassifier with the binary classifier
ova_classifier = OneVsRestClassifier(binary_classifier)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', ova_classifier)])
import seaborn as sns
sns.heatmap(y_train.corr())
# Predict on the test set
y_pred = ova_classifier.predict(preprocessor.transform(X_test))
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

AttributeError: 'numpy.ndarray' object has no attribute 'corr'

In [72]:
# Count the occurrences of each grape
grape_counts = df_new_exploded['Grapes'].value_counts()

# Filter out grapes mentioned less than 2000 times
selected_grapes = grape_counts[grape_counts >= 2000].index

# Filter the DataFrame to include only rows with selected grapes
df_filtered_grapes = df_new_exploded[df_new_exploded['Grapes'].isin(selected_grapes)]

# Now, df_filtered_grapes contains only rows with grapes mentioned at least 2000 times


NameError: name 'df_new_exploded' is not defined