# Libraries

In [8]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer

# Import

In [9]:
# Load the cleaned wine-review CSV into `df`. The path is relative, so the
# file is looked up in the current working directory.
df = pd.read_csv('winemag-data-130k-clean.csv')

In [10]:
# Display the column labels of the loaded dataset.
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'taster_name', 'title', 'variety', 'winery', 'continent'],
      dtype='object')

In [11]:
# Remove, in place, every row that lacks a continent, a score or a price.
df.dropna(subset=['continent','points', 'price'], inplace=True)

# Quels sont les mots clés qui caractérisent les bons vins ?

In [12]:

# Minimum score (inclusive) for a review to count as "positive".
SEUIL = 90

# Normalise the review text once on the full dataset: lower-case it and strip
# the string "wine" so it does not dominate every ranking.
# NOTE(review): this is a plain substring removal, so "wines" becomes "s" and
# "winery" becomes "ry" -- confirm whether a whole-word match was intended.
df['description'] = df['description'].str.lower()
df['description'] = df['description'].str.replace('wine', '', regex=False)
df_positive = df[df['points'] >= SEUIL]

vectorizer = CountVectorizer(stop_words='english')


def _word_frequencies(descriptions):
    """Return the total count of each word in *descriptions*, most frequent first.

    The column sums are taken directly on the sparse document-term matrix.
    Densifying it first (``.toarray()``) allocates documents x vocabulary
    integers, which can run to gigabytes on the full review set.
    """
    counts = vectorizer.fit_transform(descriptions.dropna())
    totals = np.asarray(counts.sum(axis=0)).ravel()
    frequencies = pd.Series(totals, index=vectorizer.get_feature_names_out())
    return frequencies.sort_values(ascending=False)


# Overall ranking across every positive review.
word_freq = _word_frequencies(df_positive['description'])

fig = make_subplots(rows=1, cols=1)

# Only continents that actually have positive reviews: fitting the vectorizer
# on an empty selection raises ValueError.
continents = df_positive['continent'].dropna().unique()

for position, continent in enumerate(continents):
    continent_descriptions = df_positive.loc[
        df_positive['continent'] == continent, 'description'
    ]
    top_words = _word_frequencies(continent_descriptions).head(20)

    fig.add_trace(
        go.Bar(
            x=top_words.values,
            y=top_words.index,
            orientation='h',
            name=continent,
            # Show the first continent up front so the figure is not blank;
            # it matches the dropdown entry that is active by default.
            visible=(position == 0)
        )
    )

# One dropdown entry per continent, each revealing only that continent's trace.
buttons = [
    dict(
        label=continent,
        method='update',
        args=[{'visible': [bool(continent == c) for c in continents]},
              {'title': f'Mots fréquents dans les critiques positives en {continent}'}]
    )
    for continent in continents
]

# Final entry: show every continent at once.
buttons.append(
    dict(
        label='Tous',
        method='update',
        args=[{'visible': [True] * len(continents)},
              {'title': 'Mots fréquents dans les critiques positives'}]
    )
)

# Title matching the trace that is visible before any button is clicked.
if len(continents):
    initial_title = f'Mots fréquents dans les critiques positives en {continents[0]}'
else:
    initial_title = 'Mots fréquents dans les critiques positives'

fig.update_layout(
    updatemenus=[{
        'buttons': buttons,
        'direction': 'down',
        'pad': {'r': 10, 't': 10},
        'showactive': True,
        'x': 0.1,
        'xanchor': 'left',
        'y': 1.2,
        'yanchor': 'top'
    }],
    title=initial_title,
    yaxis_title='Mots',
    xaxis_title='Fréquence'
)

fig.show()

In [None]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer
from dash import Dash, html, dcc, Input, Output

# `df` is expected to be the wine-review DataFrame already present in the
# session; this cell uses it as a global and does not load it itself.
# Create the Dash application object that the layout and callback attach to.
app = Dash(__name__)


# Rank the words used in the reviews that reach a given score.
def get_top_words(df, threshold, continent=None):
    """Return the 20 most frequent words in reviews scoring at least *threshold*.

    Args:
        df: DataFrame with "points", "continent" and "description" columns.
        threshold: Minimum value of "points" (inclusive) for a review to count.
        continent: Optional continent name. When given (and truthy), only
            reviews from that continent are considered.

    Returns:
        A pandas Series mapping word -> total occurrences, sorted in
        descending order and holding at most 20 entries. The Series is empty
        when no review matches or when the matching reviews contain nothing
        but stop words.
    """
    selected = df["points"] >= threshold
    if continent:
        selected = selected & (df["continent"] == continent)

    raw_descriptions = df.loc[selected, "description"].dropna()
    if raw_descriptions.empty:
        # Nothing to vectorize; fitting on an empty corpus would raise.
        return pd.Series(dtype="int64")

    # Work on a derived Series instead of assigning into a filtered slice,
    # which triggers pandas' SettingWithCopyWarning.
    descriptions = raw_descriptions.str.lower().str.replace("wine", "", regex=False)

    vectorizer = CountVectorizer(stop_words="english")
    try:
        word_counts = vectorizer.fit_transform(descriptions)
    except ValueError:
        # scikit-learn reports an empty vocabulary (e.g. only stop words
        # left) with ValueError; treat it as "no words".
        return pd.Series(dtype="int64")

    # Sum the sparse matrix column-wise rather than densifying it first,
    # which would allocate documents x vocabulary integers.
    totals = np.asarray(word_counts.sum(axis=0)).ravel()
    word_freq = pd.Series(totals, index=vectorizer.get_feature_names_out())
    return word_freq.sort_values(ascending=False).head(20)


# Page layout: the word-frequency chart with a score-threshold slider below it.
# The slider spans the observed range of "points"; compute its bounds once.
_score_floor = df["points"].min()
_score_ceiling = df["points"].max()

# Labelled ticks every 10 points, starting from the lowest score.
_slider_marks = {
    tick: str(tick)
    for tick in range(int(_score_floor), int(_score_ceiling) + 1, 10)
}

app.layout = html.Div(
    children=[
        dcc.Graph(id="word-freq-plot"),
        dcc.Slider(
            id="threshold-slider",
            min=_score_floor,
            max=_score_ceiling,
            step=1,
            value=90,
            marks=_slider_marks,
        ),
    ]
)


# Callback: redraw the chart whenever the threshold slider moves.
@app.callback(Output("word-freq-plot", "figure"), Input("threshold-slider", "value"))
def update_graph(threshold):
    """Build the per-continent word-frequency bar chart for *threshold*.

    Args:
        threshold: Current slider value; reviews with "points" below it are
            ignored.

    Returns:
        A plotly figure with one horizontal bar trace per continent, each
        holding that continent's top words as returned by ``get_top_words``.
        Continents with no review at or above the threshold are skipped, so
        the vectorizer is never asked to fit an empty corpus.
    """
    # Restrict to continents that still have reviews at this threshold;
    # iterating over every continent in `df` fails at high thresholds.
    qualifying = df.loc[df["points"] >= threshold, "continent"].dropna()
    continents = qualifying.unique()
    fig = make_subplots(rows=1, cols=1)

    for continent in continents:
        word_freq_continent = get_top_words(df, threshold, continent)
        fig.add_trace(
            go.Bar(
                x=word_freq_continent.values,
                y=word_freq_continent.index,
                orientation="h",
                name=continent,
            )
        )

    fig.update_layout(yaxis_title="Mots", xaxis_title="Fréquence", showlegend=True)

    return fig


# Start the Dash development server (this call blocks). `app.run` is the
# supported entry point; the older `run_server` alias is deprecated and is no
# longer available in current Dash releases.
app.run(debug=True)