## Quelle est l'influence du nombre de variétés sur la qualité des vins produits par un établissement vinicole donné ?

In [22]:
import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots
import plotly.express as px

In [23]:
# Load the wine review dataset from the CSV file next to the notebook
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer='winemag-data-130k-clean.csv')

df.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,continent
0,Italy,aromas include tropical fruit broom brimstone ...,Vulkà Bianco,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,Europe
1,Portugal,ripe fruity wine that smooth while still struc...,Avidagos,87,15.0,Douro,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,Europe
2,United States of America,tart snappy flavors lime flesh rind dominate. ...,,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,North America
3,United States of America,pineapple rind lemon pith orange blossom start...,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,North America
4,United States of America,regular bottling 2012 comes across rather roug...,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,North America


In [24]:
# One row per (country, province, winery, continent):
#   points  -> median of the winery's review points
#   variety -> number of distinct varieties reviewed for that winery
# country, province and continent are part of the grouping key so that they
# remain available as columns in the result.
df_winery = df.groupby(
    ['country', 'province', 'winery', "continent"],
    as_index=False,
).agg(
    points=('points', 'median'),
    variety=('variety', 'nunique'),
)

df_winery.head()

Unnamed: 0,country,province,winery,continent,points,variety
0,Argentina,Mendoza Province,2 Copas,South America,81.0,1
1,Argentina,Mendoza Province,Achaval-Ferrer,South America,91.0,5
2,Argentina,Mendoza Province,Aconcagua,South America,82.0,2
3,Argentina,Mendoza Province,Aconga,South America,83.0,4
4,Argentina,Mendoza Province,Acordeón,South America,84.0,3


In [25]:
# Scatter of each winery's median points (y) against its number of distinct
# varieties (x), colored by continent; the hover box also shows the country,
# province and winery name.
fig = px.scatter(
    data_frame=df_winery,
    x="variety",
    y="points",
    color="continent",
    hover_data=['country', 'province', 'winery'],
)
fig.show()

In [26]:
# One row per (variety name, continent):
#   points -> median review points for that variety on that continent
#   winery -> number of distinct wineries with a review for it
df_variety = df.groupby(['variety', 'continent'], as_index=False).agg(
    points=('points', 'median'),
    winery=('winery', 'nunique'),
)

df_variety

Unnamed: 0,variety,continent,points,winery
0,Abouriou,Europe,89.0,2
1,Abouriou,North America,85.0,1
2,Agiorgitiko,Europe,87.0,29
3,Aglianico,Europe,89.0,94
4,Aglianico,North America,89.0,12
...,...,...,...,...
1116,Zlahtina,Europe,86.0,2
1117,Zweigelt,Europe,89.0,79
1118,Zweigelt,North America,88.0,3
1119,Çalkarası,Europe,86.5,1


In [27]:
# Median points per variety name, one marker per (variety, continent) pair,
# colored by continent.
fig = px.scatter(
    data_frame=df_variety,
    x="variety",
    y="points",
    color="continent",
    hover_data=['variety', 'continent', 'winery'],
)

# Variety names are long categorical labels: tilt them so they stay legible.
fig.update_xaxes(tickangle=-45)

fig.show()

In [55]:
# In df_winery, 'variety' holds the number of distinct varieties per winery.
# Grouping on it therefore buckets wineries by how many varieties they make;
# each bucket gets the mean of the wineries' median points and the number of
# distinct winery names it contains.

# Buckets split by continent.
df_winery_grouped_continent = df_winery.groupby(['variety', 'continent'], as_index=False).agg(
    points=('points', 'mean'),
    winery=('winery', 'nunique'),
)

# Buckets split by country.
df_winery_grouped_country = df_winery.groupby(['variety', 'country'], as_index=False).agg(
    points=('points', 'mean'),
    winery=('winery', 'nunique'),
)

# Buckets over all wineries together.
df_winery_grouped = df_winery.groupby('variety', as_index=False).agg(
    points=('points', 'mean'),
    winery=('winery', 'nunique'),
)

In [58]:
# Give the aggregated columns names that say what they hold: after grouping,
# 'variety' is a count of varieties and 'winery' is a count of wineries.
# A single mapping is shared by the three frames so they cannot drift apart,
# and each frame is rebound to the renamed copy instead of being mutated with
# inplace=True (re-running the cell is harmless: labels that are already
# renamed are simply not found and left untouched).
grouped_column_names = {'variety': 'number of varieties', 'winery': 'number of wineries'}

df_winery_grouped_continent = df_winery_grouped_continent.rename(columns=grouped_column_names)
df_winery_grouped_country = df_winery_grouped_country.rename(columns=grouped_column_names)
df_winery_grouped = df_winery_grouped.rename(columns=grouped_column_names)

In [64]:
# Fixed marker color for every continent.
color_map = {
    'Africa': 'red',
    'Asia': 'blue',
    'Europe': 'green',
    'North America': 'orange',
    'Oceania': 'purple',
    'South America': 'brown',
}

# Add a 'color' column holding each row's continent color.
# A continent missing from color_map raises KeyError.
df_winery_grouped_continent['color'] = [
    color_map[continent_name]
    for continent_name in df_winery_grouped_continent['continent']
]

In [68]:
# Points vs. number of varieties, with two buttons that switch between the
# pooled view and the per-continent view. Both views live in the same figure
# as separate traces; a button only toggles which trace is visible.
fig = go.Figure()

# Trace 0: all wineries pooled (visible by default).
fig.add_trace(go.Scatter(
    x=df_winery_grouped['number of varieties'],
    y=df_winery_grouped['points'],
    mode='markers',
    name='All',
))

# Trace 1: split by continent, hidden until its button is pressed.
# hoverinfo='text' makes the hover box show only the continent name.
fig.add_trace(go.Scatter(
    visible=False,
    x=df_winery_grouped_continent['number of varieties'],
    y=df_winery_grouped_continent['points'],
    marker=dict(color=df_winery_grouped_continent['color']),
    hoverinfo='text',
    hovertext=df_winery_grouped_continent['continent'],
    mode='markers',
    name='Continent',
))

fig.update_layout(
    title=dict(
        text="Points vs nombre de variétés",  # initial title (pooled view)
        font=dict(
            size=20,  # title font size
            color="Black",  # title color
        ),
    ),
    margin=dict(l=0, r=0, t=50, b=0),  # tight margins; t=50 leaves room for title and buttons
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            showactive=True,
            x=0.4,  # horizontal position of the button row
            y=1.1,  # vertical position of the button row
            xanchor='left',
            yanchor='top',
            # Each button applies [trace update, layout update].
            # The "visible" list follows the order in which the traces were added.
            # The layout update targets "title.text" rather than "title": assigning
            # a bare string to "title" replaces the whole title object (dropping the
            # font settings above) and string titles are not accepted by recent
            # plotly.js releases.
            buttons=[
                dict(label="non groupé",
                     method="update",
                     args=[{"visible": [True, False]},
                           {"title.text": "Points vs nombre de variétés"}]),
                dict(label="continent",
                     method="update",
                     args=[{"visible": [False, True]},
                           {"title.text": "Points vs nombre de variétés par continent"}]),
            ],
        )
    ],
)

# Show the plot
fig.show()

Un vigneron qui produit beaucoup de variétés de vin est-il meilleur qu'un autre qui n'en produit que quelques-unes ?
A priori, on pourrait penser que oui, car la diversité des produits proposés semble être un gage de qualité.
Mais est-ce vraiment le cas ?
D'après les données dont nous disposons, on n'observe pas de corrélation entre le nombre de variétés produites et la qualité des vins produits.

In [31]:
# Per-continent summary built from the winery-level table:
#   points  -> median of the wineries' median points
#   variety -> sum of the per-winery variety counts
# NOTE(review): the sum adds up per-winery counts, so a variety made by several
# wineries is counted once per winery; confirm this is the intended measure.
df_winery_desc = df_winery.groupby('continent', as_index=False).agg(
    points=('points', 'median'),
    variety=('variety', 'sum'),
)

# Single plot with two y-axes: bars on the primary axis, line on the secondary.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Bars: variety total per continent (primary y-axis).
fig.add_trace(
    go.Bar(
        name="variety",
        x=df_winery_desc['continent'],
        y=df_winery_desc['variety'],
    ),
    secondary_y=False,
)

# Line: median points per continent (secondary y-axis).
fig.add_trace(
    go.Scatter(
        name="points",
        x=df_winery_desc['continent'],
        y=df_winery_desc['points'],
        yaxis="y2",
    ),
    secondary_y=True,
)

# Figure title and x-axis title.
fig.update_layout(
    title_text="Variety and Points by Continent",
    xaxis_title_text="Continent",
)

# One title per y-axis.
fig.update_yaxes(title_text="Variety", secondary_y=False)
fig.update_yaxes(title_text="Points", secondary_y=True)

# Show the interactive plot
fig.show()

In [32]:
# Winery-level scatter of median points against number of varieties:
# one facet column per continent, one color per country, with an OLS trendline.
fig = px.scatter(
    data_frame=df_winery,
    x='variety',
    y='points',
    color='country',
    facet_col='continent',
    trendline='ols',
    labels={'variety': 'Number of Variety', 'points': 'Points'},
    title='Points vs. Number of Variety by Country',
)

# Tilt the tick labels on every facet's x-axis.
fig.update_xaxes(tickangle=45)

# Show the plot
fig.show()