In [7]:
import sqlite3
import pandas as pd 

conn = sqlite3.connect("data/vivino.db")
cursor = conn.cursor()

### 1. highlight 10 wines to increase our sales. Which ones should we choose and why?

In [8]:
query_1 = f"""
    SELECT wines.name, vintages.price_euros, vintages.ratings_average, rank
    FROM vintages
    JOIN vintage_toplists_rankings
    ON vintages.id = vintage_toplists_rankings.vintage_id
    JOIN wines
    ON vintages.wine_id = wines.id
    WHERE vintage_toplists_rankings.rank > vintage_toplists_rankings.previous_rank
    AND vintages.price_euros < (SELECT AVG(price_euros) FROM vintages) AND vintages.ratings_average > 4.0 AND rank > 10
    ORDER BY vintages.price_euros ASC, vintages.ratings_average DESC
    LIMIT 10;"""

df = pd.read_sql_query(query_1, conn)

df.index += 1

df


# cursor.execute(query_1)
# cursor.fetchall()

Unnamed: 0,name,price_euros,ratings_average,rank
1,60 Sessantanni Old Vines Primitivo di Manduria,24.75,4.4,17
2,60 Sessantanni Old Vines Primitivo di Manduria,24.75,4.4,19
3,Limited Edition 10 Vendemmie,34.2,4.5,15
4,Malleolus,35.95,4.4,18
5,Lupi Rezerva,36.55,4.6,15
6,Bramare Malbec Uco Valley,60.2,4.5,12
7,Tinto,64.95,4.4,20
8,Chardonnay,78.95,4.4,18
9,Malbec Argentino,89.95,4.5,15
10,Les Noisetiers,94.0,4.4,19


In [9]:
import seaborn as sns 
import matplotlib.pyplot as plt

query_L = f"""
    SELECT wines.name, acidity, fizziness, intensity, sweetness, tannin, price_euros, rank, vintages.ratings_average
    FROM vintages
    JOIN vintage_toplists_rankings
    ON vintage_toplists_rankings.vintage_id = vintages.id
    JOIN wines
    ON vintages.wine_id = wines.id
    WHERE vintage_toplists_rankings.rank IS NOT NULL
    ORDER BY vintage_toplists_rankings.rank ASC;"""

df = pd.read_sql_query(query_L, conn)


# df["tannin"].fillna(0, inplace=True)
# df["sweetness"].fillna(0, inplace=True)
# df.drop(columns=["fizziness", "name"], inplace=True)

# correlation_matrix = df.corr()

# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title('Correlation Heatmap of Wine Characteristics and Prices')
# plt.show()


df.index += 1
df.to_csv('analysis_rank1.csv', index=False)

# cursor.execute(query_L)
# cursor.fetchall()

In [10]:
query_1_2 = f"""
    SELECT DISTINCT keyword_type
    FROM vintage_toplists_rankings
    JOIN vintages
    ON vintage_toplists_rankings.vintage_id = vintages.id
    JOIN keywords_wine
    ON vintages.wine_id = keywords_wine.wine_id
    WHERE vintage_toplists_rankings.rank IS NOT NULL
    LIMIT 10 ;
"""

cursor.execute(query_1_2)
cursor.fetchall()


[('primary',), ('secondary',)]

In [11]:
query_2_3 = f"""
    SELECT MAX(sweetness)
    FROM wines"""

cursor.execute(query_2_3)
cursor.fetchall()

[(5.0,)]

### 2. We have a limited marketing budget for this year. Which country should we prioritise and why?

In [26]:
query_2 = f"""
    SELECT name, regions_count, users_count, wineries_count
    FROM countries
    WHERE users_count IS NOT NULL
    ORDER BY users_count DESC, wineries_count DESC
    LIMIT 3;"""

df = pd.read_sql_query(query_2, conn)

df.index += 1

df['users_count'] = df['users_count'].apply(lambda x: "{:,}".format(x))
df['wineries_count'] = df['wineries_count'].apply(lambda x: "{:,}".format(x))

df

# cursor.execute(query_2)
# cursor.fetchall()

Unnamed: 0,name,regions_count,users_count,wineries_count
1,États-Unis,362,12273684,28145
2,France,1306,5973301,67553
3,Italie,563,4270717,42399


In [25]:
query_2 = f"""
    SELECT countries.name, regions_count, users_count, wineries_count, toplists.name
    FROM countries
    JOIN toplists
    ON toplists.country_code = countries.code
    WHERE users_count IS NOT NULL
    ORDER BY users_count DESC;"""

df = pd.read_sql_query(query_2, conn)

df.index += 1

df['users_count'] = df['users_count'].apply(lambda x: "{:,}".format(x))
df['wineries_count'] = df['wineries_count'].apply(lambda x: "{:,}".format(x))

df

Unnamed: 0,name,regions_count,users_count,wineries_count,name.1
1,États-Unis,362,12273684,28145,Bestsellers in Massachusetts
2,États-Unis,362,12273684,28145,Bestsellers in Tennessee
3,États-Unis,362,12273684,28145,Bestsellers in Arizona
4,États-Unis,362,12273684,28145,Bestsellers in Nevada
5,États-Unis,362,12273684,28145,Bestsellers in New Mexico
6,États-Unis,362,12273684,28145,Bestsellers in Wyoming
7,États-Unis,362,12273684,28145,Bestsellers in Montana
8,États-Unis,362,12273684,28145,Bestsellers in North Dakota
9,États-Unis,362,12273684,28145,Bestsellers in Utah
10,Italie,563,4270717,42399,Bestsellers in Italy


### 3. We would like to give awards to the best wineries. Come up with 3 relevant ones. Which wineries should we choose and why?

In [28]:
query_3 = f"""
    SELECT name
    FROM wineries;
    """

df = pd.read_sql_query(query_3, conn)

df.index += 1

df

Unnamed: 0,name
1,Vin Santo di Montepulciano
2,St. Henri Shiraz
3,Siepi
4,Solare Toscana
5,Tenuta Tignanello 'Solaia'
...,...
1016,Elevage Branco
1017,Moss Blanc Verdicchio dei Castelli di Jesi Cla...
1018,Academia Purcari Rară Neagră
1019,Paralupi Appassimento
