## Tasks

## Load the wine Dataset

Load the wine reviews into a DataFrame, then calculate the `median` of its `points` column.

In [9]:
import pandas as pd

# Read the wine-review CSV into a DataFrame.
# NOTE(review): if the file's first column is a saved row index, passing
# index_col=0 would avoid carrying it as a redundant column — confirm
# against the CSV before changing.
wine_df = pd.read_csv(
    filepath_or_buffer='../data/wine-reviews/winemag-data-130k-v2.csv',
)

# Median review score taken over the whole 'points' column
median_points = wine_df.loc[:, 'points'].median()

# Show the median
print(median_points)

88.0


In [10]:
# Distinct values of the 'country' column, in order of first appearance.
# A missing country is reported as its own (nan) entry.
countries = wine_df.loc[:, 'country'].unique()

# Show the array of distinct countries
print(countries)

['Italy' 'Portugal' 'US' 'Spain' 'France' 'Germany' 'Argentina' 'Chile'
 'Australia' 'Austria' 'South Africa' 'New Zealand' 'Israel' 'Hungary'
 'Greece' 'Romania' 'Mexico' 'Canada' nan 'Turkey' 'Czech Republic'
 'Slovenia' 'Luxembourg' 'Croatia' 'Georgia' 'Uruguay' 'England' 'Lebanon'
 'Serbia' 'Brazil' 'Moldova' 'Morocco' 'Peru' 'India' 'Bulgaria' 'Cyprus'
 'Armenia' 'Switzerland' 'Bosnia and Herzegovina' 'Ukraine' 'Slovakia'
 'Macedonia' 'China' 'Egypt']


In [11]:
# Number of reviews per country, most-reviewed country first.
# The arguments below are the pandas defaults, spelled out so the
# behaviour is visible: rows with a missing country are not counted.
reviews_per_country = wine_df.loc[:, 'country'].value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)

# Show the tally
print(reviews_per_country)

country
US                        54504
France                    22093
Italy                     19540
Spain                      6645
Portugal                   5691
Chile                      4472
Argentina                  3800
Austria                    3345
Australia                  2329
Germany                    2165
New Zealand                1419
South Africa               1401
Israel                      505
Greece                      466
Canada                      257
Hungary                     146
Bulgaria                    141
Romania                     120
Uruguay                     109
Turkey                       90
Slovenia                     87
Georgia                      86
England                      74
Croatia                      73
Mexico                       70
Moldova                      59
Brazil                       52
Lebanon                      35
Morocco                      28
Peru                         16
Ukraine                      14


In [12]:
# Average of the 'price' column
mean_price = wine_df.loc[:, 'price'].mean()

# Re-centre prices around zero: each price minus the average price.
# Rows whose price is missing stay missing (NaN) in the result.
centered_price = wine_df.loc[:, 'price'].sub(mean_price)

# Show the centred prices (optional)
print(centered_price)

0               NaN
1        -20.363389
2        -21.363389
3        -22.363389
4         29.636611
            ...    
129966    -7.363389
129967    39.636611
129968    -5.363389
129969    -3.363389
129970   -14.363389
Name: price, Length: 129971, dtype: float64


In [13]:
# Points earned per unit of price, stored as a new column on wine_df
wine_df['point_per_price'] = wine_df['points'].div(wine_df['price'])

# Row label of the best ratio; idxmax reports the first row holding the
# maximum, so a tie resolves to the earliest such row
bargain_index = wine_df['point_per_price'].idxmax()

# Look up that row's title
bargain_wine = wine_df.at[bargain_index, 'title']

# Show the best-value wine
print(bargain_wine)

Bandit NV Merlot (California)


In [19]:
# Count how many descriptions mention 'fruity' / 'tropical', two ways.
# Each description is counted at most once per term (substring match),
# not once per occurrence of the word.
descriptions = wine_df['description']

# Approach 1: pandas string matching, ignoring case; missing descriptions
# count as "no match"
fruity_count = descriptions.str.contains('fruity', case=False, na=False).sum()
tropical_count = descriptions.str.contains('tropical', case=False, na=False).sum()

# Approach 2: plain Python substring test on the stringified description.
# This one is case-sensitive, so it can report fewer matches than approach 1.
description_text = descriptions.map(str)
fruity_count2 = description_text.map(lambda text: 'fruity' in text).sum()
tropical_count2 = description_text.map(lambda text: 'tropical' in text).sum()

# Collect each approach's totals in a labelled Series
descriptor_counts = pd.Series({'fruity': fruity_count, 'tropical': tropical_count})
descriptor_counts2 = pd.Series({'fruity': fruity_count2, 'tropical': tropical_count2})

# Show both results: case-insensitive first, then case-sensitive
print(descriptor_counts)
print(descriptor_counts2)


fruity      9455
tropical    3800
dtype: int64
fruity      9090
tropical    3607
dtype: int64


In [21]:
# Create a Series called: star_ratings
# With the number of stars corresponding to each review in the Dataset.
# Rating system ranges from 80-100 points.
# A score of 95 or higher = 3 stars.
# A score of at least 85 but less than 95 = 2 stars.
# Any other score is 1 star.

def assign_stars(points):
    """Return the star rating for a single review score.

    Args:
        points: Review score of one wine.

    Returns:
        3 if ``points`` is 95 or higher, 2 if it is at least 85 but below 95,
        and 1 for anything else (including values that fail both comparisons,
        such as NaN).
    """
    if points >= 95:
        return 3
    elif points >= 85:
        return 2
    else:
        return 1


# Approach 1: vectorised binning with pd.cut.
# The right-closed bins [0, 84], (84, 94], (94, 100] match the thresholds
# above only for whole-number scores; a fractional score such as 84.5 would
# land in the 2-star bin here but get 1 star from assign_stars.
star_ratings = pd.cut(wine_df['points'],
                      bins=[0, 84, 94, 100],
                      labels=[1, 2, 3],
                      right=True,
                      include_lowest=True)

star_counts = star_ratings.value_counts()

# Approach 2: apply the scalar rule to every score
# (the function is passed directly; wrapping it in a lambda adds nothing)
star_ratings2 = wine_df['points'].apply(assign_stars)

# Tally the second approach from its own Series. The previous version
# counted star_ratings again, so the two printed tables could never differ.
star_counts2 = star_ratings2.value_counts()

# Output the counts from the pd.cut approach
print(star_counts)

# Output the counts from the apply approach for comparison
print(star_counts2)

points
2    115125
1     12430
3      2416
Name: count, dtype: int64
points
2    115125
1     12430
3      2416
Name: count, dtype: int64
