# ICA_Week5B: Analyze the ramen data

In [1]:
# https://www.kaggle.com/residentmario/ramen-ratings
"""
The Ramen Rater is a product review website for the hardcore ramen enthusiast (or "ramenphile"), with over 2500 reviews to date. 
This dataset is an export of "The Big List" (of reviews), converted to a CSV format.
"""
# Import pandas package first
import pandas as pd

## Tasks

In [2]:
# 1. Read the data from the CSV file into a DataFrame
## the data file: ramen_ratings.csv
df = pd.read_csv("data/ramen_ratings.csv")

In [3]:
# 2. Display the first five rows of data
df.head()

Unnamed: 0,Brand,Variety,Style,Country,Stars
0,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75
1,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0
2,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25
3,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75
4,Ching's Secret,Singapore Curry,Pack,India,3.75


In [4]:
# 3. Display the last five rows of data
df.tail()

Unnamed: 0,Brand,Variety,Style,Country,Stars
2572,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5
2573,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1.0
2574,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2.0
2575,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2.0
2576,Westbrae,Miso Ramen,Pack,USA,0.5


In [5]:
# 4. Display statistical information for the numeric columns using the describe() method
## describe(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
df.describe()

Unnamed: 0,Stars
count,2577.0
mean,3.654676
std,1.015331
min,0.0
25%,3.25
50%,3.75
75%,4.25
max,5.0


In [6]:
# 5. Dislay the number of unique values for each column using the nunique() methond
## nunique(): https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.nunique.html
df.nunique()

Brand       355
Variety    2410
Style         7
Country      38
Stars        42
dtype: int64

In [7]:
# 6. Display only rows where the country is Vietnam using the query() method
## query(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
df.query("Country == 'Vietnam'")

Unnamed: 0,Brand,Variety,Style,Country,Stars
18,Binh Tay,Mi Hai Cua,Pack,Vietnam,4.00
52,Uni-President,Mushroom Flavor,Pack,Vietnam,0.00
143,Mum Ngon,Lau Tom Chua Cay,Pack,Vietnam,3.50
224,Vifon,Viet Cuisine Bun Rieu Cua Sour Crab Soup Insta...,Bowl,Vietnam,5.00
365,Acecook,Oh! Ricey Pork Flavour,Pack,Vietnam,4.00
...,...,...,...,...,...
2486,Binh Tay,Mi Chay Mushroom,Pack,Vietnam,2.75
2535,Ve Wong,Kung-Fu Chicken Flavor,Pack,Vietnam,2.75
2570,Ve Wong,Mushroom Pork,Pack,Vietnam,1.00
2571,Vifon,Nam Vang,Pack,Vietnam,2.50


In [8]:
# 7. Display only the Brand and Style columns
df[["Brand", "Style"]]

Unnamed: 0,Brand,Style
0,New Touch,Cup
1,Just Way,Pack
2,Nissin,Cup
3,Wei Lih,Pack
4,Ching's Secret,Pack
...,...,...
2572,Vifon,Bowl
2573,Wai Wai,Pack
2574,Wai Wai,Pack
2575,Wai Wai,Pack


In [9]:
# 8. Display only the Country column
df["Country"]

0          Japan
1         Taiwan
2            USA
3         Taiwan
4          India
          ...   
2572     Vietnam
2573    Thailand
2574    Thailand
2575    Thailand
2576         USA
Name: Country, Length: 2577, dtype: object

In [10]:
# 9. Display the data after it has been sorted by the Stars column from high values to low values
## sort_values(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
df.sort_values(by=["Stars"])

Unnamed: 0,Brand,Variety,Style,Country,Stars
555,Urban Noodle,Authentic Street Food Black Bean,Cup,UK,0.0
796,Western Family,Beef Flavour Instant Noodles,Pack,Canada,0.0
2534,Kim Ve Wong,Jaopai Series: Vegetarian Instant Noodles,Bowl,Taiwan,0.0
561,Samyang Foods,Honey & Cheese Big Bowl,Bowl,South Korea,0.0
950,Azami,Kimchee Noodle Soup,Cup,Canada,0.0
...,...,...,...,...,...
346,Nissin,Spicy Chikin Donburi,Bowl,Japan,5.0
1837,Indomie,Mi Goreng Sate (Local),Pack,Indonesia,5.0
350,Nissin,Raoh Shoyu Ramen,Bowl,Japan,5.0
800,JML,Spicy King Spicy Chicken,Pack,China,5.0


In [11]:
# 10. In the Country column, replace 'USA' with 'United States. Make sure this change is saved in the DataFrame, 
# and then display the first five rows to be sure the change was made correctly.
## replace(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html
df["Country"].replace({"USA": "United States"}, inplace=True)
df.head()

Unnamed: 0,Brand,Variety,Style,Country,Stars
0,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75
1,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0
2,Nissin,Cup Noodles Chicken Vegetable,Cup,United States,2.25
3,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75
4,Ching's Secret,Singapore Curry,Pack,India,3.75


## Questions

In [12]:
# 1. How many countries are represented in the data?
print(f"The total number of countries represented in the data is {df["Country"].nunique()}")

The total number of countries represented in the data is 37


In [13]:
# 2. Which three countries have the highest average rating? 
## Feel free to try different methods. You may consider groupby(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html
print("Top 3 best rated countires:")
df.groupby(["Country"])["Stars"].mean().sort_values(ascending=False)[:3]

Top 3 best rated countires:


Country
Brazil      4.350000
Sarawak     4.333333
Cambodia    4.200000
Name: Stars, dtype: float64

In [14]:
# 3. Which three countreis have the lowest average rating?
print("Top 3 worst rated countires:")
df.groupby(["Country"])["Stars"].mean().sort_values(ascending=True)[:3]

Top 3 worst rated countires:


Country
Nigeria        1.500000
Canada         2.243902
Netherlands    2.483333
Name: Stars, dtype: float64

In [15]:
# 4. Which three countries have the most brands, and how many brands does each of these countries have?
print("Top 3 countries with the most brands:")
df.groupby(["Country"])["Brand"].nunique().sort_values(ascending=False)[:3]

Top 3 countries with the most brands:


Country
Japan            58
United States    49
Taiwan           47
Name: Brand, dtype: int64

The end of this notebook.