Далее идут строки импортов и настройки pandas

In [2]:
from pandas import DataFrame, set_option

from utils.load import load_middleware
from utils.my_argparse import setup_basic_config
from utils.write import write_middleware

# Lift every pandas display limit (None = unlimited) so that printed tables
# are shown with all rows, all columns and untruncated cell contents.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    set_option(_display_option, None)

Далее функция, которая выполняет задания подпункта 2

Вызовите метод info(), чтобы увидеть число пропущенных значений
Очистите датасет от всех заведений, у которых средний чек неизвестен или превышает 2500
Далее заполните оставшиеся пропуски средними значениями с помощью fillna()

In [3]:
def point_two(dataset: DataFrame) -> DataFrame:
    """
    Clean the dataset and fill in the remaining gaps.

    Rows whose ``average_bill`` is missing or greater than 2500 are dropped.
    After that every remaining missing value in a numeric column is replaced
    with the mean of that column, computed over the rows that were kept.

    :param dataset: DataFrame obj of table, must contain an ``average_bill`` column
    :return: new DataFrame with the filtered rows and the numeric gaps filled
    """

    # To see the number of missing values per column, call
    # dataset.info(show_counts=True) here.

    bill = dataset['average_bill']

    # Comparing with None does not detect NaN in pandas, so the missing-value
    # test is spelled out with notna() instead of relying on the range check.
    known_and_in_range = bill.notna() & (bill <= 2500)
    dataset_with_filter = dataset[known_and_in_range]

    # fill in the remaining gaps with average values
    return dataset_with_filter.fillna(dataset_with_filter.mean(numeric_only=True))

Далее функция, которая выполняет задания подпункта 3

Каких заведений больше - из Москвы или Санкт-Петербурга? Какие типы заведений есть в датасете?
Сколько в датасете суммарно ресторанов и пабов? Какой у них средний чек?
Какая доля заведений “Кофе с собой” действительно имеет фичу кофе на вынос?
Найдите разницу между средним чеком всех кафе в Москве и Санкт-Петербурге.

In [4]:
def point_three(dataset: DataFrame) -> None:
    """
    Print the answers to the questions of sub-item 3.

    Printed, in order: the city with the most establishments, the list of
    establishment types, the average bill of restaurants and bars/pubs, the
    share of "Кофе с собой" establishments that have the ``coffee_to_go``
    feature, and the difference between the average bills in msk and spb.

    :param dataset: DataFrame obj of table with the ``city``, ``rubric``,
        ``average_bill`` and ``coffee_to_go`` columns
    :return: None, everything is written to stdout
    :raises KeyError: if the table has no rows for ``msk`` or for ``spb``
    """

    # Showing the city that has the most establishments
    cities = dataset["city"].value_counts()
    print(f'\nMaximum cafes in Name: "{cities.idxmax()}" Count: "{cities.max()}"')

    # Showing types of establishments, longest names first.
    # Ties are broken alphabetically so the listing is identical on every run,
    # and missing rubrics are skipped because len() cannot handle them.
    coffee_shop_types = sorted(
        dataset["rubric"].dropna().unique(),
        key=lambda name: (-len(name), name),
    )

    print('\nTypes of cafes in msk and spb:\n')
    # three names per printed line
    for i in range(0, len(coffee_shop_types), 3):
        print("\t\t".join(coffee_shop_types[i:i + 3]))

    keys = {'Ресторан', 'Бар, паб'}

    # getting average bill of restaurants and bars/pubs
    restaurants_and_pubs = dataset[dataset["rubric"].isin(keys)]
    print(f'\nAverage Bill of Cafes:\n{restaurants_and_pubs["average_bill"].mean()}')

    coffee_to_go = dataset[dataset["rubric"] == "Кофе с собой"]
    coffee_to_go_percent = round(coffee_to_go['coffee_to_go'].mean(), 2)

    # Share of "Кофе с собой" establishments that really offer takeaway coffee.
    # The percent value is rounded again after scaling because e.g. 0.57 * 100
    # is 56.99999999999999 in binary floating point.
    print(f'\nThe share of "Coffee with you” establishments has a takeaway coffee feature: '
          f'{coffee_to_go_percent} (in percents is {round(coffee_to_go_percent * 100, 2)}%)')

    # mean() ignores missing bills instead of counting them as zero
    city_average_bill = dataset.groupby('city')['average_bill'].mean()
    difference = city_average_bill['msk'] - city_average_bill['spb']

    print(f'\nAverage Bill Difference between msk and spb: {round(difference, 2)}')

Далее функция, которая выполняет задания подпункта 4

Вызовите метод describe() и напишите 2-3 своих наблюдения. Например, какая фича самая редкая?

In [5]:
def point_four(dataset: DataFrame) -> None:
    """
    Print the observations of sub-item 4.

    Every feature column is summed, then the feature with the smallest total
    and the feature with the largest total are reported. The first line of
    the report is a fixed observation about the rating column.

    :param dataset: DataFrame obj of table that contains all feature columns
    :return: None, the report is written to stdout
    """
    features_names = (
        'coffee_to_go', 'food_delivery', 'breakfast', 'car_park', 'wi_fi',
        'karaoke', 'payment_by_credit_card', 'summer_terrace',
        'gift_certificate', 'business_lunch', 'view_on_ostankino_tower',
        'free_delivery', 'music', 'pets', 'air_conditioning',
    )

    # one total per feature column
    totals = dataset.agg(dict.fromkeys(features_names, 'sum'))

    def occurrences(pair):
        """Return the total from a (feature name, total) pair."""
        return pair[1]

    rarest = min(totals.items(), key=occurrences)
    most_common = max(totals.items(), key=occurrences)

    template = 'Name: "{}" Occurs: "{}"'
    report_lines = (
        '',
        'There is an institution whose rating is 0',
        f'Feature occurs min times is {template.format(*rarest)}',
        f'Feature occurs max times is {template.format(*most_common)}',
        '',
    )
    print('\n'.join(report_lines))

    # dataset.describe(include="all") can be printed here to see the
    # complete statistics of the table.

Далее функция, которая выполняет задания подпункта 5

Как изменился средний чек ресторанов?
Что еще можно сказать об изменениях в других столбцах? Напишите 1-2 наблюдения.

In [6]:
def point_five(dataset: DataFrame):
    """
    Save the "spb" rows as a separate table and print observations about them.

    :param dataset: DataFrame obj of table
    :return:
    """
    # keep only the rows whose city is "spb"
    spb_only = dataset.loc[dataset['city'] == 'spb']

    # creating new table
    write_middleware('spb_datasets_eat_places.csv', spb_only)

    spb_average_bill = round(spb_only["average_bill"].mean(), 2)
    print(f'\nAverage Bill: {spb_average_bill} (it has shrunk)')
    print('\nThere is not an institution whose have feature "view_on_ostankino_tower"')

    # spb_only.describe(include="all") can be printed here to see the
    # complete statistics of the filtered table.


Далее идет главная функция, которая запускает все

In [7]:
def main():
    """
    Run the whole exercise: load the table, clean it and print every report.
    """
    # Getting arguments from command line
    cli_args = setup_basic_config()

    # Getting values from table
    raw_table: DataFrame = load_middleware(cli_args.input_from)

    # Point Two: every later point works with the cleaned table
    cleaned_table = point_two(raw_table)

    # Points Three, Four and Five, in that order
    for run_point in (point_three, point_four, point_five):
        run_point(cleaned_table)


# Start the pipeline only when this file is executed as a script
if __name__ == '__main__':
    main()

Loading E:\PythonProjects\projects\mlYandex\source\csv\datasets_eat_places.csv
End of data loading

Maximum cafes in Name: "msk" Count: "11075"

Types of cafes in msk and spb:

Бар безалкогольных напитков		Булочная, пекарня		Быстрое питание
Кофе с собой		Кондитерская		Кальян-бар
Бар, паб		Спортбар		Столовая
Суши-бар		Пиццерия		Ресторан
Кофейня		Кафе

Average Bill of Cafes:
1099.0958408679928

The share of "Coffee with you” establishments has a takeaway coffee feature: 0.59 (in percents is 59.0%)

Average Bill Difference between msk and spb: 94.73

There is an institution whose rating is 0
Feature occurs min times is Name: "pets" Occurs: "13"
Feature occurs max times is Name: "wi_fi" Occurs: "15248"

Writing to E:\PythonProjects\projects\mlYandex\source\csv\spb_datasets_eat_places.csv
End of data writing

Average Bill: 689.56 (it has shrunk)

There is not an institution whose have feature "view_on_ostankino_tower"


Видео