In [None]:
import numpy as np
import pandas as pd

In [None]:
def create_medications(names, counts):
    """Build a Series named "medications" holding ``counts`` indexed by ``names``."""
    medications = pd.Series(counts, index=names)
    medications.name = "medications"
    return medications

def get_percent(medications, name):
    """Return the share of ``name`` in the series total, as a percentage.

    Parameters
    ----------
    medications : pd.Series
        Numeric values indexed by label.
    name : hashable
        Label whose share is computed.

    Returns
    -------
    float
        ``medications[name] / total * 100``.

    Raises
    ------
    KeyError
        If ``name`` is not a label of ``medications``.
    """
    # Series.sum() is vectorised and skips missing values, so one NaN entry
    # does not turn every percentage into NaN (builtin sum() over .values did).
    total = medications.sum()
    return medications[name] / total * 100

In [None]:
# Sample stock: label -> count (insertion order is preserved).
stock = {'chlorhexidine': 15, 'cyntomycin': 18, 'afobazol': 7}
names = list(stock)
counts = list(stock.values())

med = create_medications(names=names, counts=counts)
perc = get_percent(med, name="chlorhexidine")

print(perc)

In [None]:
def create_companyDF(income, expenses, years):
    """Build a DataFrame with 'income' and 'expenses' columns indexed by ``years``."""
    columns = {'income': income, 'expenses': expenses}
    return pd.DataFrame(data=columns, index=years)

def get_profit(df, year):
    """Return income minus expenses for ``year``, or None if ``year`` is not in the index."""
    if year not in df.index:
        return None
    row = df.loc[year]
    return row.income - row.expenses

In [None]:
# Sample yearly figures; the three lists are aligned by position.
years = [2018, 2019, 2020]
income = [478, 512, 196]
expenses = [156, 130, 270]

df = create_companyDF(income=income, expenses=expenses, years=years)

profit_2018 = get_profit(df, 2018)
print(profit_2018)

---

In [None]:
# Read the raw CSV.
# NOTE(review): this path uses a capitalised "Data/" directory while later
# cells read from "data/" — confirm which one exists on case-sensitive systems.
data = pd.read_csv("Data/data.csv", sep=",")

# Columns that are cast to 64-bit integers in one pass.
int_columns = ['Postcode', 'Car', 'Bedroom', 'Bathroom', 'Propertycount', 'YearBuilt']
data = data.astype({column: 'int64' for column in int_columns})

# Independent working copy; `data` itself stays as loaded.
train = data.copy()

In [None]:
# Only the last expression of a cell is displayed by the notebook; the first
# two results are computed and discarded, exactly as before.

# Shape of the rows sold by "Nelson" above 3,000,000.
nelson_over_3m = (data.SellerG == "Nelson") & (data.Price > 3_000_000)
data.loc[nelson_over_3m].shape

# Mean price below 1,000,000 where Rooms > 5 or YearBuilt > 2015.
under_1m = data.Price < 1_000_000
large_or_recent = (data.Rooms > 5) | (data.YearBuilt > 2015)
data.loc[under_1m & large_or_recent, "Price"].mean()

# Region counts for Type "h" priced below 3,000,000.
type_h_under_3m = (data.Type == "h") & (data.Price < 3_000_000)
data.loc[type_h_under_3m, "Regionname"].value_counts()

In [None]:
# Remove the 'index' and 'Coordinates' columns.
# NOTE(review): the original comment says they duplicate Lattitude/Longtitude —
# confirm against the dataset.
train = train.drop(columns=['index', 'Coordinates'])

# Building area per room, where "rooms" = Rooms + Bedroom + Bathroom.
room_total = train['Rooms'].add(train['Bedroom']).add(train['Bathroom'])
train['MeanRoomsSquare'] = train['BuildingArea'].div(room_total)

# Normalised difference: (BuildingArea - Landsize) / (BuildingArea + Landsize).
building_area, land_size = train['BuildingArea'], train['Landsize']
train['AreaRatio'] = (building_area - land_size) / (building_area + land_size)

# NOTE(review): dates are parsed with pandas' default format inference —
# confirm whether the source strings are day-first.
train['Date'] = pd.to_datetime(train['Date'])
sale_date = train['Date'].dt

# Building age at sale time; YearBuilt is dropped once it has been used.
train['AgeBuilding'] = sale_date.year - train['YearBuilt']
train = train.drop(columns='YearBuilt')

# Calendar features of the sale date.
train['WeekdaySale'] = sale_date.weekday
train['MonthSale'] = sale_date.month
# 1 when WeekdaySale is 5 or 6, else 0.
train['Weekend'] = train['WeekdaySale'].isin([5, 6]).astype('int64')
#train[train["Weekend"] == 1]["Price"].mean()

# Keep the 49 most frequent sellers; everything else becomes "other".
top_sellers = train['SellerG'].value_counts().nlargest(49).index
train['SellerG'] = train['SellerG'].where(train['SellerG'].isin(top_sellers), "other")
#train[train["SellerG"] == "Nelson"]["Price"].min() / train[train["SellerG"] == "other"]["Price"].min()

def get_street_type(address):
    """Return the street-type token of ``address``.

    The street type is the last whitespace-separated token, unless that token
    is a one-letter direction suffix ('N', 'S', 'W', 'E'), in which case the
    token before it is returned.

    Parameters
    ----------
    address : str
        Address string.

    Returns
    -------
    str
        The street-type token; '' for an empty or blank address.
    """
    direction_suffixes = {'N', 'S', 'W', 'E'}
    # split() without arguments ignores leading/trailing/repeated whitespace,
    # so "12 Main St " no longer produces an empty street type.
    tokens = address.split()
    if not tokens:
        return ''
    # Step back over the suffix only when a previous token exists; a bare
    # "N" used to raise IndexError.
    if tokens[-1] in direction_suffixes and len(tokens) > 1:
        return tokens[-2]
    return tokens[-1]

# Street type of every address; only the 10 most frequent are kept,
# all other values are replaced by 'other'.
street_types = train['Address'].apply(get_street_type)
popular_stypes = street_types.value_counts().nlargest(10).index
is_popular_street = street_types.isin(popular_stypes)
train['StreetType'] = street_types.where(is_popular_street, 'other')

# Address is removed once StreetType has been derived from it.
train = train.drop(columns='Address')

# Same narrowing for Suburb: keep the 119 most frequent values.
popular_subtype = train["Suburb"].value_counts().nlargest(119).index
is_popular_suburb = train["Suburb"].isin(popular_subtype)
train["Suburb"] = train["Suburb"].where(is_popular_suburb, "other")

# unique_list = []
# for col in train.columns:
#     item = (col, train[col].nunique(),train[col].dtype) 
#     unique_list.append(item) 
# unique_counts = pd.DataFrame(
#     unique_list,
#     columns=['Column_Name', 'Num_Unique', 'Type']
# ).sort_values(by='Num_Unique',  ignore_index=True)

In [None]:
# Columns that are never converted, regardless of their cardinality.
cols_to_exclude = {'Date', 'Rooms', 'Bedroom', 'Bathroom', 'Car'}
# A column is treated as categorical when it has fewer unique values than this.
max_unique_count = 150

categorical_cols = [
    col
    for col in train.columns
    if col not in cols_to_exclude and train[col].nunique() < max_unique_count
]
# Suburb is always converted, even if it was not picked up by the threshold.
if "Suburb" not in categorical_cols:
    categorical_cols.append("Suburb")

train = train.astype(dict.fromkeys(categorical_cols, 'category'))

In [None]:
# Print a concise summary of `train` (columns, dtypes, non-null counts, memory usage).
train.info()

In [None]:
#all(map(lambda col: col in train.columns, []))
# Per-country inputs, aligned by position.
country_names = ['Англия', 'Канада', 'США', 'Россия', 'Украина', 'Беларусь', 'Казахстан']
population = [56.29, 38.05, 322.28, 146.24, 45.5, 9.5, 17.04]
square = [133396, 9984670, 9826630, 17125191, 603628, 207600, 2724902]

countries_df = pd.DataFrame({
    'country': country_names,
    'population': population,
    'square': square,
})

# density = population * 1,000,000 / square
# NOTE(review): the factor implies population is given in millions — confirm.
scaled_population = countries_df["population"] * 1_000_000
countries_df["density"] = scaled_population / countries_df["square"]

# Mean density across the listed countries (displayed as the cell output).
countries_df["density"].mean()


In [None]:
ufo = pd.read_csv("data/ufo.csv", sep=",")
ufo["Time"] = pd.to_datetime(ufo["Time"])

# Calendar day of each row whose State is "NV". normalize() truncates to
# midnight but keeps the datetime64 dtype, so diff() yields a real timedelta64
# Series. `.dt.date` produced object-dtype Python dates, and `.diff().dt` then
# relied on pandas inferring timedeltas from objects.
days = ufo.loc[ufo["State"] == "NV", "Time"].dt.normalize()

# Mean number of whole days between consecutive rows (in file order).
days.diff().dt.days.mean()

In [None]:
students = pd.read_csv("data/students.csv")
students.info()

# Frequently used columns, pulled out once.
ethnicity = students["race/ethnicity"]
lunch = students["lunch"]
math_score = students["math score"]
writing_score = students["writing score"]

# Only the last expression of the cell is displayed; the others are evaluated
# and discarded, exactly as before.
math_score.mean()
ethnicity.value_counts(normalize=True)
students.loc[students["test preparation course"] == "completed", "reading score"].mean()
students.loc[math_score == 0].shape
# Median writing score of group A minus mean writing score of group C.
writing_score[ethnicity == "group A"].median() - writing_score[ethnicity == "group C"].mean()
# Fraction of rows whose parental education is a bachelor's degree.
len(students[students["parental level of education"] == "bachelor's degree"]) / len(students)
math_score[lunch == "free/reduced"].mean()
math_score[lunch == "standard"].mean()

In [None]:
import re

experience_col = pd.Series([
        'Опыт работы 8 лет 3 месяца',
        'Опыт работы 3 года 5 месяцев',
        'Опыт работы 1 год 9 месяцев',
        'Опыт работы 3 месяца',
        'Опыт работы 6 лет'
        ])

# "<number> <year word>" and "<number> <month word>" fragments.
year_pattern = re.compile(r"\d+ (лет|год)")
month_pattern = re.compile(r"\d+ месяц")

# Print the experience of every entry converted to months.
for cell in experience_col:
    year_match = year_pattern.search(cell)
    month_match = month_pattern.search(cell)
    # The number is everything before the first space of the matched text.
    year_count = int(year_match.group().partition(" ")[0]) if year_match else 0
    month_count = int(month_match.group().partition(" ")[0]) if month_match else 0
    total = year_count * 12 + month_count
    print(total)

In [None]:
# Load data/citybike.csv into `bike` and print its column/dtype summary.
bike = pd.read_csv("data/citybike.csv")
bike.info()

In [None]:
def to_time_of_day(cell):
    """Map an hour value to a label.

    > 18 -> "evening", > 12 -> "day", > 6 -> "morning", anything else -> "night".
    """
    if cell > 18:
        return "evening"
    if cell > 12:
        return "day"
    if cell > 6:
        return "morning"
    return "night"
    

# Remove the station id columns.
bike = bike.drop(columns=["end station id", "start station id"])

# Age relative to 2018 (plain vectorised subtraction instead of a per-row
# lambda); the source column is removed afterwards.
bike["age"] = 2018 - bike["birth year"]
bike = bike.drop(columns=["birth year"])

# Parse the timestamps once and reuse them for every derived feature.
start_ts = pd.to_datetime(bike["starttime"])
stop_ts = pd.to_datetime(bike["stoptime"])

# Full duration in seconds (float). `.dt.seconds` returns only the seconds
# component of the timedelta (0..86399) and silently drops whole days, so any
# trip of 24 hours or more was under-reported.
bike["trip duration"] = (stop_ts - start_ts).dt.total_seconds()
#bike.drop(["starttime", "stoptime"], axis=1, inplace=True)

# 1 when the start weekday is 5 or 6, else 0.
bike["weekend"] = start_ts.dt.weekday.isin([5, 6]).astype("int64")
# Label derived from the start hour.
bike["time_of_day"] = start_ts.dt.hour.apply(to_time_of_day)

In [None]:
# Only the last expression of the cell is displayed; the others are evaluated
# and discarded.
bike.isnull().sum()
bike["bikeid"].value_counts()
bike["usertype"].value_counts(normalize=True)
bike["gender"].value_counts()
# The "birth year" column was dropped when `age` was derived
# (age = 2018 - birth year), so indexing it here raised KeyError on a fresh
# run. The latest birth year is recovered from the smallest age instead.
2018 - bike["age"].min()
bike["end station name"].value_counts()
bike[bike["age"] > 60].shape
bike["trip duration"].mean()

In [None]:
# Ratio of the number of "day" rows to the number of "night" rows.
day_rows = bike[bike["time_of_day"] == "day"]
night_rows = bike[bike["time_of_day"] == "night"]
len(day_rows) / len(night_rows)