In [3]:
import pandas as pd
import numpy as np
import os

# name magic constants explicitly (ideally declared once, at the top of the script)
DATA_DIR = "data"
# prefer descriptive variable names over cryptic ones
crime06_filename = "CrimeOneYearofData_2006.xlsx"
crime07_filename = "CrimeOneYearofData_2007.xlsx"

# show the full relative path of each workbook, one per line
print(
    os.path.join(DATA_DIR, crime06_filename),
    os.path.join(DATA_DIR, crime07_filename),
    sep="\n",
)

data/CrimeOneYearofData_2006.xlsx
data/CrimeOneYearofData_2007.xlsx


In [4]:
# revenue (in $) attributed to each marketing campaign
camp1_revenue = 50_000
camp2_revenue = 100_000

#### BAD WAY ####
# calc which performed better -- the bare 0.65 is an unexplained magic number
camps_revenue_diff = camp2_revenue * 0.65 - camp1_revenue
#### END BAD WAY ####

#### GOOD WAY ####
# we need to normalize because the campaign ran in peak season
CAMP2_NORMALIZER = 0.65
# calc which performed better, using the named (and explained) constant
camps_revenue_diff = camp2_revenue * CAMP2_NORMALIZER - camp1_revenue
#### END GOOD WAY ####

In [5]:
# Our original function
def mystery_combine(a, b, times):
    """Combine ``a`` and ``b`` with ``+``, then repeat the result with ``* times``."""
    combined = a + b
    return combined * times

# numbers: (2 + 3) * 4
print(mystery_combine(2, 3, times=4))
# 20

# strings: 'Hello ' + 'World! ' repeated 4 times
print(mystery_combine('Hello ', 'World! ', times=4))
# Hello World! Hello World! Hello World! Hello World!

# show your intent explicitly by annotating the types of the arguments and of the returned value
def mystery_combine(a: str, b: str, times: int) -> str:
    """Concatenate ``a`` and ``b`` and repeat the joined string ``times`` times."""
    joined = a + b
    return joined * times

20
Hello World! Hello World! Hello World! Hello World! 


In [6]:
import pandas as pd
import os

DATA_PATH = "data"
filename_list = os.listdir(DATA_PATH)

# read in a bunch of csv-s from a dir (explicit loop-and-append version)
csv_list = []
# the loop variable has to be the very name the body reads (`filename`);
# otherwise the body raises NameError or silently reuses a stale value
for filename in filename_list:
    csv_list.append(pd.read_csv(os.path.join(DATA_PATH, filename)))

In [9]:
import pandas as pd
import os

DATA_PATH = "data"
filename_list = os.listdir(DATA_PATH)

#### GOOD WAY ####
# a single list comprehension replaces the explicit loop-and-append
csv_list = [
    pd.read_csv(os.path.join(DATA_PATH, filename))
    for filename in filename_list
]
# what if not only .csv-s are present? easy to tackle: filter inside the comprehension
csv_list = [
    pd.read_csv(os.path.join(DATA_PATH, filename))
    for filename in filename_list
    if filename.endswith(".csv")
]
#### END GOOD WAY ####

In [10]:
# let's aggregate clicks and time spent to their rolling mean over a window of 3 rows per id
# ("_Q" presumably stands for quarter, assuming one row per month -- confirm)
var_list = ["clicks", "time_spent"]
var_list_Q = [f"{varname}_Q" for varname in var_list]

#### BAD WAY ####
# the whole pipeline crammed into one unreadable line
df_Q = df.groupby("id").rolling(window=3, min_periods=1, on="yearmonth")[var_list].mean().reset_index().rename(columns=dict(zip(var_list, var_list_Q)))
#### END BAD WAY ####

#### GOOD WAY ####
# same pipeline, one method call per line inside parentheses
df_Q = (
    df
    .groupby("id")
    .rolling(window=3, min_periods=1, on="yearmonth")[var_list]
    .mean()
    .reset_index()
    .rename(columns=dict(zip(var_list, var_list_Q)))
)
#### END GOOD WAY ####

In [11]:
import datetime
from dateutil.relativedelta import relativedelta

# task: get months between two dates in YM format

#### BAD WAY ####
# walk the months as plain YYYYMM integers and hop over the
# non-existent month numbers by hand
start_num = 201910
end_num = 202012

res_list = []
iter_num = start_num
while iter_num < end_num:
    # month part ran past 12 (e.g. 201913): +88 lands on January of the next year
    if abs(iter_num) % 100 > 12:
        iter_num += 88
    res_list.append(iter_num)
    iter_num += 1
# the loop exits before appending the value it stopped on, so add it here
res_list.append(iter_num)
#### END BAD WAY ####

#### GOOD WAY ####
# initialize datetimes (first day of the start and end month)
start_datetime = datetime.datetime(2019, 10, 1)
end_datetime = datetime.datetime(2020, 12, 1)

# find the number of whole months between the end and the start date
r = relativedelta(end_datetime, start_datetime)
months_between = r.years * 12 + r.months

# step forward one month at a time and format each date as YYYYMM
# NOTE(review): offsets start at 1, so the start month itself (201910) is not
# part of the result, while the end month is -- confirm that this is intended
myres = [
    (start_datetime + relativedelta(months=month_offset)).strftime("%Y%m")
    for month_offset in range(1, months_between + 1)
]
#### END GOOD WAY ####

In [12]:
# display the list of YYYYMM strings as the cell output
myres

['201911',
 '201912',
 '202001',
 '202002',
 '202003',
 '202004',
 '202005',
 '202006',
 '202007',
 '202008',
 '202009',
 '202010',
 '202011',
 '202012']