# Exercise 01 : Basic operations

In [1]:
import pandas as pd

df = pd.read_csv('../feed-views.log', skiprows=[2, 3], skipfooter=2, engine='python', names=['datetime', 'user'], delimiter='\t')

## Create a dataframe views with two columns: datetime and user by reading feed-views.log

- convert the datetime to the datetime64[ns] Dtype
- extract the year, month, day, hour, minute, and second from the values of that column to the new columns

In [2]:
# Convert 'datetime' column to datetime64[ns]
df['datetime'] = pd.to_datetime(df['datetime'])

# Extract year, month, day, hour, minute, and second
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
df['second'] = df['datetime'].dt.second

## Create the new column daytime

- you need to assign the particular time of day value if an hour is within a particular interval, for example, afternoon if the hour is larger than 11 and less or equal to 17
- 0 – 3.59 night, 4 – 6.59 early morning, 7 – 10.59 morning, 11 – 16.59 afternoon, 17 – 19.59 early evening, 20 – 23.59 evening
- use the method cut to solve this subtask
- assign the column user as the index

In [3]:
# Create 'daytime' column
conditions = [
    (df['hour'] >= 0) & (df['hour'] <= 3),
    (df['hour'] >= 4) & (df['hour'] <= 6),
    (df['hour'] >= 7) & (df['hour'] <= 10),
    (df['hour'] >= 11) & (df['hour'] <= 16),
    (df['hour'] >= 17) & (df['hour'] <= 19),
    (df['hour'] >= 20) & (df['hour'] <= 23)
]
values = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']
df['daytime'] = pd.cut(df['hour'], bins=6, labels=values, include_lowest=True)

# Set 'user' column as index
df.set_index('user', inplace=True)

## Calculate the number of elements in your dataframe

- use the method count()
- calculate the number of elements in each time of day category using the method value_counts()

In [4]:
# Number of elements
num_elements = df['daytime'].count()
print(f"Number of elements: {num_elements}")

# Number of elements in each time of day category
time_of_day_counts = df['daytime'].value_counts()
print(f"Number of elements in each time of day category:\n{time_of_day_counts}")

Number of elements: 1072
Number of elements in each time of day category:
daytime
evening          508
early evening    192
afternoon        165
night            129
morning           73
early morning      5
Name: count, dtype: int64


## Sort values in your dataframe by hour, minute, and second in ascending order (simultaneously and not one by one)

In [5]:
df.sort_values(by=['hour', 'minute', 'second'], inplace=True)

## Calculate the minimum and maximum for the hours and the mode for the daytime categories

- calculate the maximum of hour for the rows where the time of day is night
- calculate the minimum of hour for the rows where the time of day is morning
- In addition to this, find out who visited the page at those hours (make one example from that)
- calculate the mode for the hour and daytime

In [6]:
night_max_hour = df.loc[df['daytime'] == 'night', 'hour'].max()
print(f"Night's max hour: {night_max_hour}")

morning_min_hour = df.loc[df['daytime'] == 'morning', 'hour'].min()
print(f"Morning's min hour: {morning_min_hour}")

morning_min_user = df.loc[df['hour'] == morning_min_hour].index[0]
print(f"Morning's min user: {morning_min_user}")

hour_mode = df['hour'].mode()[0]
print(f"Hour's mode: {hour_mode}")

daytime_mode = df['daytime'].mode()[0]
print(f"Daytime's mode: {daytime_mode}")

Night's max hour: 3
Morning's min hour: 8
Morning's min user: alexander
Hour's mode: 22
Daytime's mode: evening


## Show the 3 earliest hours in the morning and the corresponding usernames and the 3 latest hours and the usernames using nsmallest() and nlargest()

In [7]:
earliest_morning = df.loc[df['daytime'] == 'morning'].nsmallest(3, 'hour')
print("Earliest morning:")
earliest_morning

Earliest morning:


Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning
alexander,2020-05-15 09:02:24.999438,2020,5,15,9,2,24,morning


In [8]:
latest_hours = df.nlargest(3, 'hour')
print("Latest hours:")
latest_hours

Latest hours:


Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ekaterina,2020-05-14 23:02:11.327532,2020,5,14,23,2,11,evening
ekaterina,2020-05-14 23:02:14.494985,2020,5,14,23,2,14,evening
ekaterina,2020-05-14 23:02:15.588808,2020,5,14,23,2,15,evening


## Use the method describe() to get the basic statistics for the columns

- to find out what the most popular interval for visiting the page is, calculate the interquartile range for the hour by extracting values from the result of the describe() method and store it in the variable iqr

In [9]:
describe_stats = df.describe()

iqr = describe_stats.loc['75%', 'hour'] - describe_stats.loc['25%', 'hour']

print(f"Interquartile range for the hour: {iqr}")

Interquartile range for the hour: 9.0
