In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import statistics
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import matthews_corrcoef
import category_encoders as ce

In [2]:
data = pd.read_csv('data/wine_cleared.zip')

In [3]:
data['price_round'] = data['price'].round().astype(int)

In [3]:
regex = '\d{4}' # регулярное выражение для нахождения чисел
data['year'] = data['title'].str.findall(regex).str.get(0)

In [5]:
data['is_usa'] = data['country'].apply(lambda x: 1 if x == 'US' else 0)

### 2.1. Выберите из списка две самых популярных (помимо США) страны, производящих вино

In [13]:
', '.join(list(data.country.value_counts().head(3)[1:3].index))

'France, Italy'

### 2.2. Создайте бинарные признаки is_france, is_italy

In [15]:
data['is_france'] = data['country'].apply(lambda x: 1 if x == 'France' else 0)
data['is_france'].sum()

22093

In [16]:
data['is_italy'] = data['country'].apply(lambda x: 1 if x == 'Italy' else 0)
data['is_italy'].sum()

19540

### 2.3. Создайте новый бинарный признак old_wine, где значение 1 будет обозначать, что вино старше 2010 года

In [12]:
data['old_wine'] = np.where(data['year'].astype(float) >= 2011, 0, 1)
data['old_wine'].sum()

56539

In [15]:
data['old_wine'] = data['year'].apply(lambda x: 0 if float(x) > 2010 else 1)
data['old_wine'].sum()

56539

### 2.7. Создайте новый признак locality из признака title, который будет обозначать название долины/местности производства вина.

In [47]:
regex = '\(([a-y,A-Y,\s]*?)\)'
data['locality'] = data.title.str.findall(regex).str.get(0)
data.locality

0                        Etna
1                       Douro
2           Willamette Valley
3         Lake Michigan Shore
4           Willamette Valley
                 ...         
129966                  Mosel
129967                 Oregon
129968                 Alsace
129969                 Alsace
129970                 Alsace
Name: locality, Length: 129971, dtype: object

In [51]:
country_population = pd.read_csv('data/country_population.zip', sep=';')
country_population

Unnamed: 0,country,population
0,China,1411778724
1,India,1386584581
2,US,333022386
3,Indonesia,271350000
4,Pakistan,225200000
...,...,...
236,Niue,1549
237,Tokelau,1501
238,Vatican City,825
239,Cocos Islands,573


### 3.1. Каково население Италии?

In [62]:
country_population[country_population.country == 'Italy'].population

24    59,097,904
Name: population, dtype: object

In [68]:
data = data.join(country_population.set_index('country'), on='country')

In [64]:
country_area = pd.read_csv('data/country_area.zip', sep=';')
country_area

Unnamed: 0,country,area
0,Russia,17075400.00
1,Canada,9984670.00
2,China,9596960.00
3,US,9372610.00
4,Brazil,8547000.00
...,...,...
188,San Marino,61.00
189,Tuvalu,26.00
190,Nauru,21.00
191,Monaco,2.00


In [69]:
data = data.join(country_area.set_index('country'), on='country')

### 3.2. Какая площадь страны у вина под названием 'Gård 2014 Grand Klasse Reserve Lawrence Vineyards Viognier (Columbia Valley (WA))'? 

In [70]:
mask = data.title == 'Gård 2014 Grand Klasse Reserve Lawrence Vineyards Viognier (Columbia Valley (WA))'
data[mask].area

94    9372610.0
Name: area, dtype: float64

In [73]:
calls_list = [
    [460, '2013-12-17 04:55:39', '2013-12-17 04:55:44', '2013-12-17 04:55:45'],
    [12, '2013-12-16 20:03:20', '2013-12-16 20:03:22', '2013-12-16 20:07:13'],
    [56, '2013-12-16 20:03:20', '2013-12-16 20:03:20', '2013-12-16 20:05:04'],
    [980, '2013-12-16 20:03:20','2013-12-16 20:03:27', '2013-12-16 20:03:29'],
    [396, '2013-12-16 20:08:27', '2013-12-16 20:08:28','2013-12-16 20:12:03'],
    [449, '2013-12-16 20:03:20', '2013-12-16 20:03:25','2013-12-16 20:05:00'],
    [397, '2013-12-16 20:08:25', '2013-12-16 20:08:27', '2013-12-16 20:09:59'],
    [398, '2013-12-16 20:01:23', '2013-12-16 20:01:23', '2013-12-16 20:04:58'],
    [452, '2013-12-16 20:03:20', '2013-12-16 20:03:21','2013-12-16 20:04:55'],
    [440, '2013-12-16 20:03:20', '2013-12-16 20:04:26', '2013-12-16 20:04:32']
]

calls = pd.DataFrame(calls_list, columns = ['client_id',  'agent_date', 'created_at' ,'end_date'])
calls['agent_date'] = pd.to_datetime(calls['agent_date'])
calls['created_at'] = pd.to_datetime(calls['created_at'])
calls['end_date'] = pd.to_datetime(calls['end_date'])

calls

Unnamed: 0,client_id,agent_date,created_at,end_date
0,460,2013-12-17 04:55:39,2013-12-17 04:55:44,2013-12-17 04:55:45
1,12,2013-12-16 20:03:20,2013-12-16 20:03:22,2013-12-16 20:07:13
2,56,2013-12-16 20:03:20,2013-12-16 20:03:20,2013-12-16 20:05:04
3,980,2013-12-16 20:03:20,2013-12-16 20:03:27,2013-12-16 20:03:29
4,396,2013-12-16 20:08:27,2013-12-16 20:08:28,2013-12-16 20:12:03
5,449,2013-12-16 20:03:20,2013-12-16 20:03:25,2013-12-16 20:05:00
6,397,2013-12-16 20:08:25,2013-12-16 20:08:27,2013-12-16 20:09:59
7,398,2013-12-16 20:01:23,2013-12-16 20:01:23,2013-12-16 20:04:58
8,452,2013-12-16 20:03:20,2013-12-16 20:03:21,2013-12-16 20:04:55
9,440,2013-12-16 20:03:20,2013-12-16 20:04:26,2013-12-16 20:04:32


In [74]:
# Подсчитаем разницу между датой и временем начала разговора с клиентом и датой и временем окончания звонка
calls['duration'] = (calls['end_date'] - calls['created_at']).dt.seconds
calls

Unnamed: 0,client_id,agent_date,created_at,end_date,duration
0,460,2013-12-17 04:55:39,2013-12-17 04:55:44,2013-12-17 04:55:45,1
1,12,2013-12-16 20:03:20,2013-12-16 20:03:22,2013-12-16 20:07:13,231
2,56,2013-12-16 20:03:20,2013-12-16 20:03:20,2013-12-16 20:05:04,104
3,980,2013-12-16 20:03:20,2013-12-16 20:03:27,2013-12-16 20:03:29,2
4,396,2013-12-16 20:08:27,2013-12-16 20:08:28,2013-12-16 20:12:03,215
5,449,2013-12-16 20:03:20,2013-12-16 20:03:25,2013-12-16 20:05:00,95
6,397,2013-12-16 20:08:25,2013-12-16 20:08:27,2013-12-16 20:09:59,92
7,398,2013-12-16 20:01:23,2013-12-16 20:01:23,2013-12-16 20:04:58,215
8,452,2013-12-16 20:03:20,2013-12-16 20:03:21,2013-12-16 20:04:55,94
9,440,2013-12-16 20:03:20,2013-12-16 20:04:26,2013-12-16 20:04:32,6


### 4.1. сколько секунд тратят сотрудники компании на дозвон клиенту. Результат запишите в новый признак time_connection.

In [77]:
calls['time_connection'] = (calls.created_at - calls.agent_date).dt.seconds
calls['time_connection'].sum()

89

### 4.2. Создайте новый признак is_connection — факт соединения с клиентом. Признак будет равен 1 в случае, если разговор состоялся и продлился больше 10 секунд, иначе — 0.

In [79]:
calls['is_connection'] = calls.duration.apply(lambda x: 1 if x > 10 else 0)
calls['is_connection'].sum()

7

### 4.3. Создайте признак time_diff — разницу в секундах между началом звонка(не разговора) и его окончанием

In [81]:
calls['time_diff'] = (calls.end_date - calls.agent_date).dt.seconds
calls.time_diff.sum()

1144

In [82]:
calls = calls.drop(columns=['agent_date', 'created_at' ,'end_date'], axis=1)

### 4.5. Создайте признак количество дней с момента произведения вина — years_diff для датасета винных обзоров. За дату отсчёта возьмите 12 января 2022 года. В ответ впишите максимальное количество дней с момента произведения вина

In [87]:
data.year.info()

<class 'pandas.core.series.Series'>
Int64Index: 129972 entries, 0 to 129970
Series name: year
Non-Null Count   Dtype 
--------------   ----- 
125363 non-null  object
dtypes: object(1)
memory usage: 2.0+ MB


In [97]:
data['years_diff'] = (pd.to_datetime('2022-01-12') - pd.to_datetime(data.year, format='%Y-%m-%d', errors='coerce')).dt.days
data['years_diff'].max()

98627.0

In [99]:
clothing_list = [
    ['xxs', 'dress'],
    ['xxs', 'skirt'],
    ['xs', 'dress'],
    ['s', 'skirt'],
    ['m', 'dress'],
    ['l', 'shirt'],
    ['s', 'coat'],
    ['m', 'coat'],
    ['xxl', 'shirt'],
    ['l', 'dress']
]

clothing = pd.DataFrame(clothing_list, columns = ['size',  'type'])
clothing

Unnamed: 0,size,type
0,xxs,dress
1,xxs,skirt
2,xs,dress
3,s,skirt
4,m,dress
5,l,shirt
6,s,coat
7,m,coat
8,xxl,shirt
9,l,dress


In [100]:
ord_encoder = ce.OrdinalEncoder()
data_bin = ord_encoder.fit_transform(clothing[['size', 'type']])
clothing = pd.concat([clothing, data_bin], axis=1)

clothing

Unnamed: 0,size,type,size.1,type.1
0,xxs,dress,1,1
1,xxs,skirt,1,2
2,xs,dress,2,1
3,s,skirt,3,2
4,m,dress,4,1
5,l,shirt,5,3
6,s,coat,3,4
7,m,coat,4,4
8,xxl,shirt,6,3
9,l,dress,5,1


In [105]:
ord_encoder = ce.OrdinalEncoder(cols=['year'])
year_cod = ord_encoder.fit_transform(data.year)
data = pd.concat([data, year_cod], axis=1)
data

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,...,year,is_usa,is_france,is_italy,old_wine,locality,population,area,years_diff,year.1
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,35.363389,Sicily & Sardinia,Etna,Kerin O’Keefe,@kerinokeefe,...,2013,0,0,1,0,Etna,59097904,301230.0,3298.0,1
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.000000,Douro,unknown,Roger Voss,@vossroger,...,2011,0,0,0,0,Douro,10347892,92391.0,4029.0,2
2,2,US,"Tart and snappy, the flavors of lime flesh and...",unknown,87,14.000000,Oregon,Willamette Valley,Paul Gregutt,@paulgwine,...,2013,1,0,0,0,Willamette Valley,333022386,9372610.0,3298.0,1
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.000000,Michigan,Lake Michigan Shore,Alexander Peartree,unknown,...,2013,1,0,0,0,Lake Michigan Shore,333022386,9372610.0,3298.0,1
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.000000,Oregon,Willamette Valley,Paul Gregutt,@paulgwine,...,2012,1,0,0,0,Willamette Valley,333022386,9372610.0,3664.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.000000,Mosel,unknown,Anna Lee C. Iijima,unknown,...,2013,0,0,0,0,Mosel,83129285,357022.0,3298.0,1
129967,129967,US,Citation is given as much as a decade of bottl...,unknown,90,75.000000,Oregon,Oregon,Paul Gregutt,@paulgwine,...,2004,1,0,0,1,Oregon,333022386,9372610.0,6586.0,12
129968,129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.000000,Alsace,Alsace,Roger Voss,@vossroger,...,2013,0,1,0,0,Alsace,68035000,547030.0,3298.0,1
129969,129969,France,"A dry style of Pinot Gris, this is crisp with ...",unknown,90,32.000000,Alsace,Alsace,Roger Voss,@vossroger,...,2012,0,1,0,0,Alsace,68035000,547030.0,3664.0,3


In [106]:
encoder = ce.OneHotEncoder(cols=['type']) # указываем столбец для кодирования
type_bin = encoder.fit_transform(clothing['type'])
clothing = pd.concat([clothing, type_bin], axis=1)

clothing

AttributeError: 'DataFrame' object has no attribute 'unique'