# Dataframe merging

# Unit 1

## Hashing

In [1]:
import pandas as pd

In [None]:
orders = pd.read_csv('orders.txt')
orders.head()

In [2]:
import hashlib

In [4]:
hashlib.sha256('username@yandex.ru'.encode('utf-8')).hexdigest()

'2388d885ffef200977eefc0a585a578bc30e00cc0b1cde0678ed49ef04659d51'

In [10]:
hashlib.sha256('username@yandex1.ru'.encode('utf-8')).hexdigest()

'd1930e0bed8e793a6d6907159939df5ee4d02a75b1ebcb0908c06f8bb66a6433'

In [None]:
def hash_name(row):
    return hashlib.sha256(row['tel'].encode('utf-8')).hexdigest()

In [None]:
orders['hash_tel'] = orders.apply(hash_name, axis = 1)

# Unit 2

### Merge

In [12]:
import pandas as pd
import numpy as np
# NumPy — это библиотека языка Python, добавляющая поддержку больших многомерных массивов
# и матриц вместе с большой библиотекой высокоуровневых (и очень быстрых)
# математических функций для операций с этими массивами
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
len(movies[ movies['genres'].str.contains('Fantasy', case = False)])
#movies['genres'].str.contains('Fantasy').sum()

654

In [16]:
joined = ratings.merge(movies, on='movieId', how='left')
#Для объединения по нескольким столбцам используйте on = ['col1', 'col2'] или left_on и right_on

In [17]:
joined.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
5,1,1263,2.0,1260759151,"Deer Hunter, The (1978)",Drama|War
6,1,1287,2.0,1260759187,Ben-Hur (1959),Action|Adventure|Drama
7,1,1293,2.0,1260759148,Gandhi (1982),Drama
8,1,1339,3.5,1260759125,Dracula (Bram Stoker's Dracula) (1992),Fantasy|Horror|Romance|Thriller
9,1,1343,2.0,1260759131,Cape Fear (1991),Thriller


In [18]:
#После объединения датафреймов лучше проверять, не возникли ли дубликаты
len(joined) == len(ratings)

True

In [19]:
direct_stats = pd.read_csv('direct_stats.tsv', sep = '\t')

In [20]:
direct_stats.head()

Unnamed: 0,date,campaign,views,clicks,cost
0,2018-01-01,landings_promo,38120423,49557,1139801
1,2018-01-01,homepage_partner_1,5729483,12605,189073
2,2018-01-01,homepage_partner_2,4412029,9265,176040
3,2018-01-01,socdem_w_25-34_vip_test,913823,2559,89555
4,2018-01-02,landings_promo,40873806,61311,1471457


In [23]:
crm_stats = pd.read_csv('crm_stats.tsv', sep = '\t')

In [24]:
crm_stats.head()

Unnamed: 0,date,campaign,orders
0,2018-01-01,landings_promo,1487
1,2018-01-01,homepage_partner_1,386
2,2018-01-01,homepage_partner_2,315
3,2018-01-01,socdem_w_25-34_vip_test,85
4,2018-01-02,landings_promo,1605


In [25]:
joined_stats = direct_stats.merge(crm_stats, on=['date', 'campaign'], how='left')

In [26]:
joined_stats.head()

Unnamed: 0,date,campaign,views,clicks,cost,orders
0,2018-01-01,landings_promo,38120423,49557,1139801,1487
1,2018-01-01,homepage_partner_1,5729483,12605,189073,386
2,2018-01-01,homepage_partner_2,4412029,9265,176040,315
3,2018-01-01,socdem_w_25-34_vip_test,913823,2559,89555,85
4,2018-01-02,landings_promo,40873806,61311,1471457,1605


In [27]:
len(joined_stats) == len(direct_stats)

True

In [28]:
joined_stats['order_cost'] = joined_stats['cost']/ joined_stats['orders']
joined_stats.head()

Unnamed: 0,date,campaign,views,clicks,cost,orders,order_cost
0,2018-01-01,landings_promo,38120423,49557,1139801,1487,766.510424
1,2018-01-01,homepage_partner_1,5729483,12605,189073,386,489.826425
2,2018-01-01,homepage_partner_2,4412029,9265,176040,315,558.857143
3,2018-01-01,socdem_w_25-34_vip_test,913823,2559,89555,85,1053.588235
4,2018-01-02,landings_promo,40873806,61311,1471457,1605,916.795639


In [30]:
round(joined_stats[ (joined_stats['campaign'] == 'landings_promo') & (joined_stats['date']=='2018-01-01')]['order_cost'])

0    767.0
Name: order_cost, dtype: float64

In [32]:
genres = ['Drama', 'Action', 'Thriller']

def genre_rating(row):
    return pd.Series([row['rating'] if genre in row['genres'] else np.NaN for genre in genres])            

In [34]:
%%time
joined[genres] = joined.apply(genre_rating, axis = 1)
joined.head()

Wall time: 1min 6s


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Action,Thriller
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,2.5,,
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,3.0,,
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,,,3.0
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,,2.0,2.0
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,4.0,,


In [35]:
for genre in genres:
    print('{} mean rating {:.2f}'.format(genre, joined[genre].mean()))

Drama mean rating 3.68
Action mean rating 3.45
Thriller mean rating 3.52


In [41]:
#genres = ['Drama', 'Action', 'Thriller', 'Comedy', 'Romance', 'War', 'Mystery', 'Crime']
genres2 = ['Comedy', 'Romance', 'War', 'Mystery', 'Crime']

In [44]:
def genre_rat(row):
    return pd.Series([row['rating'] if genre in row['genres'] else np.NaN for genre in genres2])     

In [45]:
%%time 
joined[genres2] = joined.apply(genre_rat, axis = 1)
joined.head()

Wall time: 1min 12s


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Action,Thriller,Comedy,Romance,War,Mystery,Crime
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,2.5,,,,,,,
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,3.0,,,,,,,
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,,,3.0,,,,,
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,,2.0,2.0,,,,,
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,4.0,,,,,,,


In [50]:
genres3 = ['Drama', 'Action', 'Thriller', 'Comedy', 'Romance', 'War', 'Mystery', 'Crime']
for genre in genres3:
    print(genre, len(joined[genre]) - joined[genre].isna().sum())

Drama 44752
Action 27056
Thriller 25240
Comedy 38026
Romance 19336
War 5025
Mystery 7625
Crime 16266


# Unit 2

### Duplicate removal


In [52]:
import pandas as pd
ratings = pd.read_csv('ratings_example.txt', sep = '\t')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144


In [53]:
movies = pd.read_csv('movies_example.txt', sep = '\t')
movies.head()

Unnamed: 0,movieId,title,genres
0,31,Dangerous Minds (1995),Drama
1,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
2,31,Dangerous Minds (1995),Drama


In [55]:
movies.drop_duplicates(subset = 'movieId', keep = 'first', inplace = True)
movies.head()

Unnamed: 0,movieId,title,genres
0,31,Dangerous Minds (1995),Drama
1,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [56]:
ratings.merge(movies, on='movieId', how = 'left')

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama


In [57]:
movies = pd.read_csv('movies_example.txt', sep = '\t')
movies.head()

Unnamed: 0,movieId,title,genres
0,31,Dangerous Minds (1995),Drama
1,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
2,31,Dangerous Minds (1995),Drama


In [60]:
ratings.merge(movies, on='movieId', how='right')

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,31,2.5,1260759000.0,Dangerous Minds (1995),Drama
1,1.0,31,2.5,1260759000.0,Dangerous Minds (1995),Drama
2,,32,,,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [61]:
movies = pd.read_csv('movies_example.txt', sep = '\t')
movies.head()

Unnamed: 0,movieId,title,genres
0,31,Dangerous Minds (1995),Drama
1,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
2,31,Dangerous Minds (1995),Drama


In [62]:
ratings.merge(movies, on='movieId', how='left')

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,1,31,2.5,1260759144,Dangerous Minds (1995),Drama


In [63]:
movies = pd.read_csv('movies_example.txt', sep = '\t')
movies.head()

Unnamed: 0,movieId,title,genres
0,31,Dangerous Minds (1995),Drama
1,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
2,31,Dangerous Minds (1995),Drama


In [64]:
ratings.merge(movies, on='movieId', how='inner')

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,1,31,2.5,1260759144,Dangerous Minds (1995),Drama


In [65]:
movies = pd.read_csv('movies_example.txt', sep = '\t')
movies.head()

Unnamed: 0,movieId,title,genres
0,31,Dangerous Minds (1995),Drama
1,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
2,31,Dangerous Minds (1995),Drama


In [66]:
ratings.merge(movies, on='movieId', how='outer')

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,31,2.5,1260759000.0,Dangerous Minds (1995),Drama
1,1.0,31,2.5,1260759000.0,Dangerous Minds (1995),Drama
2,,32,,,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [None]:
ratings = pd.read_csv('ratings_example.txt', sep = '\t')

movies = pd.read_csv('movies_example.txt', sep = '\t')

print(ratings.merge(movies, on='movieId', how='left'))
print(ratings.merge(movies, on='movieId', how='right'))
print(ratings.merge(movies, on='movieId', how='inner'))
print(ratings.merge(movies, on='movieId', how='outer'))

In [77]:
years = [str(x) for x in range(1950,2011)]

In [78]:
years

['1950',
 '1951',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1957',
 '1958',
 '1959',
 '1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010']

In [85]:
def year_class(row):
    for year in years:
        if year in row['title']:
            return year
    return '1900'

In [86]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [87]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [88]:
joined = ratings.merge(movies, on = 'movieId', how = 'right')

In [89]:
joined['year'] = joined.apply(year_class, axis=1)

In [90]:
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1.0,31,2.5,1260759000.0,Dangerous Minds (1995),Drama,1995
1,7.0,31,3.0,851868800.0,Dangerous Minds (1995),Drama,1995
2,31.0,31,4.0,1273542000.0,Dangerous Minds (1995),Drama,1995
3,32.0,31,4.0,834828400.0,Dangerous Minds (1995),Drama,1995
4,36.0,31,3.0,847057200.0,Dangerous Minds (1995),Drama,1995


In [96]:
joined.groupby('year').mean().sort_values('rating', ascending = False)

Unnamed: 0_level_0,userId,movieId,rating,timestamp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1957,360.933544,3001.949527,4.014241,1.083707e+09
1972,359.694878,3983.538976,4.011136,1.122759e+09
1952,346.394737,4462.532468,4.000000,1.090512e+09
1954,358.228324,2867.661850,3.994220,1.070591e+09
1951,347.106996,2605.588477,3.983539,1.052714e+09
...,...,...,...,...
2005,348.509719,37156.244600,3.448434,1.273459e+09
2003,340.186204,8166.840972,3.444777,1.238031e+09
1996,347.970249,910.870646,3.426600,1.020020e+09
1997,357.747044,1970.257389,3.415764,1.085420e+09
