In [1]:
import difflib
import itertools
import pandas as pd


class Merger(object):
    """Group rows of a user table that appear to describe the same person.

    The table is read from a ``;``-separated CSV that provides the columns
    ``id_u``, ``email`` and ``tel``.  Two rows are treated as the same user
    when their e-mails or their phone numbers are similar enough.  Matching is
    transitive: if A matches B and B matches C, all three land in one group.

    Typical use::

        merger = Merger('users.csv')
        merger.find_not_unique_users()
        merger.find_unique_users()
        merger.get_users_data('merged.csv')

    Attributes:
        table: The DataFrame parsed from the CSV.
        rows: One ``{column: value}`` dict per table row, in table order.
        not_unique_users: List of id groups (lists) that share a user; filled
            by ``find_not_unique_users``.
        unique_users: Ids that matched nobody (plain values) followed by the
            groups from ``not_unique_users``; filled by ``find_unique_users``.
    """

    # Minimum difflib.SequenceMatcher ratio for two values to count as a match.
    email_uniqueness = 0.95
    tel_uniqueness = 0.66

    def __init__(self, table_file_name, verbose=True):
        """Load the user table.

        Args:
            table_file_name: Path or file-like object accepted by
                ``pandas.read_csv``.
            verbose: When true (the default), print the parsed table.
        """
        self.table = pd.read_csv(table_file_name, sep=';')
        if verbose:
            print(self.table)
        self.rows = self.table.to_dict(orient='records')
        # Results live on the instance: class-level lists would be shared (and
        # wiped) by every other Merger that gets constructed.
        self.not_unique_users = []
        self.unique_users = []

    def _is_same_user(self, first, second):
        """Return True when two row dicts match on e-mail or on phone number."""
        if self.similarity(first['email'], second['email']) >= self.email_uniqueness:
            return True
        return self.similarity(first['tel'], second['tel']) >= self.tel_uniqueness

    def find_not_unique_users(self):
        """Fill ``not_unique_users`` with the groups of ids sharing one user.

        Every pair of rows is compared and matching rows are joined with a
        disjoint-set structure, so a pair that links two already-formed groups
        merges them into one instead of leaving two overlapping groups.  Ids
        inside a group, and the groups themselves, follow table order.  The
        previous content of ``not_unique_users`` is replaced.

        NOTE(review): assumes ``id_u`` values are unique per row - confirm
        against the input data.
        """
        parent = list(range(len(self.rows)))

        def root(index):
            # Walk up to the representative, shortening the path on the way.
            while parent[index] != index:
                parent[index] = parent[parent[index]]
                index = parent[index]
            return index

        indexed_rows = enumerate(self.rows)
        for (i, first), (j, second) in itertools.combinations(indexed_rows, 2):
            if self._is_same_user(first, second):
                root_i, root_j = root(i), root(j)
                if root_i != root_j:
                    parent[root_j] = root_i

        components = {}
        for index, row in enumerate(self.rows):
            components.setdefault(root(index), []).append(row['id_u'])

        self.not_unique_users = [
            list(dict.fromkeys(ids))  # drop repeated ids, keep table order
            for ids in components.values()
            if len(ids) > 1
        ]

    def find_unique_users(self):
        """Fill ``unique_users`` with one entry per distinct user.

        Ids that belong to no group come first (as plain values, in table
        order), followed by the groups from ``not_unique_users``.  Call
        ``find_not_unique_users`` beforehand; without it no groups exist and
        every id is reported as unique.  The list is rebuilt on every call, so
        calling this method repeatedly does not duplicate entries.
        """
        grouped_ids = {
            user_id for group in self.not_unique_users for user_id in group
        }
        singles = [
            row['id_u'] for row in self.rows if row['id_u'] not in grouped_ids
        ]
        self.unique_users = singles + list(self.not_unique_users)

    def get_users_data(self, new_table_file_name):
        """Collect ids, e-mails and phones per distinct user and save them.

        Args:
            new_table_file_name: Path or writable buffer handed to
                ``DataFrame.to_csv``; written with ``;`` as separator and
                without the index column.

        Returns:
            A DataFrame with the columns ``ids``, ``emails`` and ``tels``,
            one row per entry of ``unique_users``; each cell holds a list.
        """
        records = []
        for user in self.unique_users:
            # Entries are either a single id or a list of ids for one user.
            ids = list(user) if isinstance(user, (list, tuple)) else [user]
            matching = self.table.loc[self.table['id_u'].isin(ids)]
            records.append({'ids': ids,
                            'emails': matching['email'].tolist(),
                            'tels': matching['tel'].tolist()})
        # Explicit columns keep the header stable even when there are no users.
        users_data = pd.DataFrame(records, columns=['ids', 'emails', 'tels'])
        users_data.to_csv(new_table_file_name, index=False, sep=';')
        return users_data

    def similarity(self, s1, s2):
        """Return the case-insensitive SequenceMatcher ratio of two values.

        Args:
            s1: First scalar value; converted with ``str`` before comparing.
            s2: Second scalar value; converted with ``str`` before comparing.

        Returns:
            A float in ``[0, 1]``.  Missing values (``None``/NaN) never match
            anything and yield ``0.0``.
        """
        if pd.isna(s1) or pd.isna(s2):
            return 0.0
        normalized1 = str(s1).lower()
        normalized2 = str(s2).lower()
        matcher = difflib.SequenceMatcher(None, normalized1, normalized2)
        return matcher.ratio()

In [2]:
# Load data1.csv into a Merger; the constructor parses the ';'-separated file
# and prints the resulting table.
merge1 = Merger('data1.csv')

   id_u             email     tel
0     1     12345@mail.ru     123
1     2     12345@mail.ru     124
2     3    123456@mail.ru     124
3     4   1234567@mail.ru     124
4     5  12345678@mail.ru     124
5     6    123456@mail.ru  123435
6     7    123456@mail.ru  235123
7     8    123456@mail.ru  534534


In [3]:
# Order matters: the groups must exist before the remaining ids are collected,
# because find_unique_users reads not_unique_users.
merge1.find_not_unique_users()
merge1.find_unique_users()
# Writes one row per distinct user to unique_users.csv and, as the last
# expression of the cell, displays the returned DataFrame.
merge1.get_users_data('unique_users.csv')

Unnamed: 0,ids,emails,tels
0,"[1, 2, 3, 4, 5, 6, 7, 8]","[12345@mail.ru, 12345@mail.ru, 123456@mail.ru,...","[123, 124, 124, 124, 124, 123435, 235123, 534534]"


In [4]:
# Load data2.csv into a second Merger; the constructor prints the parsed table.
merge2 = Merger('data2.csv')

    id_u               email      tel
0      1       12345@mail.ru      123
1      2       12345@mail.ru      124
2      3      123456@mail.ru      124
3      4     1234567@mail.ru      124
4      5    12345678@mail.ru      124
5      6      123456@mail.ru   123435
6      7      123456@mail.ru   235123
7      8      123456@mail.ru   534534
8      9      512414@mail.ru  4124124
9     10      512414@mail.ru      521
10    11  fasfknfkja@mail.ru  4124123


In [5]:
# Same pipeline as for merge1: group matching rows first, then collect the rest.
merge2.find_not_unique_users()
merge2.find_unique_users()
# NOTE(review): this writes to the same 'unique_users.csv' path used for
# merge1 above, so the earlier file is overwritten - confirm that is intended.
merge2.get_users_data('unique_users.csv')

Unnamed: 0,ids,emails,tels
0,"[1, 2, 3, 4, 5, 6, 7, 8]","[12345@mail.ru, 12345@mail.ru, 123456@mail.ru,...","[123, 124, 124, 124, 124, 123435, 235123, 534534]"
1,"[9, 10, 11]","[512414@mail.ru, 512414@mail.ru, fasfknfkja@ma...","[4124124, 521, 4124123]"
