# <font color='navy'> Web Data Integration Project: Python code for data preprocessing and gold standards</font>

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Importing datasets
zomato_data = pd.read_csv("zomato.csv") 
yelp_data = pd.read_csv("yelp.csv")
yp_data = pd.read_csv("yellow_pages.csv")

# helper dataset
cities = pd.read_csv("uscities.csv")

# Preprocessing for Schema Mapping stage

### 1) Standardizing address for Zomato dataset

In [3]:
def tokenize(text): #tokenize
    result = [x.lower() for x in re.split(r'[,|;"]+', str(text))]
    return result

In [4]:
def extract_city(zomato_data):
    
    # create a list of cities that will serve as a dictionary
    cities_dict = cities["city"].to_numpy()
    cities_dict = [x.lower() for x in cities_dict]

    # tokenize address
    zomato_data['address_tok'] = zomato_data.apply(lambda x: tokenize(x['address']), axis=1)
    
    # check for city presence in address_tokens
    def if_city(tokens):
        last_element = tokens[-1].strip()
        if last_element in cities_dict:
            return last_element
    
    # extract city
    zomato_data['city'] = zomato_data.apply(lambda x: if_city(x['address_tok']), axis=1)
    
    # remove city from address
    def remove_city(data):
        for index, row in data.iterrows():
            tokens = row["address_tok"]
            if row["city"] is not None:
                del tokens[-1]
                
    remove_city(zomato_data)
    
    # join adress_tokens back to string and remove tokens from dataset
    zomato_data['address'] = zomato_data.apply(lambda x: ",".join(x['address_tok']), axis=1)
    del zomato_data["address_tok"]
    
    return zomato_data

In [5]:
zomato_data = extract_city(zomato_data)

### 2) Mapping missing cities in Zomato datasets using zip code

In [6]:
# get zip code and city dictionary

def getzipcitydict(citydata):
    # non_digit_zip_count counts the number of cities in cities dataset which does not have a numeric zipcode
    # for one row this is true
    non_digit_zip_count = 0
    dict_zip_city = {}
    for index1, row1 in citydata.iterrows(): 
        zips = str(row1['zips']).split(" ")
        for zip_each in zips:
            if zip_each.isdigit():
                dict_zip_city[int(zip_each)] = str(row1['city']).lower()
            else:
                non_digit_zip_count = non_digit_zip_count + 1
                #Attu Station has no zip code
                #print(row1['city'])           
    return dict_zip_city, non_digit_zip_count

In [7]:
dictzipcity, non_digit_zip_count = getzipcitydict(cities)
zomatocopy = zomato_data.copy()

In [8]:
def fillcitywithzipcodemapping(zipcode, city):
    try:
        #Fill the city data only if it is none. If not none it already contains a legal value and hence return the same 
        #legal value in the else case
        if city == None:
            zipcode =  int(zipcode)
            if zipcode in dictzipcity:
                return dictzipcity[zipcode]
        else:
            return city        
    except ValueError:
        return None 

In [9]:
zomatocopy['city'] = zomatocopy.apply(lambda x: fillcitywithzipcodemapping(x['zip'],x['city']), axis=1)
print("Rows in dataset with no city data after zip code mapping: " 
      + str(len(zomatocopy[zomatocopy.city.isnull()])))

Rows in dataset with no city data after zip code mapping: 0


### 3) Adding state attribute to Zomato dataset

In [10]:
def getzipstatedict(citydata):
    # non_digit_zip_count counts the number of cities in cities dataset which does not have a numeric zipcode
    # for one row this is true
    non_digit_zip_count = 0
    dictzipstate = {}
    for index1, row1 in citydata.iterrows(): 
        zips = str(row1['zips']).split(" ")
        for zip_each in zips:
            if zip_each.isdigit():
                dictzipstate[int(zip_each)] = str(row1['state_id']).lower()        
    return dictzipstate

dictzipstate = getzipstatedict(cities)

In [11]:
def fillzip(zipcode):
    try:
            zipcode =  int(zipcode)
            if zipcode in dictzipstate:
                return dictzipstate[zipcode]        
    except ValueError:
        return None 

In [12]:
zomatocopy['state'] = zomatocopy.apply(lambda x: fillzip(x['zip']), axis=1)

zomato_data = zomatocopy

### 4) Temporary standardization of attrubutes' names to faciliate further operations on datasets 

In [13]:
# Yellow Pages
yp_data = yp_data.rename(columns = {'streetAddress':'address'})
yp_data = yp_data.rename(columns = {'zipCode':'zip'})

# Yelp
yelp_data = yelp_data.rename(columns = {'telephone':'phone'})
yelp_data = yelp_data.rename(columns = {'streetAddress':'address'})
yelp_data = yelp_data.rename(columns = {'zipCode':'zip'})

# Zomato
zomato_data = zomato_data.rename(columns = {'ID':'id'})

### 5) Extracting cuisine from 'category' attribute for Yellow Pages

We decided to pay more attention on preprocessing the cuisines, because they carry important information for the end user.

Zomato and Yelp have clean cuisine attributes, while in Yellow Pages the cuisine is hidden in the attribute 'category'. We want to extract cuisine from these categories. We noticed that typically cuisine is described as a noun (e.g. French, American, Korean) followed by a word 'restaurants'. Thus as a first step we want to extract the categories that contain word 'restaurant' (around 50). This is still not satisfactionary, because we are missig names such as 'pizza' or 'hamburgers' which also indicate type of food served. Knowing that Zomato and Yelp have it nicely organized, we create a cuisine dictionary consisted of unique cusine names for both Zomato and Yelp. Then, we check the remaining of YP categories against this dictionary. To sum up, the final attribute 'cuisine' for YP is composed of 1) names with 'restaurants' string 2) names that occur in Zomato and Yelp cuisines.

In [14]:
# Joint dictionary for cuisine yelp & zomato
def vocab(data, column):
    vocab = []
    tokens = data.apply(lambda x: tokenize(x[column]), axis=1)
    for i, j in tokens.iteritems():
        for token in j:
            if token not in vocab:
                vocab.append(token)        
    return vocab

def joint_vocab(vocab_1, vocab_2):
    cuisine_dict = list(vocab_1)
    cuisine_dict.extend(x for x in vocab_2 if x not in cuisine_dict)
    return sorted(cuisine_dict)
    
vocab_yelp = vocab(yelp_data, "category")
vocab_zomato = vocab(zomato_data, "cuisine")
vocab_yp = vocab(yp_data, "categories")

vocab = joint_vocab(vocab_yelp, vocab_zomato)

In [15]:
# Keep categories for Yellow Page that contain word "restaurant"
def trim_yp(vocab_yp):
    restaurants = []
    for i in range(len(vocab_yp)):
        if 'restaurant' in vocab_yp[i]:
            restaurants.append(vocab_yp[i])

    vocab_yp_wo_r = [x for x in vocab_yp if x not in restaurants]        
    vocab_yp_rest = [x for x in vocab_yp if x in vocab]
    vocab_yp = restaurants + vocab_yp_rest  

    return vocab_yp

In [16]:
def filter_categories(text):
    tokens = tokenize(text.lower())
    tokens = [x for x in tokens if x in vocab_yp]
    
    # removing "restaurants", blank spaces at the end of token and empty tokens (empty tokens are string that contain 
                                                                                 # only word 'restaurant')
    tokens = [x.replace('restaurants', '').strip() for x in tokens]
    
    while '' in tokens:
           tokens.remove('')
    return tokens

yp_data['cuisine'] = yp_data.apply(lambda x: filter_categories(x['categories']), axis=1)
yp_data['cuisine'] = yp_data.apply(lambda x: ",".join(x['cuisine']), axis=1)
del yp_data["categories"]

# Preprocessing for Identity Resolution stage: Normalization

In [17]:
from nltk.corpus import stopwords 

# Special characters - punctuation
def remove_special(text):
    text = str(text)
    return re.sub(r"[^a-zA-Z0-9]+", ' ', text)

# Abbreviation rules
def abb(text):
    replacements =  {"ave": "avenue", "st": "street", "blvd": "boulevard", 
                     "3rd": "third", "rd": "road", "ln": "lane", 
                     "hwy": "highway", "expy": "expressway", "dr": "drive", 
                     "pkwy": "parkway", "plz": "plaza", "pl": "place", "w": "west",
                     "n": "north", "e": "east", "s": "south"}

    def replace(match):
        return replacements[match.group(0)]

    text = re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in replacements), replace, text) 
    
    return text

# Stop words and space stripping
stop = set(stopwords.words('english'))  

def remove_stop_strip(t):
    t = t.split()
    filtered_words = []
    for word in t:
        if word not in stop:
            filtered_words.append(word.strip())
    return ' '.join(filtered_words)

In [18]:
datasets = [zomato_data, yp_data, yelp_data]
attriutes_to_normalize = ["name", "address", "city", "state"]

for i in datasets:
    for j in attriutes_to_normalize:
        j = str(j)
        i[j] = i.apply(lambda x: str((x[j])).lower(), axis=1)
        i[j] = i.apply(lambda x: remove_special(x[j]), axis=1)
        i[j] = i.apply(lambda x: abb(x[j]), axis=1)
        i[j] = i.apply(lambda x: remove_stop_strip(x[j]), axis=1)
        
        ## abb and stop(?) --> address

In [19]:
# Previewing the data after preprocessing step

In [20]:
zomato_data.head(3)

Unnamed: 0,id,name,votes,rating,phone,address,zip,cuisine,reviewcount,city,state
0,0,strings ramen shop,15,3.1,(312) 374-3450,2141 south archer avenue,60616,"Asian,Chinese,Ramen",2,chicago,il
1,1,francesco south hole wall,179,4.0,(847) 272-0155,254 skokie boulevard,60062,Italian,6,northbrook,il
2,3,four belly,12,3.0,(773) 661-6182,3227 north clark street,60657,"Asian,BBQ,Japanese",0,chicago,il


In [21]:
zomato_data[zomato_data["id"] == 7314]

Unnamed: 0,id,name,votes,rating,phone,address,zip,cuisine,reviewcount,city,state
6586,7314,pepolino ristorante,216,3.8,(212) 966-9983,281 west broadway,10013,Italian,16,new york,ny


In [24]:
yp_data.head(3)

Unnamed: 0,id,name,address,city,state,zipCode,phone,website,priceRange,ratingValue,neighborhood,payment-method,years-in-business,extra-phones,aka,cuisine
0,1,full shilling,160 pearl street,new york,ny,10005,(212) 422-3855,http://www.thefullshilling.com,$$,4.0,Downtown Manhattan;Financial District,discover,16.0,Phone;(917) 962-0367;Phone;(212) 422-0036;Fax;...,The Full Shilling,"sandwich shops,take out,hamburgers & hot dogs,..."
1,2,dovetail,103 west 77th street,new york,ny,10024,(212) 362-3800,http://www.dovetailnyc.com;http://dovetailnyc....,$$$$,4.0,Upper Manhattan;Upper West Side,amex;master card;visa;diners club;discover;all...,8.0,,,"american,french,ice cream & frozen desserts,fi..."
2,3,patron mexican grill,608 9th avenue,new york,ny,10036,(212) 957-9050,http://www.patronnyc.com,$$,3.5,Hell's Kitchen;Midtown Manhattan,amex;visa;master card;all major credit cards,,Phone;(917) 791-5098;Fax;(212) 957-4047,,"mexican,latin american,bar & grills,take out"


In [20]:
yelp_data.head(3)

Unnamed: 0,id,name,address,city,state,zipCode,phone,website,priceRange,category,...,Accepts Credit Cards,Good For,Parking,Attire,Ambience,Alcohol,Outdoor Seating,Wi-Fi,Waiter Service,Caters
0,1,sunshine co,780 washington avenue,new york,ny,11238.0,(347) 750-5275,sunshinecobk.com,$$,American (New);Cocktail Bars,...,Yes,Brunch,Street,Casual,Hipster,Full Bar,Yes,Free,Yes,No
1,2,adella,410 w 43rd street,new york,ny,10036.0,(212) 273-0737,adellanyc.com,$$,Tapas Bars;American (New);Wine Bars,...,Yes,Dessert,Street,Casual,Trendy,Beer & Wine Only,Yes,No,Yes,No
2,3,rex,864 10th avenue,new york,ny,10019.0,(929) 900-5784,rexcoffeenyc.com,$,Coffee & Tea;Sandwiches,...,Yes,Breakfast,Street,Casual,Hipster,No,No,Free,No,Yes


In [25]:
zomato_data.to_csv('zomato_normalized.csv')
yp_data.to_csv('yp_normalized.csv')
yelp_data.to_csv('yelp_normalized.csv')

# Gold standards

In [28]:
# Jaacard similarity
def get_jaccard_sim(str1, str2):
    str1 = str(str1)
    str2 = str(str2)
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    
    if (len(a) + len(b) - len(c)) == 0:
        return 0
    else:
        return float(len(c)) / (len(a) + len(b) - len(c))

# Hard non-matches

In [219]:
def hard_no_matches(df1, df2):
    
    def same_name(text1, text2):
        text1 = str(text1)
        text2 = str(text2)
        return 0 if text1 != text2 else 1
    
    df1 = df1[["id", "name", "address", "city", "state", "zip", "phone"]]
    df1.columns = ["id_x", "name_x", "address_x", "city_x", "state_x", "zip_x", "phone_x"]
    df2 = df2[["id", "name", "address", "city", "state", "zip", "phone"]]
    df2.columns = ["id_y", "name_y", "address_y", "city_y", "state_y", "zip_y", "phone_y"]
    
    horizontal_stack = pd.concat([df1, df2], axis=1)
    
    horizontal_stack = horizontal_stack[(horizontal_stack["phone_x"] != horizontal_stack["phone_y"]) 
                                        & horizontal_stack["id_x"].notna() 
                                        & horizontal_stack["id_y"].notna()]
    
    horizontal_stack["id_x"] = horizontal_stack["id_x"].astype('int')
    horizontal_stack["id_y"] = horizontal_stack["id_y"].astype('int')
    horizontal_stack["row_type"] = horizontal_stack.apply(lambda x: "HN-M", axis=1)
    
    horizontal_stack = horizontal_stack[["id_x", 
                   "id_y", 
                   "name_x", 
                   "name_y", 
                   "address_x", 
                   "address_y", 
                   "city_x",
                   "city_y",
                   "state_x",
                   "state_y",
                   "zip_x",
                   "zip_y",
                   "row_type"]]
    return horizontal_stack

# Hard matches

In [220]:
def hard_matches(df1, df2):
    
    merge0 = pd.merge(df1,df2,on=["name", "address", "city", "state", "zip"])
    merge0["jaccard_tot"] = merge0.apply(lambda x: float(1), axis=1)
    
    merge0 = merge0[["id_x", 
                   "id_y", 
                   "name", 
                   "name", 
                   "address", 
                   "address", 
                   "city",
                   "city",
                   "state",
                   "state",
                   "zip",
                   "zip",
                   "jaccard_tot"]] 
    
    merge0.columns = ["id_x", 
                   "id_y", 
                   "name_x", 
                   "name_y", 
                   "address_x", 
                   "address_y", 
                   "city_x",
                   "city_y",
                   "state_x",
                   "state_y",
                   "zip_x",
                   "zip_y",
                   "jaccard_tot"]
    
    merge1 = pd.merge(df1,df2,on=["phone"])
    
    merge1["jaccard_name"] = merge1.apply(lambda x: round(get_jaccard_sim(x["name_x"], x["name_y"]),1), axis=1)                                         
    merge1["jaccard_address"] = merge1.apply(lambda x: round(get_jaccard_sim(x["address_x"], x["address_y"]),1), axis=1)
    merge1["jaccard_city"] = merge1.apply(lambda x: round(get_jaccard_sim(x["city_x"], x["city_y"]),1), axis=1)
    merge1["jaccard_state"] = merge1.apply(lambda x: round(get_jaccard_sim(x["state_x"], x["state_y"]),1), axis=1)
    merge1["jaccard_zip"] = merge1.apply(lambda x: round(get_jaccard_sim(x["zip_x"], x["zip_y"]),1), axis=1)
    
    merge1["jaccard_tot"] = merge1.apply(lambda x: round(float(x["jaccard_name"]) + float(x["jaccard_address"])
                                                       + float(x["jaccard_city"]) + float(x["jaccard_state"])
                                                       + float(x["jaccard_zip"]), 2)/4, axis=1)
    
    
    merge_1 = merge1[(merge1["jaccard_tot"] >= 0.8) & (merge1["jaccard_tot"] < 1)]
    
    merge_1 = merge1[["id_x", 
                   "id_y", 
                   "name_x", 
                   "name_y", 
                   "address_x", 
                   "address_y", 
                   "city_x",
                   "city_y",
                   "state_x",
                   "state_y",
                   "zip_x",
                   "zip_y",
                   "jaccard_tot"]] 
    
    merge = pd.concat([merge0, merge_1])
    merge["row_type"] = merge.apply(lambda x: "HM", axis=1)
    
    merge_ = merge[["id_x", 
                   "id_y", 
                   "name_x", 
                   "name_y", 
                   "address_x", 
                   "address_y", 
                   "city_x",
                   "city_y",
                   "state_x",
                   "state_y",
                   "zip_x",
                   "zip_y",
                   "row_type"]] 
    
    return merge_

# Corner cases

In [221]:
# Get records not matched on attribute
def false_negatives(df1, df2):
    
    merge = pd.merge(df1,df2,on='phone') 
    
    # check if attribute name is the same
    def same_name(text1, text2):
        text1 = str(text1)
        text2 = str(text2)
        return 0 if text1 != text2 else 1
    
    merge["if_same_name"] = merge.apply(lambda x: same_name(x["name_x"], x["name_y"]), axis=1)
    merge["if_same_address"] = merge.apply(lambda x: same_name(x["address_x"], x["address_y"]), axis=1)
    merge_ = merge[(merge["if_same_address"] == 0) | (merge["if_same_address"] == 0)]
    
    # compute Jaccard distance
    merge["jaccard_name"] = merge.apply(lambda x: round(get_jaccard_sim(x["name_x"], x["name_y"]),1), axis=1)                                         
    merge["jaccard_address"] = merge.apply(lambda x: round(get_jaccard_sim(x["address_x"], x["address_y"]),1), axis=1)
    merge["jaccard_city"] = merge.apply(lambda x: round(get_jaccard_sim(x["city_x"], x["city_y"]),1), axis=1)
    merge["jaccard_state"] = merge.apply(lambda x: round(get_jaccard_sim(x["state_x"], x["state_y"]),1), axis=1)
    merge["jaccard_zip"] = merge.apply(lambda x: round(get_jaccard_sim(x["zip_x"], x["zip_y"]),1), axis=1)
    
    merge["jaccard_tot"] = merge.apply(lambda x: round(float(x["jaccard_name"]) + float(x["jaccard_address"])
                                                       + float(x["jaccard_city"]) + float(x["jaccard_state"])
                                                       + float(x["jaccard_zip"]), 2)/4, axis=1)
    
    merge["row_type"] = merge.apply(lambda x: "CC-FN", axis=1)
    
    merge_ = merge[(merge["jaccard_tot"] <= 0.8) & (merge["jaccard_tot"] >= 0.6)]
    
    merge_ = merge_[["id_x", 
                   "id_y", 
                   "name_x", 
                   "name_y", 
                   "address_x", 
                   "address_y", 
                   "city_x",
                   "city_y",
                   "state_x",
                   "state_y",
                   "zip_x",
                   "zip_y",
                   "row_type"]]
    
    return merge_

In [222]:
def false_positives_1(df1, df2):
    
    merge = pd.merge(df1,df2,on="name")
    
    def same_name(text1, text2):
        return 0 if text1 != text2 else 1
    
    # select restaurants which addresses are not EXACNTLY the same     
    merge["if_same_address"] = merge.apply(lambda x: same_name(x["address_x"], 
                                                                              x["address_y"]), axis=1)
    
    # and the are not same restaurants
    merge["if_same_phone"] = merge.apply(lambda x: same_name(x["phone_x"], 
                                                                              x["phone_y"]), axis=1) 
    
    # compute Jaccard distance for address and city
    merge["jaccard_address"] = merge.apply(lambda x: round(get_jaccard_sim(x["address_x"], 
                                                                              x["address_y"]),1), axis=1)
    
    merge["jaccard_city"] = merge.apply(lambda x: round(get_jaccard_sim(x["city_x"], 
                                                                              x["city_y"]),1), axis=1)
    
    merge["jaccard_state"] = merge.apply(lambda x: round(get_jaccard_sim(x["state_x"], 
                                                                              x["state_y"]),1), axis=1)
    merge["jaccard_zip"] = merge.apply(lambda x: round(get_jaccard_sim(x["zip_x"], 
                                                                              x["zip_y"]),1), axis=1)
    
    merge["jaccard_tot"] = merge.apply(lambda x: (float(x["jaccard_address"]) 
                                                  + float(x["jaccard_city"]) 
                                                  + float(x["jaccard_state"])
                                                  + float(x["jaccard_zip"]))/4
                                                                              , axis=1)
    
    merge_ = merge[(merge["if_same_address"] == 0) & (merge["if_same_phone"] == 0)]
    
    merge_["row_type"] = merge_.apply(lambda x: "CC-FP-1", axis=1)
    
    merge_ = merge_[(merge_["jaccard_tot"] <= 0.7) & (merge_["jaccard_tot"] >= 0.4)]
    merge_.sort_values(by=["jaccard_tot"], inplace=True,ascending=False)

    merge_ = merge_[["id_x",
                     "id_y",
                     "name",
                     "name",
                     "address_x", 
                     "address_y", 
                     "city_x",
                     "city_y",
                     "state_x",
                     "state_y",
                     "zip_x",
                     "zip_y",
                     "row_type"]]
      
    merge_.columns = ["id_x",
                     "id_y",
                     "name_x",
                     "name_y",
                     "address_x", 
                     "address_y", 
                     "city_x",
                     "city_y",
                     "state_x",
                     "state_y",
                     "zip_x",
                     "zip_y",
                     "row_type"]
    
    return merge_ 

In [223]:
def false_positives_2(df1, df2):
    
    merge = pd.merge(df1, df2 ,left_on=["zip", "city", "state"], right_on=["zip", "city", "state"])
    
    def same_name(text1, text2):
        return 0 if text1 != text2 else 1
    
    # and the are not same restaurants
    merge["if_same_phone"] = merge.apply(lambda x: same_name(x["phone_x"], 
                                                                              x["phone_y"]), axis=1) 

    merge["jaccard_name"] = merge.apply(lambda x: round(get_jaccard_sim(x["name_x"], 
                                                                              x["name_y"]),1), axis=1)
    merge["jaccard_address"] = merge.apply(lambda x: round(get_jaccard_sim(x["address_x"], 
                                                                              x["address_y"]),1), axis=1)
    
    merge["jaccard_tot"] = merge.apply(lambda x: (float(x["jaccard_address"]) 
                                                  + float(x["jaccard_name"]))/2, axis=1)
    
    merge_ = merge[(merge["if_same_phone"] == 0)]
    
    merge_["row_type"] = merge_.apply(lambda x: "CC-FP-2", axis=1)
    
    merge_ = merge_[(merge_["jaccard_address"] < 0.7) & (merge_["jaccard_tot"] >= 0.4) & (merge_["jaccard_name"] < 0.5)]

    merge_ =  merge_[[
                  "id_x",
                  "id_y",       
                  "name_x", 
                  "name_y", 
                  "address_x", 
                  "address_y", 
                  "city",
                  "city",
                  "state",
                  "state",
                  "zip",
                  "zip",
                  "row_type"]]     
    
    merge_.columns = [
                  "id_x",
                  "id_y",       
                  "name_x", 
                  "name_y", 
                  "address_x", 
                  "address_y", 
                  "city_x",
                  "city_y",
                  "state_x",
                  "state_y",
                  "zip_x",
                  "zip_y",
                  "row_type"]
    
    return merge_

In [226]:
datasets = [zomato_data, yp_data, yelp_data]
datasets_names = ["zomato", "yellow_pages", "yelp"]

ds = [(datasets[i],datasets[j]) for i in range(len(datasets)) for j in range(i+1, len(datasets))]
ds_names = [(datasets_names[i],datasets_names[j]) for i in range(len(datasets_names)) for j in range(i+1, len(datasets_names))]

def sample_and_concat(ds, ds_names):
    
    for i in range(len(ds)):
        
        pair = ds[i]
        
        # sample hard matches
        hm = hard_matches(pair[0], pair[1]).sample(100)

        # sample hard not-matches
        hnm = hard_no_matches(pair[0], pair[1]).sample(250)

        # sample from CC False Negatives         
        cc0 = false_negatives(pair[0], pair[1])
        if len(cc0) >= 80:
            cc0 = cc0.sample(80)
        else: cc0

        # sample from CC False Positives 1
        cc1 = false_positives_1(pair[0], pair[1]).sample(40)

        # sample from CC False Positives 2
        cc2 = false_positives_2(pair[0], pair[1]).sample(40)

        #concatinate
        cc = pd.concat([hm, hnm, cc0, cc1, cc2], ignore_index=True)
        
        cc["id_x"] = cc["id_x"].astype(str)
        cc["id_y"] = cc["id_y"].astype(str)
        
        cc.to_csv('gs_' + str(ds_names[i][0]) + "_" + str(ds_names[i][1]) + ".csv")
        print('File ' + str(i) + ' saved.')

In [227]:
test = sample_and_concat(ds, ds_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


File 0 saved.
File 1 saved.
File 2 saved.
