In [8]:
import numpy as np
import pandas as pd

def read_search_strings(file_path='search_strings.csv'):
    '''
    Load the search-strings csv into a DataFrame.

    The file is parsed as comma-separated, latin1-encoded text whose first
    row supplies the column names.

    :param file_path: location of the csv file to read
    :return: pandas DataFrame holding the parsed csv
    '''
    parse_options = dict(header=0, sep=',', encoding='latin1')
    return pd.read_csv(file_path, **parse_options)

# Load the search strings from the default 'search_strings.csv' path into the notebook-wide frame.
df = read_search_strings()

In [12]:
def cleanup_categoryid(df):
    '''
    Assigns new category ids, numbered from 1 in order of first appearance.
    ** This function modifies df **: its 'categoryId' column is (over)written.

    Category names are read from the 'category' column when df has one;
    otherwise the fourth column is used, which is the position the earlier
    implementation read from.

    :param df: pandas DataFrame with one row per item
    :return: dictionary[category name] = categoryId
    '''
    if 'category' in df.columns:
        categories = df['category']
    else:
        # Positional fallback for frames that do not name the column.
        categories = df.iloc[:, 3]

    category_dict = dict()
    category_ids = []
    for category in categories:
        if category not in category_dict:
            category_dict[category] = len(category_dict) + 1
        # Always look the id up: reusing the running counter would mislabel a
        # category that shows up again after a newer one has been registered.
        category_ids.append(category_dict[category])

    # Single positional assignment instead of one df.at write per row.
    df['categoryId'] = category_ids
    return category_dict

# cleanup_categoryid rewrites df['categoryId'] in place; print the mapping it returns.
print(cleanup_categoryid(df))
# Preview the first rows only: the former bare `df[100:200]` expression was
# never rendered (only a cell's last expression is displayed), and echoing
# the whole frame is noisy.
df.head(10)

{'Adhesives Sealants and Tapes': 1, 'Automation Motors and Drives': 2, 'Automotive Tools and Supplies': 3, 'Baby': 4, 'Baby and Toddler Clothing': 5, 'Cars and Trucks': 6, 'Cell Phone Accessories': 7, 'Cell Phone and Smartphones': 8, 'Cleaning and Janitorial Supplies': 9, 'Costumes Reenactment Theater': 10, 'Crafts': 11, 'DVDs and Movies': 12, 'Desktops and All-In-Ones': 13, 'Digital Cameras': 14, 'Dolls and Bears': 15, 'E-Cigarettes Vapes and Accessories': 16, 'Electrical Equipment and Supplies': 17, 'Fasteners and Hardware': 18, 'Food and Beverages': 19, 'Furniture': 20, 'Heavy Equipment Parts and Attachments': 21, 'Holiday and Season Decor': 22, 'Home Telephone and Accessories': 23, 'Household Supplies and Cleaning': 24, 'Jewellery and Watches': 25, 'Kids Clothing Shoes and Accesories': 26, 'Laptop and Desktop Accessories': 27, 'Laptops and Netbooks': 28, 'Lenses and Filters': 29, 'Light Equipment and Tools': 30, 'Luggage': 31, 'Luggage Accessories': 32, 'Major Appliances': 33, 'Mak

Unnamed: 0.1,Unnamed: 0,item_title,watch_count,category,categoryId
0,0,Auto Window Sun Shade Sock Cover Baby Child UV...,624,Adhesives Sealants and Tapes,1
1,3,EPOXY RESIN 16 oz Kit CRYSTAL CLEAR for Super ...,1722,Adhesives Sealants and Tapes,1
2,5,Crystal Clear Epoxy Resin General Purpose Bar ...,1016,Adhesives Sealants and Tapes,1
3,6,Crystal Clear Epoxy Resin General Purpose Bar ...,704,Adhesives Sealants and Tapes,1
4,7,* Plastic Casting Resin for Casting in Silicon...,665,Adhesives Sealants and Tapes,1
5,9,Flex Seal Clear Jumbo Can Liquid Rubber Spray ...,533,Adhesives Sealants and Tapes,1
6,10,"ULTRA CLEAR EPOXY RESIN - Fiberglass, Table T...",504,Adhesives Sealants and Tapes,1
7,11,Flex Seal Black Jumbo Can Liquid Rubber Spray ...,504,Adhesives Sealants and Tapes,1
8,13,Crystal Clear Bar Table Top Epoxy Resin Coatin...,429,Adhesives Sealants and Tapes,1
9,14,Ultra-Low Viscosity Liquid Plastic Urethane Ca...,416,Adhesives Sealants and Tapes,1


In [40]:
from sklearn.model_selection import train_test_split

def data_split(df, train=0.65, valid=0.15, test=0.20, random_state=None):
    """
    split data into training, validation, and test sets

    The split is done separately inside every 'categoryId' group so each
    category contributes to all three sets. 'item_title' is the feature
    column and 'categoryId' is the label column.

    :param df: the data set; must contain 'item_title' and 'categoryId' columns
    :param train: percentage of training data
    :param valid: percentage of validation data
    :param test: percentage of test data (passed to train_test_split as test_size)
    :param random_state: optional seed forwarded to both train_test_split calls
        so the split can be reproduced; None keeps the unseeded behaviour
    :return: X_train, X_valid, X_test, Y_train, Y_valid, Y_test -- each a
        DataFrame with a single column labelled 0 that keeps the row index
        of df (an empty DataFrame when df has no categories)
    """
    names = ('X_train', 'X_valid', 'X_test', 'Y_train', 'Y_valid', 'Y_test')
    parts = {name: [] for name in names}

    # Share of the non-test remainder that goes to training.
    train_share = train / (train + valid)

    # Iterate over the ids actually present (ascending) instead of assuming
    # they are exactly 1..n, and slice each category only once.
    for _, category_rows in df.groupby('categoryId', sort=True):
        titles = category_rows['item_title']
        labels = category_rows['categoryId']

        # NOTE(review): train_test_split is expected to raise for categories
        # with too few rows to fill every subset -- confirm this is desired.
        x_rest, x_test, y_rest, y_test = train_test_split(
            titles, labels, test_size=test, random_state=random_state)
        x_train, x_valid, y_train, y_valid = train_test_split(
            x_rest, y_rest, train_size=train_share, random_state=random_state)

        parts['X_train'].append(x_train)
        parts['X_valid'].append(x_valid)
        parts['X_test'].append(x_test)
        parts['Y_train'].append(y_train)
        parts['Y_valid'].append(y_valid)
        parts['Y_test'].append(y_test)

    # Concatenate once per output rather than growing six frames inside the
    # loop; the column label 0 matches what callers received previously.
    return tuple(
        pd.concat(parts[name], axis=0).to_frame(name=0) if parts[name] else pd.DataFrame()
        for name in names
    )
# Run the split with the default 65/15/20 fractions; the returned tuple is echoed as the cell output.
data_split(df)



(                                                       0
 154          1 LB Pound Mass Loaded Vinyl 4' x 25' Rolls
 92     1 x MILLIPUT STANDARD EPOXY PUTTY 2 PART SELF ...
 97     10x Roll Teflon Plumbing Fitting Thread Seal T...
 130    12 TUBES Chemlink M1 BLACK Structural Sealant ...
 56     20 PCS BLACK ELECTRICAL 3M TEMFLEX VINYL TAPE ...
 121    IPS Weld-On #4SC Plastic Solvent Glue Cement f...
 37     Norland 61 Optical Adhesive Glue - UV Cure - T...
 4      * Plastic Casting Resin for Casting in Silicon...
 123    2 Rolls 26 FT x 1.88" Aluminum Foil Heat Shiel...
 50     LOCTITE 243 Medium Strength Threadlocker 50ml ...
 61     3M  1" W x 5' Dual Lock SJ3550 Type 250 VHB Bl...
 67           Spray Adhesive,Foam and Fabric,20 oz. 3M 24
 29               WIRE GLUE Electrically Conductive Glue 
 35     Crystal Clear Epoxy Resin General Purpose Bar ...
 55     Silver Conductive 0.2ML Glue Wire Electrically...
 2      Crystal Clear Epoxy Resin General Purpose Bar ...
 90     NEW HO