# Refining and Cleaning the profile Data
### Transforming the randomized numerical variables in the bios dataset into categorical data


#### Importing necessary libraries

In [1]:
import pandas as pd
import _pickle as pickle
import numpy as np
from scipy.stats import halfnorm

In [2]:
# Read the previously pickled profiles object back into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
with open("profiles.pkl", "rb") as fp:
    df = pickle.load(fp)

In [3]:
# Keep only the free-text 'Bios' column and drop every other column.
# The explicit .copy() makes `df` an independent frame: new columns are
# assigned to it further down, and writing into a frame derived from another
# one is what triggers pandas' SettingWithCopyWarning.
df = df[['Bios']].copy()
df

Unnamed: 0,Bios
0,Passionate analyst. Incurable pop culture prac...
1,Organizer. Hardcore web guru. Certified coffee...
2,Hipster-friendly social media scholar. Profess...
3,Writer. Introvert. Beer aficionado. Music spec...
4,Creator. Unable to type with boxing gloves on....
...,...
14995,Coffee expert. Unapologetic twitter guru. Avid...
14996,Freelance bacon enthusiast. Infuriatingly humb...
14997,Hardcore twitter practitioner. Extreme web exp...
14998,Hardcore tv guru. Troublemaker. Typical food l...


# Creating different categories for the attributes
### Using a probability dictionary to assign the weights, with one entry for each corresponding category value

In [5]:
# Probability dictionary: each entry is a list of sampling weights aligned
# index-for-index with the category list of the same attribute.
p = {}

# Drink taste
# NOTE(review): 'Hot Choclate' is presumably a misspelling of 'Hot Chocolate';
# the string is kept as-is because it ends up in the exported data.
Drinks = [
    'Coffee',
    'Tea',
    'Water',
    'Wine',
    'Beer',
    'Vodka',
    'Whiskey',
    'Hot Choclate',
    'Brandy',
    'Tequila',
    'Iced Tea',
    'Soda',
]

p['Drinks'] = [
    0.18,  # Coffee
    0.11,  # Tea
    0.06,  # Water
    0.15,  # Wine
    0.05,  # Beer
    0.12,  # Vodka
    0.04,  # Whiskey
    0.01,  # Hot Choclate
    0.01,  # Brandy
    0.10,  # Tequila
    0.08,  # Iced Tea
    0.09,  # Soda
]

# Perfume genres
# NOTE(review): 'Choclate' is presumably a misspelling of 'Chocolate'; the
# string is kept as-is because it ends up in the exported data.
Perfume = [
    'Confident',
    'Unique',
    'Adventure',
    'Sensual',
    'Athletic',
    'Macho',
    'Bold',
    'Dreamy',
    'Exotic',
    'Choclate',
    'Woody',
    'Floral',
]

# Sampling weights, in the same order as the Perfume list.
p['Perfume'] = [
    0.10,  # Confident
    0.13,  # Unique
    0.12,  # Adventure
    0.12,  # Sensual
    0.09,  # Athletic
    0.08,  # Macho
    0.03,  # Bold
    0.04,  # Dreamy
    0.05,  # Exotic
    0.06,  # Choclate
    0.05,  # Woody
    0.13,  # Floral
]

# Ideal vacation choices
Vacation = [
    'Relaxing on the beach',
    'Explore new surroundings',
    'Hiking',
    'Trekking',
    'Skiing in the Mountains',
    'Water Sports',
    'Spiritual travel',
    'Visit Party Grounds',
    'Sit at home with a book',
    'Golf',
    'Explore the countryside',
    'Do nothing',
]

# Sampling weights, in the same order as the Vacation list.
p['Vacation'] = [
    0.10,  # Relaxing on the beach
    0.10,  # Explore new surroundings
    0.01,  # Hiking
    0.19,  # Trekking
    0.11,  # Skiing in the Mountains
    0.05,  # Water Sports
    0.10,  # Spiritual travel
    0.09,  # Visit Party Grounds
    0.07,  # Sit at home with a book
    0.06,  # Golf
    0.08,  # Explore the countryside
    0.04,  # Do nothing
]

# Sense of personal style
style = [
    'Daring',
    'Classic',
    'Flirty',
    'Utilitarian',
    'Charming',
    'Calm',
    'Idealist',
    'Rebellious',
    'Femme Fatale',
    'Stylish',
    'Sporty',
    'Intellectual',
]

# Sampling weights, in the same order as the style list.
# The key must be 'Personal Style': that is the name this list is registered
# under in `names`, and the sampling loop looks weights up with p[name].
# Under the former key 'style' the lookup raised KeyError, so these weights
# were never applied and styles were sampled uniformly.
p['Personal Style'] = [
    0.10,  # Daring
    0.13,  # Classic
    0.11,  # Flirty
    0.14,  # Utilitarian
    0.06,  # Charming
    0.04,  # Calm
    0.03,  # Idealist
    0.02,  # Rebellious
    0.02,  # Femme Fatale
    0.05,  # Stylish
    0.11,  # Sporty
    0.19,  # Intellectual
]

# Music
Music = [
    'EDM',
    'Techno',
    'Classical',
    'Jazz',
    'Blues',
    'Metal',
    'Post-Rock',
    'Indie',
    'Country',
    'Rock',
    'Mainstream',
    'Romantic',
]

# Sampling weights, in the same order as the Music list.
p['Music'] = [
    0.04,  # EDM
    0.06,  # Techno
    0.16,  # Classical
    0.13,  # Jazz
    0.11,  # Blues
    0.03,  # Metal
    0.08,  # Post-Rock
    0.02,  # Indie
    0.12,  # Country
    0.14,  # Rock
    0.06,  # Mainstream
    0.05,  # Romantic
]

# Hobbies
Hobbies = [
    'Reading',
    'Cooking',
    'Meditation',
    'Traveling',
    'Hiking',
    'Sports',
    'Music',
    'Partying',
    'Netflix and Chill',
    'Stamp Collection',
    'Wine Tasting',
    'Gaming',
]

# Sampling weights, in the same order as the Hobbies list.
p['Hobbies'] = [
    0.10,  # Reading
    0.13,  # Cooking
    0.11,  # Meditation
    0.14,  # Traveling
    0.06,  # Hiking
    0.04,  # Sports
    0.03,  # Music
    0.02,  # Partying
    0.02,  # Netflix and Chill
    0.05,  # Stamp Collection
    0.11,  # Wine Tasting
    0.19,  # Gaming
]

# Food preferences
Food = [
    'Sweet',
    'Spicy',
    'Bitter',
    'Intercontinental',
    'Barbecue',
    'Indian',
    'Oriental',
    'Comfort/Soul',
    'Middle Eastern',
    'Classic French',
    'Fusion',
    'New and Trendy',
]

# Sampling weights, in the same order as the Food list.
p['Food'] = [
    0.04,  # Sweet
    0.06,  # Spicy
    0.16,  # Bitter
    0.13,  # Intercontinental
    0.11,  # Barbecue
    0.03,  # Indian
    0.08,  # Oriental
    0.02,  # Comfort/Soul
    0.12,  # Middle Eastern
    0.14,  # Classic French
    0.06,  # Fusion
    0.05,  # New and Trendy
]

# Scent preferences
# NOTE(review): 'Aromatic' is listed twice (3rd and last entry), so its
# effective weight is 0.01 + 0.04. The last entry was presumably meant to be
# a different scent - confirm before changing, as it alters the generated data.
Scent = [
    'Citrus',
    'Floral',
    'Aromatic',
    'Woody',
    'Green',
    'Oriental',
    'Fougere',
    'Chypre',
    'Exotic',
    'Elegance',
    'Fruity',
    'Aromatic',
]

# Sampling weights, in the same order as the Scent list.
p['Scent'] = [
    0.10,  # Citrus
    0.10,  # Floral
    0.01,  # Aromatic (first occurrence)
    0.19,  # Woody
    0.11,  # Green
    0.05,  # Oriental
    0.10,  # Fougere
    0.09,  # Chypre
    0.07,  # Exotic
    0.06,  # Elegance
    0.08,  # Fruity
    0.04,  # Aromatic (second occurrence)
]

# What the users want their scent to make them feel
# NOTE(review): 'Athlectic' is presumably a misspelling of 'Athletic'; the
# string is kept as-is because it ends up in the exported data.
Feel = [
    'Energized',
    'Peaceful',
    'Sensual',
    'Youthful',
    'Courageous',
    'Elegant',
    'Macho',
    'Chic',
    'Sexy',
    'Glamorous',
    'Wild',
    'Athlectic',
]

# Sampling weights, in the same order as the Feel list.
p['Feel'] = [
    0.18,  # Energized
    0.11,  # Peaceful
    0.06,  # Sensual
    0.15,  # Youthful
    0.05,  # Courageous
    0.12,  # Elegant
    0.04,  # Macho
    0.01,  # Chic
    0.01,  # Sexy
    0.10,  # Glamorous
    0.08,  # Wild
    0.09,  # Athlectic
]



# Age: one random draw per row of df from a half-normal distribution that
# starts at 18 (loc=18, scale=8), cast to integers.
age = halfnorm.rvs(loc=18, scale=8, size=len(df)).astype(int)

# Column names for the new attributes ...
names = [
    'Drinks',
    'Perfume',
    'Vacation',
    'Personal Style',
    'Music',
    'Hobbies',
    'Food',
    'Scent',
    'Feel',
    'Age',
]

# ... and, in the same order, the values each column is built from.
categories = [
    Drinks,
    Perfume,
    Vacation,
    style,
    Music,
    Hobbies,
    Food,
    Scent,
    Feel,
    age,
]

# Column name -> category list (or, for 'Age', the array of drawn ages).
combined = {column: values for column, values in zip(names, categories)}

## Incorporating the categories into the random numerical values

In [6]:
# Add one column per attribute in `combined` to df.
for name, cats in combined.items():

    if name == 'Age':
        # The ages were already drawn above (one per row, half-normal
        # distribution), so they are assigned as-is.
        df[name] = cats
        continue

    # Use the weights registered for this attribute, if there are any.
    # p=None makes np.random.choice sample uniformly, which is the same
    # fallback as before, but errors in the weights (wrong length, not
    # summing to 1) now surface instead of being silently swallowed by a
    # bare `except`.
    weights = p.get(name)

    # Draw 3 values per row. Sampling is with replacement, so a row can
    # contain repeats.
    draws = np.random.choice(cats, size=(df.shape[0], 3), p=weights)

    # Collapse repeats: every row ends up with 1 to 3 unique values, in no
    # particular order.
    df[name] = pd.Series(
        [list(set(row.tolist())) for row in draws],
        index=df.index,
    )


In [7]:
# Display the DataFrame, now including the newly added attribute columns.
df

Unnamed: 0,Bios,Drinks,Perfume,Vacation,Personal Style,Music,Hobbies,Food,Scent,Feel,Age
0,Passionate analyst. Incurable pop culture prac...,"[Tequila, Coffee, Vodka]","[Adventure, Unique, Athletic]","[Water Sports, Trekking, Explore new surroundi...","[Daring, Rebellious, Femme Fatale]","[Jazz, Classical]","[Stamp Collection, Reading]","[Classic French, Fusion]","[Fruity, Chypre, Fougere]","[Elegant, Sensual, Wild]",27
1,Organizer. Hardcore web guru. Certified coffee...,"[Vodka, Coffee]","[Bold, Exotic, Athletic]","[Relaxing on the beach, Trekking, Spiritual tr...","[Sporty, Intellectual]","[Jazz, Metal]","[Wine Tasting, Traveling, Reading]","[Intercontinental, Indian, New and Trendy]","[Fougere, Exotic]","[Athlectic, Energized]",30
2,Hipster-friendly social media scholar. Profess...,"[Whiskey, Water, Coffee]","[Exotic, Athletic]","[Relaxing on the beach, Spiritual travel, Expl...","[Classic, Charming, Intellectual]","[Blues, Techno, Country]","[Wine Tasting, Traveling, Hiking]","[Classic French, Intercontinental, Middle East...","[Floral, Oriental, Citrus]","[Sensual, Energized]",23
3,Writer. Introvert. Beer aficionado. Music spec...,"[Wine, Vodka]","[Floral, Macho, Dreamy]","[Visit Party Grounds, Trekking, Spiritual travel]","[Classic, Flirty, Intellectual]","[Jazz, Techno, Classical]","[Meditation, Traveling, Hiking]","[Classic French, Intercontinental, Indian]","[Fruity, Chypre, Aromatic]","[Peaceful, Courageous, Energized]",21
4,Creator. Unable to type with boxing gloves on....,"[Water, Wine, Soda]","[Bold, Sensual, Adventure]","[Trekking, Explore new surroundings]","[Daring, Sporty, Stylish]","[Blues, Jazz, Classical]","[Wine Tasting, Meditation, Traveling]","[Classic French, Barbecue, Indian]","[Woody, Floral, Aromatic]","[Elegant, Courageous, Youthful]",21
...,...,...,...,...,...,...,...,...,...,...,...
14995,Coffee expert. Unapologetic twitter guru. Avid...,"[Wine, Soda]","[Unique, Choclate]","[Skiing in the Mountains, Spiritual travel, Ex...","[Idealist, Calm]","[Blues, Country, Rock]","[Sports, Meditation]","[Spicy, Comfort/Soul, Bitter]","[Green, Chypre]","[Energized, Glamorous]",30
14996,Freelance bacon enthusiast. Infuriatingly humb...,"[Tea, Coffee, Soda]","[Bold, Sensual, Floral]","[Trekking, Spiritual travel]","[Calm, Rebellious]","[EDM, Country, Rock]","[Reading, Netflix and Chill]","[Classic French, Bitter, Middle Eastern]","[Woody, Fougere, Elegance]","[Elegant, Energized]",28
14997,Hardcore twitter practitioner. Extreme web exp...,"[Tequila, Vodka]","[Bold, Sensual, Confident]","[Skiing in the Mountains, Trekking, Sit at hom...","[Daring, Calm, Idealist]","[EDM, Country, Mainstream]","[Traveling, Hiking, Reading]","[Indian, Barbecue, Bitter]","[Fruity, Oriental, Chypre]","[Elegant, Energized, Glamorous]",20
14998,Hardcore tv guru. Troublemaker. Typical food l...,"[Tea, Wine, Coffee]","[Woody, Sensual, Athletic]","[Visit Party Grounds, Relaxing on the beach, S...","[Utilitarian, Sporty, Flirty]","[EDM, Jazz, Post-Rock]","[Wine Tasting, Traveling, Cooking]","[Barbecue, Fusion]","[Fruity, Green, Chypre]","[Sexy, Courageous, Youthful]",19


# Exporting the refined dataset


In [8]:
# Persist the refined DataFrame as a pickle file.
with open("refined_profiles.pkl", "wb") as fp:
    pickle.dump(df, fp)