### Import Libraries

In [25]:
import pandas as pd
import _pickle as pickle
import numpy as np
from scipy.stats import halfnorm

In [26]:
with open("ProfileData_pickleFiles/profile_data.pkl",'rb') as fp:
    df = pickle.load(fp)

In [27]:
df

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Travelling,Foodie,Books,Politics,Finance,Coding
0,Typical communicator. Subtly charming web advo...,0,6,4,2,2,2,5,2,3,5,5
1,Introvert. Friendly beer guru. Bacon fanatic. ...,0,3,5,5,7,3,7,5,4,2,0
2,Friend of animals everywhere. Passionate zombi...,2,5,6,4,3,0,3,1,8,7,8
3,Webaholic. Pop culture ninja. Wannabe organize...,4,2,9,6,9,5,0,5,1,5,2
4,Zombie buff. Troublemaker. Social media ninja....,3,0,9,6,7,1,0,2,3,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...
8290,Wannabe analyst. General food enthusiast. Musi...,1,2,1,1,1,5,6,2,7,2,9
8291,Friendly pop cultureaholic. Analyst. Gamer. Am...,1,4,8,2,8,5,5,3,1,5,1
8292,Alcohol nerd. Award-winning music fan. Profess...,2,6,3,0,0,2,7,1,1,9,0
8293,Organizer. Introvert. Twitter junkie. Certifie...,1,8,2,2,9,6,1,8,7,0,8


In [28]:
# Using only Bios
# Removing the numerical data
df = df[['Bios']]

In [29]:
df

Unnamed: 0,Bios
0,Typical communicator. Subtly charming web advo...
1,Introvert. Friendly beer guru. Bacon fanatic. ...
2,Friend of animals everywhere. Passionate zombi...
3,Webaholic. Pop culture ninja. Wannabe organize...
4,Zombie buff. Troublemaker. Social media ninja....
...,...
8290,Wannabe analyst. General food enthusiast. Musi...
8291,Friendly pop cultureaholic. Analyst. Gamer. Am...
8292,Alcohol nerd. Award-winning music fan. Profess...
8293,Organizer. Introvert. Twitter junkie. Certifie...


### Creating Lists for the Categories
Here, we are specifying the categories into subcategories and allocating the probability of being an interest to each subcategory of interest

In [33]:
#Probability distribution
p = {}

# TV Genres
tv = ['Comedy',
      'Drama',
      'Action/Adventure',
      'Suspense/Thriller',
      'Documentaries',
      'Crime/Mystery',
      'News',
      'SciFi',
      'History']

p['TV'] = [0.25,
           0.21,
           0.17,
           0.16,
           0.09,
           0.08,
           0.03,
           0.02,
           0.01]

# Movie Genres
movies = ['Adventure',
          'Action',
          'Drama',
          'Comedy',
          'Thriller',
          'Horror',
          'RomCom',
          'Musical',
          'Documentary']

p['Movies'] = [0.26,
               0.21,
               0.16,
               0.14,
               0.09,
               0.06,
               0.04,
               0.01, 
               0.03]

# Religions (could potentially create a spectrum)
religion = ['Catholic',
            'Christian',
            'Jewish',
            'Muslim',
            'Hindu',
            'Buddhist',
            'Spiritual',
            'Other',
            'Agnostic',
            'Atheist']

p['Religion'] = [0.07,
                 0.13,
                 0.01,
                 0.19,
                 0.24,
                 0.05,
                 0.10,
                 0.09,
                 0.07,
                 0.05]

# Music
music = ['Rock',
         'HipHop',
         'Romantic',
         'Pop',
         'Country',
         'EDM',
         'Jazz',
         'Classical',
         ]

p['Music'] = [0.25,
              0.19,
              0.16,
              0.14,
              0.10,
              0.06,
              0.04,
              0.03,
              0.02,
              0.01,]

# Sports
sports = [
          'Cricket',
          'Chess',
          'Badminton'
          'Football',
          'Baseball',
          'Basketball',
          'Hockey',
          'Soccer',
          'Other']

p['Sports'] = [0.29,
               0.24,
               0.23, 
               0.13,
               0.04,
               0.03,
               0.02,
               0.02]

# Politics (could also put on a spectrum)
politics = ['Liberal',
            'Progressive',
            'Centrist',
            'Moderate',
            'Conservative']

p['Politics'] = [0.26,
                 0.11,
                 0.11,
                 0.15,
                 0.37]

# Social Media
social = ['Facebook',
          'Youtube',
          'Twitter',
          'Reddit',
          'Instagram',
          'Pinterest',
          'LinkedIn',
          'SnapChat',
          'TikTok']

p['Social Media'] = [0.36,
                     0.27,
                     0.11,
                     0.09,
                     0.05,
                     0.03,
                     0.03,
                     0.03,
                     0.03]

# Programming
programming = [
          'Python',
          'Java',
          'JavaScript',
          'C++',
          'C#',
          'Swift',
          'Go'
         ]

p['programming'] = [
                    0.23,
                    0.20,
                    0.18,
                    0.15,
                    0.12,
                    0.10,
                    0.02
                   ]

#travelling
travelling = [
            'Treking',
            'Adventure',
            'Long Trips',
            'Short journeys'
             ]

p['travelling'] = [
                 0.35,
                 0.33,
                 0.21,
                 0.11
                  ]


# Age (generating random numbers based on half normal distribution)
age = halfnorm.rvs(loc=18,scale=8, size=df.shape[0]).astype(int)

# Lists of Names and the list of the lists
categories = [movies, religion, music, politics, social, sports,programming, travelling, age]

names = ['Movies','Religion', 'Music', 'Politics', 'Social Media', 'Sports', 'Programming', 'Traveller', 'Age']

combined = dict(zip(names, categories))

In [34]:
combined

{'Movies': ['Adventure',
  'Action',
  'Drama',
  'Comedy',
  'Thriller',
  'Horror',
  'RomCom',
  'Musical',
  'Documentary'],
 'Religion': ['Catholic',
  'Christian',
  'Jewish',
  'Muslim',
  'Hindu',
  'Buddhist',
  'Spiritual',
  'Other',
  'Agnostic',
  'Atheist'],
 'Music': ['Rock',
  'HipHop',
  'Romantic',
  'Pop',
  'Country',
  'EDM',
  'Jazz',
  'Classical'],
 'Politics': ['Liberal',
  'Progressive',
  'Centrist',
  'Moderate',
  'Conservative'],
 'Social Media': ['Facebook',
  'Youtube',
  'Twitter',
  'Reddit',
  'Instagram',
  'Pinterest',
  'LinkedIn',
  'SnapChat',
  'TikTok'],
 'Sports': ['Cricket',
  'Chess',
  'BadmintonFootball',
  'Baseball',
  'Basketball',
  'Hockey',
  'Soccer',
  'Other'],
 'Programming': ['Python', 'Java', 'JavaScript', 'C++', 'C#', 'Swift', 'Go'],
 'Traveller': ['Treking', 'Adventure', 'Long Trips', 'Short journeys'],
 'Age': array([19, 19, 23, ..., 34, 24, 38])}

### Assigning random values to each subcategory of a category
Looping through the combined df, we will assign randomly generated value from each subcategory to the repsective category

In [35]:
# Looping through and assigning random values
for name, ops in combined.items():
    if name in ['Religion', 'Politics']:
        # Picking only 1 from the list
        df[name] = np.random.choice(ops, df.shape[0], p=p[name])
        
    elif name == 'Age':
        # Generating random ages based on a normal distribution
        df[name] = ops
    else:
        # Picking 3 from the list 
        try:
            df[name] = list(np.random.choice(ops, size=(df.shape[0],1,3), p=p[name]))
        except:
            df[name] = list(np.random.choice(ops, size=(df.shape[0],1,3)))
        
        df[name] = df[name].apply(lambda x: list(set(x[0].tolist())))
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = list(np.random.choice(ops, size=(df.shape[0],1,3), p=p[name]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = df[name].apply(lambda x: list(set(x[0].tolist())))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = np.random.choice(ops, df.shape[0], p=p[name])
A value is t

In [36]:
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Programming,Traveller,Age
0,Typical communicator. Subtly charming web advo...,"[Adventure, Action, Thriller]",Catholic,"[Rock, Romantic, Jazz]",Liberal,"[Reddit, Facebook, Instagram]","[Chess, Cricket]",[JavaScript],"[Adventure, Treking, Short journeys]",19
1,Introvert. Friendly beer guru. Bacon fanatic. ...,"[Drama, Action]",Agnostic,"[Country, Jazz]",Moderate,[Facebook],"[Chess, Cricket]","[C++, Python]","[Adventure, Treking, Long Trips]",19
2,Friend of animals everywhere. Passionate zombi...,"[Adventure, Comedy]",Muslim,"[Classical, Country, Pop]",Liberal,"[Youtube, Twitter]","[Cricket, Baseball, BadmintonFootball]","[C++, C#]","[Adventure, Treking]",23
3,Webaholic. Pop culture ninja. Wannabe organize...,"[Horror, Action]",Agnostic,"[Classical, Pop]",Conservative,"[Reddit, Facebook, Pinterest]","[Cricket, BadmintonFootball]","[Swift, JavaScript, Python]","[Treking, Long Trips, Short journeys]",28
4,Zombie buff. Troublemaker. Social media ninja....,"[Action, Comedy]",Catholic,"[EDM, Country, Jazz]",Moderate,"[Facebook, LinkedIn]","[Cricket, Basketball, BadmintonFootball]","[Swift, Go]","[Treking, Long Trips, Short journeys]",19
...,...,...,...,...,...,...,...,...,...,...
8290,Wannabe analyst. General food enthusiast. Musi...,"[Adventure, Action]",Agnostic,"[Classical, Pop, Jazz]",Moderate,"[Youtube, SnapChat, LinkedIn]","[Chess, Baseball, BadmintonFootball]","[C++, Go, Python]","[Adventure, Short journeys]",23
8291,Friendly pop cultureaholic. Analyst. Gamer. Am...,"[Adventure, Drama]",Buddhist,"[Romantic, HipHop, Pop]",Conservative,[Facebook],"[Cricket, Chess, Baseball]","[Swift, C++, Python]",[Short journeys],23
8292,Alcohol nerd. Award-winning music fan. Profess...,"[Adventure, Drama]",Hindu,"[HipHop, Classical, Country]",Moderate,"[Youtube, Facebook]","[Chess, Basketball, Cricket]","[Java, JavaScript]","[Adventure, Long Trips, Short journeys]",34
8293,Organizer. Introvert. Twitter junkie. Certifie...,"[Drama, Action]",Hindu,"[Rock, Jazz, EDM]",Liberal,"[Youtube, Facebook]","[Chess, Baseball, BadmintonFootball]","[Java, JavaScript, Python]","[Treking, Short journeys]",24


### Categorizing the religion and poilitics

In [37]:
df['Religion'] = pd.Categorical(df.Religion, ordered=True,
                                categories=['Catholic',
                                            'Christian',
                                            'Jewish',
                                            'Muslim',
                                            'Hindu',
                                            'Buddhist',
                                            'Spiritual',
                                            'Other',
                                            'Agnostic',
                                            'Atheist'])

df['Politics'] = pd.Categorical(df.Politics, ordered=True,
                                categories=['Liberal',
                                            'Progressive',
                                            'Centrist',
                                            'Moderate',
                                            'Conservative'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Religion'] = pd.Categorical(df.Religion, ordered=True,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Politics'] = pd.Categorical(df.Politics, ordered=True,


In [38]:
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Programming,Traveller,Age
0,Typical communicator. Subtly charming web advo...,"[Adventure, Action, Thriller]",Catholic,"[Rock, Romantic, Jazz]",Liberal,"[Reddit, Facebook, Instagram]","[Chess, Cricket]",[JavaScript],"[Adventure, Treking, Short journeys]",19
1,Introvert. Friendly beer guru. Bacon fanatic. ...,"[Drama, Action]",Agnostic,"[Country, Jazz]",Moderate,[Facebook],"[Chess, Cricket]","[C++, Python]","[Adventure, Treking, Long Trips]",19
2,Friend of animals everywhere. Passionate zombi...,"[Adventure, Comedy]",Muslim,"[Classical, Country, Pop]",Liberal,"[Youtube, Twitter]","[Cricket, Baseball, BadmintonFootball]","[C++, C#]","[Adventure, Treking]",23
3,Webaholic. Pop culture ninja. Wannabe organize...,"[Horror, Action]",Agnostic,"[Classical, Pop]",Conservative,"[Reddit, Facebook, Pinterest]","[Cricket, BadmintonFootball]","[Swift, JavaScript, Python]","[Treking, Long Trips, Short journeys]",28
4,Zombie buff. Troublemaker. Social media ninja....,"[Action, Comedy]",Catholic,"[EDM, Country, Jazz]",Moderate,"[Facebook, LinkedIn]","[Cricket, Basketball, BadmintonFootball]","[Swift, Go]","[Treking, Long Trips, Short journeys]",19
...,...,...,...,...,...,...,...,...,...,...
8290,Wannabe analyst. General food enthusiast. Musi...,"[Adventure, Action]",Agnostic,"[Classical, Pop, Jazz]",Moderate,"[Youtube, SnapChat, LinkedIn]","[Chess, Baseball, BadmintonFootball]","[C++, Go, Python]","[Adventure, Short journeys]",23
8291,Friendly pop cultureaholic. Analyst. Gamer. Am...,"[Adventure, Drama]",Buddhist,"[Romantic, HipHop, Pop]",Conservative,[Facebook],"[Cricket, Chess, Baseball]","[Swift, C++, Python]",[Short journeys],23
8292,Alcohol nerd. Award-winning music fan. Profess...,"[Adventure, Drama]",Hindu,"[HipHop, Classical, Country]",Moderate,"[Youtube, Facebook]","[Chess, Basketball, Cricket]","[Java, JavaScript]","[Adventure, Long Trips, Short journeys]",34
8293,Organizer. Introvert. Twitter junkie. Certifie...,"[Drama, Action]",Hindu,"[Rock, Jazz, EDM]",Liberal,"[Youtube, Facebook]","[Chess, Baseball, BadmintonFootball]","[Java, JavaScript, Python]","[Treking, Short journeys]",24


### Saving and exporting the pickle file

In [39]:
with open("refined_profiles.pkl",'wb') as fp:
    pickle.dump(df, fp)