In [1]:
import pandas as pd
import re
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the raw post export that this notebook cleans.
post_path = './2000/first_2000.csv'

# Column labels applied to the CSV when it is read, in file order.
columns = [
    'id',
    'categories',
    'description',
    'birthday',
    'gender',
    'post_followers',
    'post_time',
    'post_interactions',
    'post_likes',
    'post_comments',
    'post_caption',
    'post_hashtags',
    'post_mentions',
    'perma_link',
    'image',
]

In [3]:
# Read the posts, skipping the file's first line and labelling the columns
# with `columns` instead. keep_default_na=False turns off pandas' default
# NaN markers for this read.
df = pd.read_csv(
    post_path,
    names=columns,
    skiprows=1,
    keep_default_na=False,
)

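The column names are reassigned when the CSV is read (above) because the header of the original post CSV is inaccurately labeled. The next cell then drops three rows by their index labels.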

In [4]:
# Discard three rows, identified by index label, before the frame is split up.
# NOTE(review): the reason these particular rows are removed is not recorded
# in the notebook — confirm against the raw file.
rows_to_discard = [359371, 131072, 262144]
df = df.drop(index=rows_to_discard)

Create three pandas DataFrames that separate the numerical, text, and image features so that each can be worked with in its own script.

In [5]:
# Text features, one row per post. An explicit copy is taken because the
# cleaning cells below assign into these columns; without it every assignment
# targets a slice of `df` and raises SettingWithCopyWarning (currently hidden
# by the warnings filter in the import cell).
df_text = df[['id', 'description', 'post_caption', 'post_hashtags', 'post_mentions']].copy()

In [120]:
# Numeric features, one row per post. An explicit copy is taken because the
# cleaning cells below assign into these columns; without it every assignment
# targets a slice of `df` and raises SettingWithCopyWarning (currently hidden
# by the warnings filter in the import cell).
df_num = df[['id', 'gender', 'post_followers', 'post_likes', 'post_comments']].copy()

In [122]:
# Image features, one row per post: the post id plus its permalink and image field.
image_columns = ['id', 'perma_link', 'image']
df_image = df[image_columns]

---
# Numerical Dataframe

Separate all of the numerical features into a single Dataframe. Clean and process them into usable features. Lastly save as a csv.

### Convert Categories String to list, one hot encode

In [123]:
# Build the category frame from the raw posts. No visible cell creates
# `df_cat`, so on a fresh kernel the original line raised NameError; deriving
# it from `df` here also makes this cell safe to re-run.
df_cat = df[['id', 'categories']].copy()

# Turn the bracketed, comma-separated category string into a list of code
# strings: strip the surrounding brackets/commas/spaces, remove inner spaces,
# then split on commas.
df_cat['categories'] = (
    df_cat['categories']
    .str.strip('], ')
    .str.strip('[]')
    .str.strip('] ')
    .str.replace(' ', '')
    .str.split(',')
)

In [124]:
# One list of category-code strings per post.
s = df_cat['categories']

# One-hot encode the codes and collapse back to one row per post.
# explode() emits one row per list element while repeating the post's index
# label, so grouping on index level 0 sums the indicator columns per post.
# This replaces `.sum(level=0)`, which was removed in pandas 2.0, and avoids
# building a Series per row with `s.apply(pd.Series)`.
df_cat_temp = pd.get_dummies(s.explode()).groupby(level=0).sum()

Apply category labels to One Hot DF

1 => 'Beauty',
2 => 'DIY',
3 => 'Fashion',
4 => 'Lifestyle',
5 => 'Menswear',
6 => 'Models',
8 => 'News',
9 => 'Personal Style',
10 => 'Photography',
11 => 'Illustration',
12 => 'Vlog',
13 => 'Food',
14 => 'Artist',
15 => 'Musician/DJ',
16 => "Tech",
17 => "Parenting",
18 => "Entertainment",
19 => "Fitness",
20 => "Health and Wellness",
21 => "Automotive",
22 => "Home Decor",
23 => "Art and Design",
24 => "Travel"

In [126]:
# Readable names for the one-hot category columns, listed in the order in
# which they are assigned to the encoded frame.
cat_columns = [
    'no_cat',
    'cat_beauty',
    'cat_photography',
    'cat_illustration',
    'cat_vlog',
    'cat_food',
    'cat_artist',
    'cat_musician/DJ',
    "cat_tech",
    "cat_parenting",
    "cat_entertainment",
    "cat_fitness",
    'cat_diy',
    "cat_health_wellness",
    "cat_automotive",
    "cat_home_decor",
    "cat_art_design",
    "cat_travel",
    'cat_fashion',
    'cat_lifestyle',
    'cat_menswear',
    'cat_models',
    'unknown',
    'cat_news',
    'cat_personal_style',
]

In [127]:
# Relabel the one-hot columns positionally with the names in cat_columns.
# NOTE(review): this presumes df_cat_temp's columns are already in the same
# order as cat_columns and that the counts match — confirm before relying on it.
df_cat_temp.columns = cat_columns

In [128]:
# Attach the one-hot columns to each post, discard the raw category list, and
# collapse to one row per id: the mean of every indicator over that id's
# posts, rounded to the nearest whole number.
df_cat = (
    df_cat
    .join(df_cat_temp)
    .drop('categories', axis=1)
    .groupby('id')
    .mean()
    .round()
)

In [131]:
# Remove the row whose id is a stray caption string rather than a user id.
# errors='ignore' keeps the cell re-runnable and lets it pass on extracts that
# do not contain this row; the original raised KeyError in both situations.
df_cat = df_cat.drop(index=['What do you use your mason jar for?'], errors='ignore')

In [132]:
# Pairwise cosine similarity between the per-id category rows, labelled with
# the ids on both axes.
category_similarity = cosine_similarity(df_cat, dense_output=False)
user_ids = df_cat.index
cos_sim_cat = pd.DataFrame(category_similarity, index=user_ids, columns=user_ids)

In [133]:
# Preview the first rows of the category similarity matrix.
cos_sim_cat.head()

id,8,82,203,303,579,589,627,659,677,703,...,9810,9811,9824,9838,9879,9911,9957,9965,9987,_
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1.0,0.258199,0.654654,0.774597,0.408248,0.666667,1.0,0.480384,0.666667,0.654654,...,0.547723,0.666667,0.57735,0.408248,0.57735,0.57735,0.774597,0.333333,0.333333,0.0
82,0.258199,1.0,0.338062,0.4,0.316228,0.0,0.258199,0.496139,0.0,0.338062,...,0.424264,0.258199,0.223607,0.316228,0.298142,0.223607,0.2,0.258199,0.258199,0.0
203,0.654654,0.338062,1.0,0.676123,0.534522,0.436436,0.654654,0.628971,0.654654,0.714286,...,0.597614,0.654654,0.566947,0.267261,0.629941,0.377964,0.507093,0.218218,0.218218,0.0
303,0.774597,0.4,0.676123,1.0,0.632456,0.516398,0.774597,0.620174,0.774597,0.676123,...,0.565685,0.774597,0.67082,0.316228,0.596285,0.447214,0.6,0.258199,0.258199,0.0
579,0.408248,0.316228,0.534522,0.632456,1.0,0.204124,0.408248,0.686406,0.408248,0.400892,...,0.559017,0.408248,0.353553,0.25,0.589256,0.353553,0.316228,0.204124,0.204124,0.0


In [134]:
# Persist the category similarity matrix to cos_sim_cat.csv (relative path).
cos_sim_cat.to_csv('cos_sim_cat.csv')

### Gender: Remove comma, remove 'nil',  convert to int

In [70]:
# df_num['gender'] = pd.DataFrame(df_num['gender'].str.replace(',', ''))

In [71]:
# df_num['gender'] = pd.DataFrame(df_num['gender'].replace('nil ', None))

In [84]:
# df_num.gender = df_num.gender.astype(int)

### post_followers: Remove comma,  convert to int

In [73]:
# df_num['post_followers'] = pd.DataFrame(df_num['post_followers'].str.replace(',', ''))

In [74]:
# df_num.post_followers = pd.to_numeric(df_num.post_followers)

### post_likes: Remove commas, convert to int

In [50]:
# Strip the thousands separators from post_likes, then parse it as numbers.
likes_without_commas = df_num['post_likes'].str.replace(',', '')
df_num['post_likes'] = pd.to_numeric(likes_without_commas)

### post_interactions: Remove commas convert to int

In [37]:
# df_num['post_interactions'] = pd.DataFrame(df_num['post_interactions'].str.replace(', ', ''))

In [38]:
# df_num['post_interactions'] = pd.DataFrame(df_num['post_interactions'].str.replace(',', ''))

In [39]:
# df_num['post_interactions'] = pd.DataFrame(df_num['post_interactions'].str.replace('nil', ''))

In [None]:
# df_num['post_interactions'] = pd.DataFrame(df_num['post_interactions'].str.replace('""', ''))

In [40]:
# df_num.post_interactions = pd.to_numeric(df_num.post_interactions)

### post_comments: Remove commas, convert to int

In [75]:
# Remove, in this order, the separator and placeholder fragments found in
# post_comments, then parse what is left as numbers.
comments_text = df_num['post_comments']
for fragment in (', ', ',', 'nil', '""'):
    comments_text = comments_text.str.replace(fragment, '')
df_num['post_comments'] = pd.to_numeric(comments_text)

## Save Num Values to CSV

We save two CSV files: one containing the text columns and one containing only the integer-based columns. Keeping them separate allows an intermediate Natural Language Processing step on the text before both are combined into a single cosine-similarity recommendation.

Group all by ID and set all values to **Mean**

In [66]:
# Spot-check the Python type of a single post_followers value (position 3).
type(df_num['post_followers'].values[3])

int

In [82]:
# Spot-check the Python type of a single gender value (position 3).
type(df_num['gender'].values[3])

int

In [68]:
# Spot-check the Python type of a single post_likes value (position 3).
type(df_num['post_likes'].values[3])

str

In [80]:
# Spot-check the Python type of a single post_comments value (position 3).
type(df_num['post_comments'].values[3])

numpy.float64

In [58]:
# Collapse to one row per id with the rounded mean of each numeric column.
# gender and post_followers are still strings at this point (their cleaning
# cells are commented out), so they are excluded explicitly with
# numeric_only=True. Older pandas dropped such columns silently; pandas 2
# raises TypeError when asked to average them.
df_num = df_num.groupby('id').mean(numeric_only=True).round()

In [59]:
# Preview the per-id numeric averages.
df_num.head()

Unnamed: 0_level_0,post_likes,post_comments
id,Unnamed: 1_level_1,Unnamed: 2_level_1
8,823.0,32.0
82,581.0,13.0
203,6155.0,69.0
303,1099.0,12.0
579,3026.0,122.0


In [None]:
# Replace any missing averages with 0.
df_num = df_num.fillna(value=0)

Save Numerical data to CSV

In [None]:
# Persist the per-id numeric features to ./2000/2000_num_per_user.csv.
df_num.to_csv('./2000/2000_num_per_user.csv')

In [None]:
# Preview the first 20 rows of the saved numeric features.
df_num.head(20)

---
# Text DataFrame

Clean DataFrame with text portion and prepare for Natural Language Processing.

### post_caption: remove quotes and commas

In [85]:
# Clean post_caption in a single pass: remove quote-comma pairs, remove the
# remaining double quotes, remove newlines, and make sure the column is str.
# The original also ran a separate '"",' replacement after every '"' had
# already been removed; that step could never match and is omitted.
# regex=False makes each pattern a literal on every pandas version.
df_text['post_caption'] = (
    df_text['post_caption']
    .str.replace('",', '', regex=False)
    .str.replace('"', '', regex=False)
    .str.replace('\n', '', regex=False)
    .astype(str)
)

### post_hashtags: Remove quotes, convert to string

In [90]:
# Clean post_hashtags in a single pass: remove quote-comma pairs, remove the
# remaining double quotes, and make sure the column is str.
# The original also ran a separate '"",' replacement after every '"' had
# already been removed; that step could never match and is omitted.
# regex=False makes each pattern a literal on every pandas version.
df_text['post_hashtags'] = (
    df_text['post_hashtags']
    .str.replace('",', '', regex=False)
    .str.replace('"', '', regex=False)
    .astype(str)
)

### post_mentions: Remove quotes, convert to string

In [94]:
# Clean post_mentions in a single pass: remove quote-comma pairs, remove the
# remaining double quotes, blank out the empty-array marker, and make sure the
# column is str.
# The marker is matched as a literal with regex=False. The original pattern
# "#<Hashie::Array \[]>" contained an invalid string escape and only matched
# when str.replace treated patterns as regular expressions by default, which
# is no longer the case in pandas 2.
# The original also ran a separate '"",' replacement after every '"' had
# already been removed; that step could never match and is omitted.
df_text['post_mentions'] = (
    df_text['post_mentions']
    .str.replace('",', '', regex=False)
    .str.replace('"', '', regex=False)
    .str.replace('#<Hashie::Array []>', ' ', regex=False)
    .astype(str)
)

### description: remove quotes convert to string

In [99]:
# Clean description in a single pass: remove quote-comma pairs, remove the
# remaining double quotes, blank out the 'nil, ' placeholder, and make sure
# the column is str.
# The placeholder is replaced with an empty string rather than None.
# Series.replace(scalar, None) forward-fills from the previous row on older
# pandas, which copies another user's description into the row; on newer
# pandas it stores None, which astype(str) turns into the text 'None'.
# The original also ran a separate '"",' replacement after every '"' had
# already been removed; that step could never match and is omitted.
df_text['description'] = (
    df_text['description']
    .str.replace('",', '', regex=False)
    .str.replace('"', '', regex=False)
    .replace('nil, ', '')
    .astype(str)
)

### Combine all text into a single cell per user

In [104]:
# Concatenate caption, hashtags and mentions of each post into one
# space-separated string.
text_columns = ['post_caption', 'post_hashtags', 'post_mentions']
df_text['all_text'] = df_text[text_columns].apply(' '.join, axis=1)

Then group by user id and description, so that each user's description is included only once.

In [105]:
# Join every post's text into one string per (id, description) pair.
df_text = (
    df_text
    .groupby(['id', 'description'])['all_text']
    .apply(' '.join)
    .reset_index()
)

# Put the description in front of the combined post text; the separate
# description column is then no longer needed.
df_text['all_text'] = df_text[['description', 'all_text']].apply(' '.join, axis=1)
df_text = df_text.drop('description', axis=1)

# Turn the newline-dash-newline separator into a single space.
df_text['all_text'] = df_text['all_text'].str.replace('\n-\n', ' ')

In [109]:
# Substrings to delete from the combined text, in the order they are applied.
bad_character_list = [
    '\\n•',
    '\\n',
    '\n',
    ',',
    '.',
    '?',
    '!',
    ')',
    '(',
    '#',
    '&',
    '\r',
    '"',
    '\r\n\r\n',
]

In [110]:
# Delete each unwanted substring from all_text, treating it as a literal.
for symbol in bad_character_list:
    df_text['all_text'] = df_text['all_text'].str.replace(symbol, '', regex=False)

### Save Text data to CSV

In [111]:
# Persist the per-user text to ./2000/2000_text_per_user.csv.
df_text.to_csv('./2000/2000_text_per_user.csv')

In [112]:
# Preview the first 10 users' combined text.
df_text.head(10)

Unnamed: 0,id,all_text
0,8,Werbung | Ein Bild von vor 2 Tagen als es hie...
1,82,Natalie Zfat is a social media entrepreneur Fo...
2,203,Comfy me [] [asos kitsune chanelofficial] Bea...
3,303,Allow me to introduce myself you lovely reader...
4,579,For 8 years my body has been holding on to a ...
5,589,$10 top + $12 pants + $5 beanie + $250 mask =...
6,627,Dominican Lifestyle and Fashion blogger living...
7,659,Jordan Landes-Brenman @HAUTEHOUSEFLOWER has ea...
8,677,And we made it to Florida We took this pictur...
9,703,Kathleen Barnes is the life and style blogger ...


---

# Image DataFrame

Clean and process DataFrame to be used in Photo ETL script. 

In [None]:
# Preview the per-post image frame.
df_image.head()

In [None]:
# Persist the per-post image frame to image_per_post.csv (relative path).
df_image.to_csv('image_per_post.csv')