# Import in libraries and data

In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
content = pd.read_csv('../Data/Content.csv', index_col=0)
reactions = pd.read_csv('../Data/Reactions.csv', index_col=0)
rtypes = pd.read_csv('../Data/ReactionTypes.csv', index_col=0)

In [3]:
files = os.listdir('../Data/')

In [4]:
pattern = re.compile(r'^(.+)\.csv$')

In [5]:
files = [pattern.match(file).group(1) for file in files if pattern.match(file)]

In [6]:
files

['Clean_Data', 'Content', 'Reactions', 'ReactionTypes']

# Preview the data

In [7]:
content.head()

Unnamed: 0,Content ID,User ID,Type,Category,URL
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [8]:
reactions.head()

Unnamed: 0,Content ID,User ID,Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


In [9]:
rtypes.head()

Unnamed: 0,Type,Sentiment,Score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30


# Cleaning the data
The guided task are to:
    removing rows that have values which are missing,
    changing the data type of some values within a column, and
    removing columns which are not relevant to this task.
    Think about how each column might be relevant to the business question you’re investigating. If you can’t think of why a column may
    be useful, it may not be worth including it.

## Removing columns

Since our task requires us to only find the most popular categories we will not need any user data, so we can drop user ID and URL, we will keep the other columns because it provides more context. Date and time can be dropped, but I believe knowing user engagement over time will be useful.

In [10]:
reactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25553 entries, 0 to 25552
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Content ID  25553 non-null  object
 1   User ID     22534 non-null  object
 2   Type        24573 non-null  object
 3   Datetime    25553 non-null  object
dtypes: object(4)
memory usage: 998.2+ KB


In [11]:
reactions['Datetime'] = pd.to_datetime(reactions['Datetime'])

In [12]:
reactions.dtypes

Content ID            object
User ID               object
Type                  object
Datetime      datetime64[ns]
dtype: object

In [13]:
reactions['Type'].value_counts()

Type
heart          1622
scared         1572
peeking        1559
hate           1552
interested     1549
dislike        1548
adore          1548
want           1539
love           1534
disgust        1526
like           1520
super love     1519
indifferent    1512
cherish        1501
worried        1497
intrigued      1475
Name: count, dtype: int64

In [14]:
content.drop(columns= ['URL', 'User ID'], inplace=True)
content

Unnamed: 0,Content ID,Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food
...,...,...,...
995,b4cef9ef-627b-41d7-a051-5961b0204ebb,video,public speaking
996,7a79f4e4-3b7d-44dc-bdef-bc990740252c,GIF,technology
997,435007a5-6261-4d8b-b0a4-55fdc189754b,audio,veganism
998,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,GIF,culture


In [15]:
reactions.drop(columns= ['User ID'], inplace=True)
reactions

Unnamed: 0,Content ID,Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,2021-04-22 15:17:15
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
...,...,...,...
25548,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,2020-06-27 09:46:48
25549,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,2021-02-16 17:17:02
25550,75d6b589-7fae-4a6d-b0d0-752845150e56,interested,2020-09-12 03:54:58
25551,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,2020-11-04 20:08:31


In [16]:
reactions.dropna(inplace=True)

In [17]:
reactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24573 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Content ID  24573 non-null  object        
 1   Type        24573 non-null  object        
 2   Datetime    24573 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 767.9+ KB


## Finding null values

Although the task is to remove rows with any missing column i think its important to identify what data is missing, so if needed we can impute the values or correct the data

In [18]:
rtypes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Type       16 non-null     object
 1   Sentiment  16 non-null     object
 2   Score      16 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 512.0+ bytes


In [19]:
rtypes['Type'].values

array(['heart', 'want', 'disgust', 'hate', 'interested', 'indifferent',
       'love', 'super love', 'cherish', 'adore', 'like', 'dislike',
       'intrigued', 'peeking', 'scared', 'worried'], dtype=object)

In [20]:
content.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Content ID  1000 non-null   object
 1   Type        1000 non-null   object
 2   Category    1000 non-null   object
dtypes: object(3)
memory usage: 31.2+ KB


In [21]:
content['Type'].value_counts()

Type
photo    261
video    259
GIF      244
audio    236
Name: count, dtype: int64

In [22]:
content['Category'].value_counts()

Category
technology           71
animals              67
travel               67
culture              63
science              63
fitness              61
food                 61
healthy eating       61
cooking              60
soccer               58
tennis               58
education            57
dogs                 56
studying             55
veganism             48
public speaking      48
Fitness               5
Animals               4
Science               4
"soccer"              3
"culture"             3
Soccer                3
"dogs"                2
Education             2
Studying              2
Travel                2
Food                  2
"veganism"            1
"public speaking"     1
Public Speaking       1
"technology"          1
"cooking"             1
Healthy Eating        1
"studying"            1
"food"                1
Culture               1
"tennis"              1
Technology            1
"animals"             1
Veganism              1
"science"             1
Name: c

# Merging datasets

In [23]:
df=pd.merge(reactions, content, on='Content ID', suffixes=('_Reaction','_Content')).merge(rtypes, left_on='Type_Reaction', right_on='Type')

In [24]:
df

Unnamed: 0,Content ID,Type_Reaction,Datetime,Type_Content,Category,Type,Sentiment,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50,photo,Studying,disgust,negative,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01,photo,Studying,disgust,negative,0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-04-09 02:46:20,photo,Studying,disgust,negative,0
3,9f737e0a-3cdd-4d29-9d24-753f4e3be810,disgust,2021-03-28 21:15:26,photo,healthy eating,disgust,negative,0
4,230c4e4d-70c3-461d-b42c-ec09396efb3f,disgust,2020-08-04 05:40:33,photo,healthy eating,disgust,negative,0
...,...,...,...,...,...,...,...,...
24568,435007a5-6261-4d8b-b0a4-55fdc189754b,adore,2020-10-04 22:26:33,audio,veganism,adore,positive,72
24569,435007a5-6261-4d8b-b0a4-55fdc189754b,adore,2020-09-18 10:50:50,audio,veganism,adore,positive,72
24570,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,adore,2020-10-31 03:58:44,GIF,culture,adore,positive,72
24571,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,adore,2020-06-25 15:12:29,GIF,culture,adore,positive,72


In [25]:
df.drop(columns = 'Type', inplace=True)

In [26]:
df.head(10)

Unnamed: 0,Content ID,Type_Reaction,Datetime,Type_Content,Category,Sentiment,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50,photo,Studying,negative,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01,photo,Studying,negative,0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-04-09 02:46:20,photo,Studying,negative,0
3,9f737e0a-3cdd-4d29-9d24-753f4e3be810,disgust,2021-03-28 21:15:26,photo,healthy eating,negative,0
4,230c4e4d-70c3-461d-b42c-ec09396efb3f,disgust,2020-08-04 05:40:33,photo,healthy eating,negative,0
5,3f8590c7-6ab2-4973-805a-90cdec355f05,disgust,2021-01-14 07:21:32,video,dogs,negative,0
6,3f8590c7-6ab2-4973-805a-90cdec355f05,disgust,2021-01-13 23:35:56,video,dogs,negative,0
7,3f8590c7-6ab2-4973-805a-90cdec355f05,disgust,2021-02-11 20:10:20,video,dogs,negative,0
8,b18cb63f-4c8e-44ee-a47f-541e95191d11,disgust,2021-05-27 01:44:22,photo,public speaking,negative,0
9,b18cb63f-4c8e-44ee-a47f-541e95191d11,disgust,2021-02-13 04:14:45,photo,public speaking,negative,0


In [27]:
df.rename(columns={'Type_Reaction':'Reaction_Type', 'Type_Content':'Content_Type'})

Unnamed: 0,Content ID,Reaction_Type,Datetime,Content_Type,Category,Sentiment,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50,photo,Studying,negative,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01,photo,Studying,negative,0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-04-09 02:46:20,photo,Studying,negative,0
3,9f737e0a-3cdd-4d29-9d24-753f4e3be810,disgust,2021-03-28 21:15:26,photo,healthy eating,negative,0
4,230c4e4d-70c3-461d-b42c-ec09396efb3f,disgust,2020-08-04 05:40:33,photo,healthy eating,negative,0
...,...,...,...,...,...,...,...
24568,435007a5-6261-4d8b-b0a4-55fdc189754b,adore,2020-10-04 22:26:33,audio,veganism,positive,72
24569,435007a5-6261-4d8b-b0a4-55fdc189754b,adore,2020-09-18 10:50:50,audio,veganism,positive,72
24570,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,adore,2020-10-31 03:58:44,GIF,culture,positive,72
24571,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,adore,2020-06-25 15:12:29,GIF,culture,positive,72


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24573 entries, 0 to 24572
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Content ID     24573 non-null  object        
 1   Type_Reaction  24573 non-null  object        
 2   Datetime       24573 non-null  datetime64[ns]
 3   Type_Content   24573 non-null  object        
 4   Category       24573 non-null  object        
 5   Sentiment      24573 non-null  object        
 6   Score          24573 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 1.3+ MB


In [29]:
df['Category'].nunique()

40

In [30]:
df['Category'].value_counts().head(40)

Category
animals              1765
healthy eating       1711
technology           1667
science              1662
cooking              1640
travel               1618
food                 1606
culture              1586
education            1397
soccer               1334
tennis               1328
studying             1303
dogs                 1283
fitness              1257
veganism             1200
public speaking      1157
Fitness               138
Science               116
Animals                92
Food                   91
Soccer                 65
"soccer"               58
"dogs"                 55
"culture"              49
Studying               45
Culture                41
"animals"              40
"veganism"             37
Education              36
Public Speaking        32
Travel                 29
"public speaking"      28
"technology"           28
"cooking"              24
"science"              18
"studying"             15
Veganism               11
Healthy Eating          6
Tec

In [31]:
df['Category'] = df['Category'].str.replace('"', '').str.rstrip().str.title()

In [32]:
df['Category'].value_counts().head(40)

Category
Animals            1897
Science            1796
Healthy Eating     1717
Food               1699
Technology         1698
Culture            1676
Cooking            1664
Travel             1647
Soccer             1457
Education          1433
Fitness            1395
Studying           1363
Dogs               1338
Tennis             1328
Veganism           1248
Public Speaking    1217
Name: count, dtype: int64

In [33]:
df['Category'].value_counts().head(5)

Category
Animals           1897
Science           1796
Healthy Eating    1717
Food              1699
Technology        1698
Name: count, dtype: int64

The top 5 categories by post are:

1)    Animals           1897
2)    Science           1796
3)    Healthy Eating    1717
4)    Food              1699
5)    Technology        1698

In [34]:
df.loc[:, ['Category','Score']].groupby(['Category'],as_index=False).sum().sort_values(by='Score', ascending=False).head(5)

Unnamed: 0,Category,Score
0,Animals,74965
9,Science,71168
7,Healthy Eating,69339
12,Technology,68738
6,Food,66676


The top 5 categories by score are:

1) Animals 74965
2) Science 71168
3) Healthy Eating 69339
4) Technology 68738
5) Food 66676

In [35]:
df.loc[df['Category'] == 'Animals', 'Score'].sum()

74965

In [36]:
df.to_csv('../Data/Clean_Data.csv', index=False)