In [10]:
# import libraries
import numpy as np
import pandas as pd
import os


In [11]:
# data path
data_path = os.path.join(os.getcwd(), 'data')

# read in data
df_contents = pd.read_csv(os.path.join(data_path, 'Content.csv'))
df_reactions = pd.read_csv(os.path.join(data_path, 'Reactions.csv'))
df_reaction_types = pd.read_csv(os.path.join(data_path, 'ReactionTypes.csv'))

print(df_contents.shape, df_reactions.shape, df_reaction_types.shape)

(1000, 6) (25553, 5) (16, 4)


In [12]:
df_reactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25553 entries, 0 to 25552
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25553 non-null  int64 
 1   Content ID  25553 non-null  object
 2   User ID     22534 non-null  object
 3   Type        24573 non-null  object
 4   Datetime    25553 non-null  object
dtypes: int64(1), object(4)
memory usage: 998.3+ KB


In [13]:
df_contents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   Content ID  1000 non-null   object
 2   User ID     1000 non-null   object
 3   Type        1000 non-null   object
 4   Category    1000 non-null   object
 5   URL         801 non-null    object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


In [14]:
df_reaction_types.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  16 non-null     int64 
 1   Type        16 non-null     object
 2   Sentiment   16 non-null     object
 3   Score       16 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 640.0+ bytes


In [15]:
df_reactions.head(2)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Datetime
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50


In [16]:
df_reaction_types.head(2)

Unnamed: 0.1,Unnamed: 0,Type,Sentiment,Score
0,0,heart,positive,60
1,1,want,positive,70


In [17]:
df_contents.head(2)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/97522e57-d9ab-4bd6-97bf-c24d952602d2
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f737e0a-3cdd-4d29-9d24-753f4e3be810


#### Data Cleaning

In [18]:
# drop null values
def drop_nulls(df):
    df.dropna(inplace=True)
    return df

# convert date columns to datetime
def convert_to_datetime(df, cols):
    for col in cols:
        df[col] = pd.to_datetime(df[col])
    return df

# convert object columns to category
def convert_to_category(df, cols):
    for col in cols:
        df[col] = df[col].astype('category')
    return df

# drop unnecessary column
def drop_unamed(df):
    df.drop(columns=['Unnamed: 0'], inplace=True)
    return df

# drop User ID column
def drop_user_id(df):
    df.drop(columns=['User ID'], inplace=True)
    return df

# drop URL column
def drop_url(df):
    df.drop(columns=['URL'], inplace=True)
    return df


# drop null values
df_reactions = df_reactions.pipe(drop_nulls).pipe(drop_unamed).pipe(convert_to_datetime, ['Datetime']).pipe(convert_to_category, ['Type', 'Content ID']).pipe(drop_user_id)
df_contents = df_contents.pipe(drop_nulls).pipe(drop_unamed).pipe(convert_to_category, ['Type', 'Content ID', 'Category']).pipe(drop_user_id).pipe(drop_url)
df_reaction_types = df_reaction_types.pipe(drop_nulls).pipe(drop_unamed).pipe(convert_to_category, ['Type', 'Sentiment'])


In [19]:
df_reactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22534 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Content ID  22534 non-null  category      
 1   Type        22534 non-null  category      
 2   Datetime    22534 non-null  datetime64[ns]
dtypes: category(2), datetime64[ns](1)
memory usage: 458.6 KB


In [20]:
df_contents.info()

<class 'pandas.core.frame.DataFrame'>
Index: 801 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Content ID  801 non-null    category
 1   Type        801 non-null    category
 2   Category    801 non-null    category
dtypes: category(3)
memory usage: 48.8 KB


In [21]:
df_reaction_types.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Type       16 non-null     category
 1   Sentiment  16 non-null     category
 2   Score      16 non-null     int64   
dtypes: category(2), int64(1)
memory usage: 1.1 KB


In [22]:
df_contents.to_csv(os.path.join(data_path, 'cleaned_contents.csv'), index=False)
df_reactions.to_csv(os.path.join(data_path, 'cleaned_reactions.csv'), index=False)
df_reaction_types.to_csv(os.path.join(data_path, 'cleaned_reaction_types.csv'), index=False)