# Accenture Data Cleaning Task - Ayaan Danish

In [96]:
from pathlib import Path

import numpy as np
import pandas as pd

In [97]:
# Load the three source tables in one pass; index_col=[0] uses the first CSV
# column as the row index for each frame.
content, reactions, reactionTypes = (
    pd.read_csv(csv_name, index_col=[0])
    for csv_name in ('Content.csv', 'Reactions.csv', 'ReactionTypes.csv')
)

## Cleaning content dataset

In [98]:
# Preview the first five rows of the raw content table.
content.head(n=5)

Unnamed: 0,Content ID,User ID,Type,Category,URL
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


### Dropping irrelevant URL and User ID columns

In [99]:
# Remove the URL and User ID columns from the content table.
content = content.drop(columns=['URL', 'User ID'])

### Checking for rows with null values - none found

In [100]:
# Count missing values per column (isna is the canonical name for isnull).
content.isna().sum()

Content ID    0
Type          0
Category      0
dtype: int64

### Checking column types - no changes needed

In [101]:
# Print the index range, per-column non-null counts, and dtypes of the content table.
content.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Content ID  1000 non-null   object
 1   Type        1000 non-null   object
 2   Category    1000 non-null   object
dtypes: object(3)
memory usage: 31.2+ KB


### Renaming 'Type' column to 'ContentType' for clarity

In [102]:
# Rename the 'Type' column to 'ContentType'.
content = content.rename({"Type": "ContentType"}, axis="columns")

### Removing '"' from certain rows of the Category column

In [103]:
# Strip literal double-quote characters from Category only (the one column this
# step targets), leaving the other columns untouched. regex=False treats '"' as
# plain text rather than a pattern.
# NOTE(review): the preview shows mixed casing in Category ('Studying' vs
# 'healthy eating') — confirm whether values should also be case-normalized.
content = content.assign(
    Category=content['Category'].str.replace('"', '', regex=False)
)

### Cleaned dataframe

In [104]:
# Display the content table after the cleaning steps above.
content

Unnamed: 0,Content ID,ContentType,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food
...,...,...,...
995,b4cef9ef-627b-41d7-a051-5961b0204ebb,video,public speaking
996,7a79f4e4-3b7d-44dc-bdef-bc990740252c,GIF,technology
997,435007a5-6261-4d8b-b0a4-55fdc189754b,audio,veganism
998,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,GIF,culture


## Cleaning reactions dataset

In [105]:
# Preview the first five rows of the raw reactions table.
reactions.head(n=5)

Unnamed: 0,Content ID,User ID,Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


### Dropping irrelevant User ID column

In [106]:
# Remove the User ID column from the reactions table.
reactions = reactions.drop(columns=['User ID'])

### Checking for rows with null values - many found

In [107]:
# Count missing values per column before dropping incomplete rows.
reactions.isna().sum()

Content ID      0
Type          980
Datetime        0
dtype: int64

### Dropping all rows with null values

In [108]:
# Discard every row that has at least one missing value (defaults made explicit).
reactions = reactions.dropna(axis="index", how="any")

In [109]:
# Re-count missing values per column to confirm none remain.
reactions.isna().sum()

Content ID    0
Type          0
Datetime      0
dtype: int64

### Checking column types - no changes needed

In [110]:
# Print the index range, per-column non-null counts, and dtypes of the reactions table.
# NOTE(review): Datetime is reported as object (string) here; parse it with
# pd.to_datetime if any time-based analysis is planned — confirm with downstream use.
reactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24573 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Content ID  24573 non-null  object
 1   Type        24573 non-null  object
 2   Datetime    24573 non-null  object
dtypes: object(3)
memory usage: 767.9+ KB


### Cleaned dataframe

In [111]:
# Display the reactions table after the cleaning steps above.
reactions

Unnamed: 0,Content ID,Type,Datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,2020-08-23 12:25:58
...,...,...,...
25548,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,2020-06-27 09:46:48
25549,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,2021-02-16 17:17:02
25550,75d6b589-7fae-4a6d-b0d0-752845150e56,interested,2020-09-12 03:54:58
25551,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,2020-11-04 20:08:31


## Cleaning reaction types dataset

In [112]:
# Preview the first five rows of the reaction types table.
reactionTypes.head(n=5)

Unnamed: 0,Type,Sentiment,Score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30


### Checking for null values - none found

In [113]:
# Count missing values per column in the reaction types table.
reactionTypes.isna().sum()

Type         0
Sentiment    0
Score        0
dtype: int64

### Checking column types - no changes needed

In [114]:
# Print the index range, per-column non-null counts, and dtypes of the reaction types table.
reactionTypes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Type       16 non-null     object
 1   Sentiment  16 non-null     object
 2   Score      16 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 512.0+ bytes


### Exporting cleaned dataframes back to CSV files

In [118]:
# Make sure the output folder exists before writing: to_csv does not create
# missing parent directories, so without this the export fails on a fresh checkout.
output_dir = Path('Cleaned')
output_dir.mkdir(parents=True, exist_ok=True)

# Write each cleaned table to its own CSV (the row index is written as the first column).
content.to_csv(output_dir / 'Content-Cleaned.csv')
reactions.to_csv(output_dir / 'Reactions-Cleaned.csv')
reactionTypes.to_csv(output_dir / 'ReactionTypes-Cleaned.csv')