In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# Load the scraped Medium posts; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer="MediumBlogScrapped.csv")

In [2]:
# Preview the first five rows of the freshly loaded data.
df.head()

Unnamed: 0,title,Author,Upvote,CommentCount,Publish Date,Read Time,Paragraph,Links,Tag
0,DrivenData Interview,Will Koehrsen,544,2,"Dec 14, 2020",13 min read,"In October 2020, I was interviewed by DrivenDa...",https://williamkoehrsen.medium.com/drivendata-...,Data Science
1,A Data Science Conversation,Will Koehrsen,411,43,"Mar 10, 2020",3 min read,Talking is a lot like writing in that it force...,https://williamkoehrsen.medium.com/a-data-scie...,Data Science
2,"12 Lessons from 55,000 pages of books",Will Koehrsen,1.4K,10,"Jan 2, 2020",14 min read,Reading 136 books in a year does not get you t...,https://williamkoehrsen.medium.com/12-lessons-...,Reading
3,Books of 2019,Will Koehrsen,464,1,"Jan 1, 2020",58 min read,Before we get started: reading books does not ...,https://williamkoehrsen.medium.com/books-of-20...,Reading
4,“Just Do It” Won’t Get You to Your Goals,Will Koehrsen,398,2,"Dec 27, 2019",12 min read,Rule number one for achieving goals: don’t tak...,https://williamkoehrsen.medium.com/just-do-it-...,Productivity


In [3]:
# Keep a handle on the column labels and display them.
columns = df.columns
columns

Index(['title', 'Author', 'Upvote', 'CommentCount', 'Publish Date',
       'Read Time', 'Paragraph', 'Links', 'Tag'],
      dtype='object')

In [4]:
# Count missing values per column.
df.isnull().sum()

title           0
Author          0
Upvote          0
CommentCount    0
Publish Date    0
Read Time       0
Paragraph       0
Links           0
Tag             0
dtype: int64

In [5]:
# Inspect the dtype of each column to see which ones are still stored as text (object).
df.dtypes

title           object
Author          object
Upvote          object
CommentCount     int64
Publish Date    object
Read Time       object
Paragraph       object
Links           object
Tag             object
dtype: object

### Conversion Of Data Types

In [6]:
def convert_to_int(x):
    """
    Function Name: convert_to_int
    Parameters:
        x (str | int): A count such as '544', '1,234', '1.4K', '1.4k' or '2M'.
            Surrounding whitespace and thousands separators are ignored.
            Values that are not strings (e.g. an already-converted integer)
            are passed straight to int().
    Returns:
        int: The count as an integer; any fractional remainder after scaling
            is truncated.
    Raises:
        ValueError: If the text cannot be interpreted as a number.
    """
    # Function-scope import so the cell defining this helper is self-contained.
    from decimal import Decimal, InvalidOperation

    # Already numeric: nothing to parse. This keeps the conversion cell safe
    # to run more than once on the same column.
    if not isinstance(x, str):
        return int(x)

    suffix_multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    text = x.strip().replace(',', '')
    suffix = text[-1:].upper()

    if suffix in suffix_multipliers:
        try:
            # Decimal scales the value exactly, avoiding binary-float rounding
            # before the truncation to int.
            magnitude = Decimal(text[:-1])
        except InvalidOperation as exc:
            raise ValueError(f"Cannot convert {x!r} to an integer count") from exc
        return int(magnitude * suffix_multipliers[suffix])

    return int(text)

In [7]:
# Parse every upvote label (e.g. '1.4K') into a plain integer.
df['Upvote'] = df['Upvote'].map(convert_to_int)

In [8]:
df['Upvote'].head() # Verify the conversion: Upvote should now have dtype int64

0     544
1     411
2    1400
3     464
4     398
Name: Upvote, dtype: int64

In [9]:
def convert_read_time(x):
    """
    Function Name: convert_read_time
    Parameters:
        x (str | int): A read time whose first whitespace-separated token is
            the number, e.g. "5 min" or "13 min read". Values that are not
            strings (e.g. an already-converted integer) are passed straight
            to int().
    Returns:
        int: The leading number of the read time.
    Raises:
        ValueError: If the text is empty or does not start with an integer.
    """
    # Already numeric: nothing to parse. This keeps the conversion cell safe
    # to run more than once on the same column.
    if not isinstance(x, str):
        return int(x)

    # split() with no argument ignores leading whitespace and splits on any
    # whitespace run (tabs and non-breaking spaces included), unlike split(" ").
    tokens = x.split()
    if not tokens:
        raise ValueError(f"Cannot extract a read time from {x!r}")
    return int(tokens[0])

In [10]:
# Reduce each read-time label (e.g. '13 min read') to its leading integer.
df['Read Time'] = df["Read Time"].map(convert_read_time)

In [11]:
df['Read Time'].head() # Verify the conversion: Read Time should now have dtype int64

0    13
1     3
2    14
3    58
4    12
Name: Read Time, dtype: int64

In [12]:
from datetime import datetime # importing Datetime to Convert Publish Date

In [13]:
def process_date(date_str):
    """
    Function Name: process_date
    Parameters:
        date_str (str): A date in the scraped format '%b %d, %Y'
            (e.g. 'Jan 01, 2023'), or in the format this function itself
            emits, '%d %b %Y' (e.g. '01 Jan 2023'). Surrounding whitespace
            is ignored.
    Returns:
        tuple: (formatted date as '%d %b %Y', full weekday name,
            full month name, year as int).
    Raises:
        TypeError: If date_str is not a string.
        ValueError: If date_str matches none of the accepted formats.
    """
    if not isinstance(date_str, str):
        raise TypeError(f"date_str must be a string, got {type(date_str).__name__}")

    # The second format is this function's own output, so applying the
    # conversion to an already-processed value returns the same result
    # instead of raising.
    accepted_formats = ('%b %d, %Y', '%d %b %Y')
    cleaned = date_str.strip()

    date_obj = None
    for date_format in accepted_formats:
        try:
            date_obj = datetime.strptime(cleaned, date_format)
        except ValueError:
            continue
        break
    if date_obj is None:
        raise ValueError(
            f"Unrecognised date {date_str!r}; expected one of {accepted_formats}"
        )

    weekday = date_obj.strftime('%A')
    formatted_date = date_obj.strftime('%d %b %Y')
    month = date_obj.strftime('%B')
    year = date_obj.year
    return formatted_date, weekday, month, year

In [14]:
# process_date returns a 4-tuple per row; zip(*...) transposes those row tuples
# into four column-length sequences, assigned in order to the four columns below.
# Note: 'Publish Date' is overwritten with the reformatted date string.
df['Publish Date'], df['Weekday'], df['Month'], df['Year'] = zip(*df['Publish Date'].apply(process_date))

In [15]:
# Check the reformatted date and the new Weekday / Month / Year columns.
df.head(3)

Unnamed: 0,title,Author,Upvote,CommentCount,Publish Date,Read Time,Paragraph,Links,Tag,Weekday,Month,Year
0,DrivenData Interview,Will Koehrsen,544,2,14 Dec 2020,13,"In October 2020, I was interviewed by DrivenDa...",https://williamkoehrsen.medium.com/drivendata-...,Data Science,Monday,December,2020
1,A Data Science Conversation,Will Koehrsen,411,43,10 Mar 2020,3,Talking is a lot like writing in that it force...,https://williamkoehrsen.medium.com/a-data-scie...,Data Science,Tuesday,March,2020
2,"12 Lessons from 55,000 pages of books",Will Koehrsen,1400,10,02 Jan 2020,14,Reading 136 books in a year does not get you t...,https://williamkoehrsen.medium.com/12-lessons-...,Reading,Thursday,January,2020


In [16]:
def blog_length(blog):
    """
    Function Name: blog_length
    Parameters:
        blog (str): The text of a blog post.
    Returns:
        int: The value of len() for the text, i.e. its number of characters.
    """
    character_count = len(blog)
    return character_count

In [17]:
# Add a per-post length feature computed from the 'Paragraph' text.
df["BlogLength"] = df["Paragraph"].map(blog_length)

In [18]:
# Check the new BlogLength column.
df.head(2)

Unnamed: 0,title,Author,Upvote,CommentCount,Publish Date,Read Time,Paragraph,Links,Tag,Weekday,Month,Year,BlogLength
0,DrivenData Interview,Will Koehrsen,544,2,14 Dec 2020,13,"In October 2020, I was interviewed by DrivenDa...",https://williamkoehrsen.medium.com/drivendata-...,Data Science,Monday,December,2020,15939
1,A Data Science Conversation,Will Koehrsen,411,43,10 Mar 2020,3,Talking is a lot like writing in that it force...,https://williamkoehrsen.medium.com/a-data-scie...,Data Science,Tuesday,March,2020,1379


In [19]:
# Display the whole frame (the rendered output elides the middle rows).
df

Unnamed: 0,title,Author,Upvote,CommentCount,Publish Date,Read Time,Paragraph,Links,Tag,Weekday,Month,Year,BlogLength
0,DrivenData Interview,Will Koehrsen,544,2,14 Dec 2020,13,"In October 2020, I was interviewed by DrivenDa...",https://williamkoehrsen.medium.com/drivendata-...,Data Science,Monday,December,2020,15939
1,A Data Science Conversation,Will Koehrsen,411,43,10 Mar 2020,3,Talking is a lot like writing in that it force...,https://williamkoehrsen.medium.com/a-data-scie...,Data Science,Tuesday,March,2020,1379
2,"12 Lessons from 55,000 pages of books",Will Koehrsen,1400,10,02 Jan 2020,14,Reading 136 books in a year does not get you t...,https://williamkoehrsen.medium.com/12-lessons-...,Reading,Thursday,January,2020,20927
3,Books of 2019,Will Koehrsen,464,1,01 Jan 2020,58,Before we get started: reading books does not ...,https://williamkoehrsen.medium.com/books-of-20...,Reading,Wednesday,January,2020,15010
4,“Just Do It” Won’t Get You to Your Goals,Will Koehrsen,398,2,27 Dec 2019,12,Rule number one for achieving goals: don’t tak...,https://williamkoehrsen.medium.com/just-do-it-...,Productivity,Friday,December,2019,12708
...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,The Triumph of Peace,Will Koehrsen,7900,43,05 Jul 2017,14,A review of The Better Angels of Our Nature: W...,https://williamkoehrsen.medium.com/the-triumph...,Books,Wednesday,July,2017,11006
151,Home of the Scared,Will Koehrsen,7900,43,01 Jul 2017,9,A review of A Culture of Fear: Why Americans a...,https://williamkoehrsen.medium.com/home-of-the...,Politics,Saturday,July,2017,12404
152,Capstone Project: Mercedes-Benz Greener Manufa...,Will Koehrsen,120,43,30 Jun 2017,42,Author’s Note: This is the report I completed ...,https://williamkoehrsen.medium.com/capstone-pr...,Machine Learning,Friday,June,2017,66826
153,"The Vanquishing of War, Plague and Famine",Will Koehrsen,70,43,18 Jun 2017,14,Part 1 of the Optimist’s Guide to the 21st Cen...,https://williamkoehrsen.medium.com/the-vanquis...,Climate Change,Sunday,June,2017,20856


In [21]:
# Count how many posts carry each tag.
df["Tag"].value_counts()

Tag
Data Science            48
Machine Learning        46
Programming              6
Python                   5
Books                    4
Data                     4
Education                4
Statistics               3
Reading                  3
Data Visualization       3
Climate Change           3
Tech                     2
Big Data                 2
Self Improvement         2
Psychology               2
Space                    1
Motivation               1
History                  1
Technology               1
Privacy                  1
College                  1
Social Media             1
Growth Mindset           1
Web Development          1
Apache Spark             1
Software Development     1
Docker                   1
Productivity             1
Science                  1
The Reality Project      1
Health                   1
Running                  1
Politics                 1
Name: count, dtype: int64

The dataset is highly imbalanced, and many closely related topics are categorised under different tags; therefore, similar tags are merged into a smaller set of broader categories below.

In [29]:
# Map each fine-grained tag onto one of six broader categories.
tag_mapping = {
    'Data Science': 'Data Science',
    'Machine Learning': 'Machine Learning',
    'Programming': 'Programming',
    'Python': 'Programming',
    'Books': 'Personal Development',
    'Data': 'Data Science',
    'Education': 'Education',
    'Statistics': 'Data Science',
    'Reading': 'Personal Development',
    'Data Visualization': 'Data Science',
    'Climate Change': 'Education',
    'Tech': 'Technology',
    'Big Data': 'Data Science',
    'Self Improvement': 'Personal Development',
    'Psychology': 'Education',
    'Space': 'Education',
    'Motivation': 'Personal Development',
    'History': 'Education',
    'Technology': 'Technology',
    'Privacy': 'Technology',
    'College': 'Education',
    'Social Media': 'Technology',
    'Growth Mindset': 'Personal Development',
    'Web Development': 'Technology',
    'Apache Spark': 'Technology',
    'Software Development': 'Technology',
    'Docker': 'Technology',
    'Productivity': 'Personal Development',
    'Science': 'Education',
    'The Reality Project': 'Education',
    'Health': 'Personal Development',
    'Running': 'Personal Development',
    'Politics': 'Education'
}

# Series.map with a dict turns every value missing from the keys into NaN.
# 'Personal Development' is a target category but not a key, so mapping with
# the dict directly would blank those rows if this cell ran a second time.
# Falling back to the tag itself keeps the cell idempotent and leaves any
# tag that is not listed above unchanged instead of silently dropping it.
df['Tag'] = df['Tag'].map(lambda tag: tag_mapping.get(tag, tag))

In [30]:
# Re-count the posts per tag after merging similar tags.
df['Tag'].value_counts()

Tag
Data Science            60
Machine Learning        46
Education               15
Personal Development    14
Programming             11
Technology               9
Name: count, dtype: int64

In [31]:
# Save the processed frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='MediumDataAfterEDA.csv', index=False)