In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [3]:
#reading all fo the csv files


pd.set_option('display.max_rows',None)
data_sci = pd.read_csv("results_data_science.csv")
ios_dev = pd.read_csv("results_ios_dev.csv")
android_dev = pd.read_csv("results_andoid_dev.csv")
crypto = pd.read_csv("results_crypto.csv")
ml = pd.read_csv("results_ml.csv")
prog = pd.read_csv("results_programming.csv")
web_dev = pd.read_csv("results_webdev.csv")





In [4]:
# creating copies of the files so that i can carry out the cleaning process on the copied file 
# and not alter the original data file

data_sci1 = data_sci
ios_dev1 = ios_dev
android_dev1 = android_dev
crypto1 = crypto
ml1 = ml
prog1 = prog
web_dev1 = web_dev



In [5]:
# all the files have a missing column of "tag" which indicates on what category the blog was written in
# since all the tables will be merged together there is the need of separate "tag" column specifying the category
# in each file

# lets do that!

data_sci1["tag"] = "Data Science"
ios_dev1["tag"] = "Ios Development"
android_dev1["tag"] = "Android Development"
crypto1["tag"] = "Crytography"
ml1["tag"] = "Machine Learning"
prog1["tag"] = "Programming"
web_dev1["tag"] = "Web Development"


In [6]:
web_dev1.head()

Unnamed: 0,Statement,Content,Publication,Read_Time,Date,Year,Claps,comments,tag
0,Web development explained to a time traveler f...,,We’ve moved to freeCodeCamp.org/news,10 min read,"Oct 16,",2017.0,30K,91 responses,Web Development
1,The Ultimate Guide to Learning Full Stack Web ...,,codeburst,7 min read,"Oct 16,",2017.0,29K,82 responses,Web Development
2,Full-Stack Web Development — the Complete Roadmap,,HackerNoon.com,9 min read,"Aug 18,",2017.0,11.1K,39 responses,Web Development
3,Increase your web development skill-set: 150 a...,,Dev Channel,3 min read,"Aug 17,",2017.0,12.8K,21 responses,Web Development
4,Micro frontends—a microservice approach to fro...,,,,"Jul 6,",2017.0,6.9K,49 responses,Web Development


In [7]:
#lets merge all the tables together in one single table and then create a copy of the merged table.

blogs = pd.concat([data_sci1, ios_dev1, android_dev1, crypto1, ml1, prog1, web_dev1], axis = 0)
pd.set_option("max_rows", None)





In [8]:
blogs.shape

(6893, 9)

In [9]:
blogs.head()

Unnamed: 0,Statement,Content,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,"Over the last year, I taught myself data scien...",Towards Data Science,9 min read,"Sep 15,",2018.0,22K,75 responses,Data Science
1,How to Build a Data Science Portfolio,How do you get a job in data science? Knowing ...,Towards Data Science,18 min read,"Jul 8,",2018.0,18.9K,92 responses,Data Science
2,The best Mario Kart character according to dat...,,The Civis Journal,6 min read,"Jun 8,",2018.0,19.4K,76 responses,Data Science
3,Essential Math for Data Science,,Towards Data Science,8 min read,"Aug 8,",2018.0,14K,25 responses,Data Science
4,"If you want to learn Data Science, start with ...",,We’ve moved to freeCodeCamp.org/news,14 min read,"Sep 26,",2016.0,7.3K,56 responses,Data Science


In [10]:
blogs.to_csv("blogs.csv", index = False)

In [11]:
data = pd.read_csv("blogs.csv")
data.shape

dataset = data

In [12]:
# In Data Analysis, we will analyse to find out the below stuff;

#  1. Missing values
#  2. All the numerical values
#  3. datatype of all of the features
#  4. Categorical variables





In [13]:
# lets find the percentages of missing values we have in this dataset

nan_values = [ feature for feature in dataset.columns if dataset[feature].isnull().sum() > 1 ]

for feature in nan_values:
    
    print(feature, np.round(dataset[feature].isnull().mean(), 4))

Statement 0.0216
Content 0.8526
Publication 0.2147
Read_Time 0.0065
Year 0.1594
Claps 0.0432
comments 0.1905


In [14]:
# The highest no. of missing values almost 85% of the total data is missing in "content" column.Moreover 
# I do not need the content column for any kind of questions or analysis so i can drop them.

dataset = dataset.drop(columns = ["Content"])

In [15]:
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9 min read,"Sep 15,",2018.0,22K,75 responses,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18 min read,"Jul 8,",2018.0,18.9K,92 responses,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6 min read,"Jun 8,",2018.0,19.4K,76 responses,Data Science
3,Essential Math for Data Science,Towards Data Science,8 min read,"Aug 8,",2018.0,14K,25 responses,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14 min read,"Sep 26,",2016.0,7.3K,56 responses,Data Science


In [17]:
# lets check if we have any numerical features

numerical_vals = [ feature for feature in dataset.columns if dataset[feature].dtype != 'O']

print("The no of numerical values are:", len(numerical_vals))

The no of numerical values are: 1


In [18]:
dataset[numerical_vals].head()

Unnamed: 0,Year
0,2018.0
1,2018.0
2,2018.0
3,2018.0
4,2016.0


In [16]:
# lets check the unique years which are present

print(dataset["Year"].unique())

[2018. 2016. 2017. 2020. 2019. 2015.   nan 2014. 2011. 2012. 2013. 2010.]


In [17]:
# During web scraping of the data, the blogs of the year 2021 did not have any "year" tag which came out to "NAN"
# in the table so we are going to replace the "NaN" values within the "Year" feature as 2021.

dataset["Year"].fillna("2021", inplace = True)
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9 min read,"Sep 15,",2018.0,22K,75 responses,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18 min read,"Jul 8,",2018.0,18.9K,92 responses,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6 min read,"Jun 8,",2018.0,19.4K,76 responses,Data Science
3,Essential Math for Data Science,Towards Data Science,8 min read,"Aug 8,",2018.0,14K,25 responses,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14 min read,"Sep 26,",2016.0,7.3K,56 responses,Data Science


In [18]:
# now the year feature also has years as 2020.0, 2019.0 etc. 
# so lets clean it by changing the datatype from float to int64

dataset["Year"] = dataset["Year"].astype("int64")

dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9 min read,"Sep 15,",2018,22K,75 responses,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18 min read,"Jul 8,",2018,18.9K,92 responses,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6 min read,"Jun 8,",2018,19.4K,76 responses,Data Science
3,Essential Math for Data Science,Towards Data Science,8 min read,"Aug 8,",2018,14K,25 responses,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14 min read,"Sep 26,",2016,7.3K,56 responses,Data Science


In [19]:
#filling the NaN values of comments with "0 responses"

dataset["comments"].fillna("0 responses", inplace = True)

In [20]:
#i want to remove all the "response" or "responses" string from "comments" column

comm = [comment for comment in dataset["comments"]]
    


In [21]:
#we will use regex to find all such occurences of response or responses and remove them

a = []
for comment in comm:
    comment_no = re.sub("[response|resposes]","",str(comment)) 
    
    #since it is of type object so type cast it to str, otherwise it will throw an error
    a.append(comment_no)
    
#print(a)

#replacing the new values in the dataset comments
dataset["comments"] = a

In [22]:
# Now we will remove all the extra character from the comments column like "," etc

comm = dataset["comments"]
lst = []
for i in comm:
    no = i.replace(',','')
    #print(no)
    lst.append(no)
    
dataset["comments"] = lst

In [23]:
# lets deal with read_time column now

#first lets check the missing values in read_time

dataset[dataset["Read_Time"].isnull()]


Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
594,,,,"Mar 30,",2018,614,5,Data Science
1014,,,,"Sep 4,",2020,1,0,Ios Development
1015,,,,"Jun 12,",2018,,0,Ios Development
1093,,,,"Aug 2,",2018,22,1,Ios Development
1097,,,,"Oct 31,",2016,1,1,Ios Development
1098,,,,"Aug 20,",2019,4,1,Ios Development
1102,,,,"May 2,",2017,,1,Ios Development
1110,,,,Mar 19,2021,,0,Ios Development
1114,,,,Jul 28,2021,,0,Ios Development
1258,,,,"Apr 18,",2020,,1,Ios Development


In [24]:
#The statements,publication,readtime, as well as many information where Read Time column has a NaN value are not present.
# Also checked with the medium website
# so we will drop them except the 5 rows which have some information

dataset = dataset.drop([594, 1014, 1015, 1093, 1097, 1098, 1102, 1110, 1114, 1258, 1263, 2053, 2451, 2472, 3263, 5186, 5531, 5587, 5656, 6170, 6172, 6185, 6196, 6197, 6200, 6207, 6209, 6226, 6230, 6231, 6232, 6234, 6236, 6237, 6238, 6239, 6240, 6593, 6618, 6795])

In [25]:
dataset["Read_Time"].isnull().sum()

5

In [27]:
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9 min read,"Sep 15,",2018,22K,75,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18 min read,"Jul 8,",2018,18.9K,92,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6 min read,"Jun 8,",2018,19.4K,76,Data Science
3,Essential Math for Data Science,Towards Data Science,8 min read,"Aug 8,",2018,14K,25,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14 min read,"Sep 26,",2016,7.3K,56,Data Science


In [28]:
#lets change the data type of comments from object to numerical

dataset["comments"] = dataset["comments"].astype("int")

In [29]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6853 entries, 0 to 6892
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Statement    6744 non-null   object
 1   Publication  5413 non-null   object
 2   Read_Time    6848 non-null   object
 3   Date         6853 non-null   object
 4   Year         6853 non-null   int64 
 5   Claps        6575 non-null   object
 6   comments     6853 non-null   int64 
 7   tag          6853 non-null   object
dtypes: int64(2), object(6)
memory usage: 481.9+ KB


In [30]:
# reset the index because we removed many rows for no information of readtime,statement,publication.

dataset.reset_index(inplace= True, drop = True)

In [31]:
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9 min read,"Sep 15,",2018,22K,75,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18 min read,"Jul 8,",2018,18.9K,92,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6 min read,"Jun 8,",2018,19.4K,76,Data Science
3,Essential Math for Data Science,Towards Data Science,8 min read,"Aug 8,",2018,14K,25,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14 min read,"Sep 26,",2016,7.3K,56,Data Science


In [32]:
# lets remove the pattern "min read" from "Read_Time" column and convert the column to numerical type

dataset["Read_Time"].fillna("0 min read", inplace= True)
read = [i for i in dataset["Read_Time"]]

#print(read)


a = []
for readtime in read:
    
    r_t = re.sub("[\smin\sread]","",str(readtime))
    a.append(r_t)
    
dataset["Read_Time"] = a
dataset["Read_Time"] = dataset["Read_Time"].astype("int")
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9,"Sep 15,",2018,22K,75,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18,"Jul 8,",2018,18.9K,92,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6,"Jun 8,",2018,19.4K,76,Data Science
3,Essential Math for Data Science,Towards Data Science,8,"Aug 8,",2018,14K,25,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14,"Sep 26,",2016,7.3K,56,Data Science


In [33]:
#check for nan values in claps

dataset["Claps"].isnull().sum()

278

In [34]:
#replacing all the NAN values with 0 claps since the NaN came where the article received no claps

dataset["Claps"].fillna("0", inplace=True)
dataset["Claps"].head()

0      22K
1    18.9K
2    19.4K
3      14K
4     7.3K
Name: Claps, dtype: object

In [35]:
# lets clean the claps column and change all the no. like 22K,18.9K etc to 22000,18900 etc

clap_no = [i for i in dataset["Claps"]]



In [36]:
 

a = []

for clap in clap_no:
    
    if re.search("[K]", str(clap)):
        no_only = re.sub("[K]","",str(clap))
        no_only = float(no_only) * 1000
        a.append(no_only)
    else:
        a.append(clap)


In [37]:
#lets replace the values in the dataset["Claps"] column

dataset["Claps"] = a

In [38]:
#lest change the datatype of th claps column

dataset["Claps"]=dataset["Claps"].astype("int")

In [39]:
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Date,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9,"Sep 15,",2018,22000,75,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18,"Jul 8,",2018,18900,92,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6,"Jun 8,",2018,19400,76,Data Science
3,Essential Math for Data Science,Towards Data Science,8,"Aug 8,",2018,14000,25,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14,"Sep 26,",2016,7300,56,Data Science


In [40]:
# For my analysis year feature is more relevant to me than the date so i ll drop the date column

dataset.drop(columns=["Date"], inplace=True)

In [41]:
dataset.head()

Unnamed: 0,Statement,Publication,Read_Time,Year,Claps,comments,tag
0,How To Learn Data Science If You’re Broke,Towards Data Science,9,2018,22000,75,Data Science
1,How to Build a Data Science Portfolio,Towards Data Science,18,2018,18900,92,Data Science
2,The best Mario Kart character according to dat...,The Civis Journal,6,2018,19400,76,Data Science
3,Essential Math for Data Science,Towards Data Science,8,2018,14000,25,Data Science
4,"If you want to learn Data Science, start with ...",We’ve moved to freeCodeCamp.org/news,14,2016,7300,56,Data Science


In [42]:
# i have to clean the statement column as well and drop those rows where there is 
# no statement as well as publication of the blog


indices = dataset[((dataset["Statement"].isnull()) & (dataset["Publication"].isnull()))].index.to_list

In [43]:
indices

<bound method IndexOpsMixin.tolist of Int64Index([  20,  893, 1051, 1087, 1092, 1094, 1099, 1192, 1206, 1226, 1237,
            1239, 1242, 1250, 1251, 1483, 1484, 2042, 2123, 2143, 2144, 2438,
            2443, 2521, 2991, 3086, 3114, 3414, 3468, 3597, 3690, 3926, 4271,
            4320, 4532, 5160, 5264, 5300, 5489, 5503, 5540, 5741, 5946, 5958,
            5995, 6104, 6129, 6150, 6163, 6180, 6182, 6190, 6193, 6194, 6296,
            6303, 6328, 6339, 6395, 6422, 6432, 6486, 6498, 6534, 6564, 6565,
            6568, 6569, 6570, 6593, 6601, 6603, 6604, 6606, 6667, 6774, 6810],
           dtype='int64')>

In [44]:
dataset.drop([20,  893, 1051, 1087, 1092, 1094, 1099, 1192, 1206, 1226, 1237,
            1239, 1242, 1250, 1251, 1483, 1484, 2042, 2123, 2143, 2144, 2438,
            2443, 2521, 2991, 3086, 3114, 3414, 3468, 3597, 3690, 3926, 4271,
            4320, 4532, 5160, 5264, 5300, 5489, 5503, 5540, 5741, 5946, 5958,
            5995, 6104, 6129, 6150, 6163, 6180, 6182, 6190, 6193, 6194, 6296,
            6303, 6328, 6339, 6395, 6422, 6432, 6486, 6498, 6534, 6564, 6565,
            6568, 6569, 6570, 6593, 6601, 6603, 6604, 6606, 6667, 6774, 6810], inplace=True)

In [45]:
# lets fill up the rest of the NaN values in statement and publication with some dummy values beacuse they contain 
# some considerable amounts of claps as well as comments so we cant remove those rows .We need to analyse them

dataset["Statement"].fillna("abc", inplace=True)

In [46]:
dataset["Publication"].fillna("bcd", inplace= True)

In [47]:
#resetiing indexes since rows has been dropped 
dataset.reset_index(inplace= True, drop = True)

In [48]:
# finally the data is cleaned!
# lets save the cleaned file to csv

dataset.to_csv("cleaned_blogs_data.csv", index=False)

In [49]:
df = pd.read_csv("cleaned_blogs_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6776 entries, 0 to 6775
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Statement    6776 non-null   object
 1   Publication  6776 non-null   object
 2   Read_Time    6776 non-null   int64 
 3   Year         6776 non-null   int64 
 4   Claps        6776 non-null   int64 
 5   comments     6776 non-null   int64 
 6   tag          6776 non-null   object
dtypes: int64(4), object(3)
memory usage: 370.7+ KB
