<a href="https://www.kaggle.com/code/ahmedanwar89/udemy-dateset-eda?scriptVersionId=151254432" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import Dataset

In [2]:
# Read the Udemy courses CSV from the Kaggle input directory into `df`.
udemy_csv_path = '/kaggle/input/udemy-courses/udemy_courses.csv'
df = pd.read_csv(udemy_csv_path)

In [3]:
# peek at five randomly chosen rows
df.sample(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
3155,778226,Building HTML5 Canvas projects from scratch,https://www.udemy.com/building-html5-canvas-pr...,True,100,8968,35,25,Beginner Level,2.0,2016-03-03T20:36:35Z,Web Development
1109,1100746,≪虎の巻≫証券取引の「税金のお悩み」解消,https://www.udemy.com/kabucom_zeise/,True,25,3840,296,33,Intermediate Level,1.5,2017-02-21T17:14:03Z,Business Finance
2001,907966,Guitar: Learn 10 CCR Guitar Chord Progressions,https://www.udemy.com/classic-ccr-style/,True,50,2045,8,24,Beginner Level,2.0,2016-08-25T19:22:05Z,Musical Instruments
1932,663810,Piano Runs & Fills #4: C6 Rolling Waves & Wate...,https://www.udemy.com/piano-runs-fills-4-c6-ro...,True,70,834,3,35,All Levels,1.0,2015-11-10T20:59:52Z,Musical Instruments
3318,679158,Build your own Beautiful Blog in WordPress + SEO,https://www.udemy.com/learn-to-create-an-amazi...,True,200,2421,6,28,All Levels,3.0,2015-11-25T18:00:35Z,Web Development


# Data Cleaning

In [4]:
# check the data type of every column
# (the output shows published_timestamp is still a plain object column at this point)
df.dtypes

course_id                int64
course_title            object
url                     object
is_paid                   bool
price                    int64
num_subscribers          int64
num_reviews              int64
num_lectures             int64
level                   object
content_duration       float64
published_timestamp     object
subject                 object
dtype: object

In [5]:
# convert (published_timestamp) from text to datetime
df['published_timestamp'] = df['published_timestamp'].pipe(pd.to_datetime)
# confirm the column now has a datetime dtype
df['published_timestamp'].dtype

datetime64[ns, UTC]

In [6]:
# check null values: per-column "has any null" flag and per-column null count
null_mask = df.isna()
null_mask.any(), null_mask.sum()

(course_id              False
 course_title           False
 url                    False
 is_paid                False
 price                  False
 num_subscribers        False
 num_reviews            False
 num_lectures           False
 level                  False
 content_duration       False
 published_timestamp    False
 subject                False
 dtype: bool,
 course_id              0
 course_title           0
 url                    0
 is_paid                0
 price                  0
 num_subscribers        0
 num_reviews            0
 num_lectures           0
 level                  0
 content_duration       0
 published_timestamp    0
 subject                0
 dtype: int64)

In [7]:
# check duplicated rows: is there any, and how many
duplicate_rows = df.duplicated()
duplicate_rows.any(), duplicate_rows.sum()

(True, 6)

In [8]:
# remove fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [9]:
# verify that no duplicated rows remain
remaining_duplicates = df.duplicated()
remaining_duplicates.any(), remaining_duplicates.sum()

(False, 0)

In [10]:
# check data validity for object columns
# print the distinct values of every object column that has at most
# MAX_UNIQUE_TO_PRINT of them, so typos / inconsistent labels are easy to spot
MAX_UNIQUE_TO_PRINT = 15
for col_name in df.columns:
    column = df[col_name]
    # both operands are single booleans, so use the short-circuiting `and`
    # instead of the element-wise `&` operator
    if column.dtypes == 'object' and column.nunique() <= MAX_UNIQUE_TO_PRINT:
        print(col_name)
        print(column.unique())

level
['All Levels' 'Intermediate Level' 'Beginner Level' 'Expert Level']
subject
['Business Finance' 'Graphic Design' 'Musical Instruments'
 'Web Development']


In [11]:
# check data validity for numerical columns via summary statistics (2 decimals)
df.describe().round(decimals=2)

Unnamed: 0,course_id,price,num_subscribers,num_reviews,num_lectures,content_duration
count,3672.0,3672.0,3672.0,3672.0,3672.0,3672.0
mean,675897.7,66.1,3190.59,156.37,40.14,4.1
std,343071.95,61.04,9488.11,936.18,50.42,6.06
min,8324.0,0.0,0.0,0.0,0.0,0.0
25%,407761.5,20.0,111.75,4.0,15.0,1.0
50%,687692.0,45.0,912.0,18.0,25.0,2.0
75%,960814.0,95.0,2548.75,67.0,46.0,4.5
max,1282064.0,200.0,268923.0,27445.0,779.0,78.5


In [12]:
# drop records where both (num_lectures) and (content_duration) are 0,
# i.e. courses that have no content at all
has_no_content = (df['content_duration'] == 0) & (df['num_lectures'] == 0)
df = df[~has_no_content]

In [13]:
# check that the drop has happened (the minimums should no longer be 0)
df.describe().round(decimals=2)

Unnamed: 0,course_id,price,num_subscribers,num_reviews,num_lectures,content_duration
count,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0
mean,675910.93,66.12,3191.46,156.41,40.15,4.1
std,343117.75,61.04,9489.25,936.3,50.42,6.06
min,8324.0,0.0,0.0,0.0,4.0,0.13
25%,407727.0,20.0,112.0,4.0,15.0,1.0
50%,687742.0,45.0,912.0,18.0,25.0,2.0
75%,960842.0,95.0,2550.5,67.0,46.0,4.5
max,1282064.0,200.0,268923.0,27445.0,779.0,78.5


In [14]:
# check outliers: one box plot per numerical column, side by side,
# each with its own y-axis because the columns have very different scales
numeric_cols = df.select_dtypes(exclude=['object', 'bool', 'datetime64[ns, UTC]']).columns
fig = make_subplots(rows=1, cols=len(numeric_cols), shared_yaxes=False)

# subplot columns are 1-based
for col_pos, col_name in enumerate(numeric_cols, start=1):
    fig.add_trace(go.Box(y=df[col_name], name=col_name), row=1, col=col_pos)

fig.show()

In [15]:
# drop records with (num_lectures) = 779 (the maximum reported by describe())
below_lecture_cap = df['num_lectures'] < 779
df = df[below_lecture_cap]

In [16]:
# check outliers again after removing the 779-lecture record(s):
# one box plot per numerical column, each on an independent y-axis
numeric_cols = df.select_dtypes(exclude=['object', 'bool', 'datetime64[ns, UTC]']).columns
fig = make_subplots(rows=1, cols=len(numeric_cols), shared_yaxes=False)

# subplot columns are 1-based
for col_pos, col_name in enumerate(numeric_cols, start=1):
    fig.add_trace(go.Box(y=df[col_name], name=col_name), row=1, col=col_pos)

fig.show()

In [17]:
# last check: row count, non-null counts and dtypes after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3670 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   course_id            3670 non-null   int64              
 1   course_title         3670 non-null   object             
 2   url                  3670 non-null   object             
 3   is_paid              3670 non-null   bool               
 4   price                3670 non-null   int64              
 5   num_subscribers      3670 non-null   int64              
 6   num_reviews          3670 non-null   int64              
 7   num_lectures         3670 non-null   int64              
 8   level                3670 non-null   object             
 9   content_duration     3670 non-null   float64            
 10  published_timestamp  3670 non-null   datetime64[ns, UTC]
 11  subject              3670 non-null   object             
dtypes: bool(1), datetime64[ns

In [18]:
# there are 3670 entries but the index still runs from 0 to 3677
# (gaps left by the dropped rows), so rebuild a contiguous index
df = df.reset_index(drop=True)

In [19]:
# check again
# now there are 3670 entries from 0 to 3669
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3670 entries, 0 to 3669
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   course_id            3670 non-null   int64              
 1   course_title         3670 non-null   object             
 2   url                  3670 non-null   object             
 3   is_paid              3670 non-null   bool               
 4   price                3670 non-null   int64              
 5   num_subscribers      3670 non-null   int64              
 6   num_reviews          3670 non-null   int64              
 7   num_lectures         3670 non-null   int64              
 8   level                3670 non-null   object             
 9   content_duration     3670 non-null   float64            
 10  published_timestamp  3670 non-null   datetime64[ns, UTC]
 11  subject              3670 non-null   object             
dtypes: bool(1), datetime

# Data Analysis

In [20]:
# peek at five random rows of the cleaned data
df.sample(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
374,754028,Financial Accounting - A Brief Introduction,https://www.udemy.com/accounting-in-less-then-...,False,0,17847,1440,17,Beginner Level,1.0,2016-02-24 14:53:28+00:00,Business Finance
468,327788,Bitcoin - A Comprehensive Guide,https://www.udemy.com/bitcoin-a-comprehensive-...,True,20,283,13,35,All Levels,2.5,2015-01-27 04:10:17+00:00,Business Finance
3326,421018,Learning Object-Oriented JavaScript,https://www.udemy.com/learning-object-oriented...,True,75,533,54,23,Intermediate Level,3.0,2015-02-17 09:53:12+00:00,Web Development
2365,994050,Instant Harmonica - play Dylan's Like a Rollin...,https://www.udemy.com/instant-harmonica-play-d...,True,40,28,2,11,All Levels,1.5,2016-10-27 19:48:23+00:00,Musical Instruments
742,390472,How To Set Up A Limited Company In The UK,https://www.udemy.com/how-to-set-up-a-limited-...,False,0,2075,69,11,Beginner Level,0.733333,2015-04-20 23:31:47+00:00,Business Finance


## ***general insights***  
- There is a very strong relationship between the number of lectures and the content duration.
- There is a strong relationship between the number of subscribers and the number of reviews.

In [21]:
# visualization: heatmap of the pairwise correlation coefficients
# between the numerical columns (non-numeric columns are excluded)
corr_matrix = df.corr(numeric_only=True)
px.imshow(corr_matrix,
          text_auto=True,
          color_continuous_scale='Blues',
          # `aspect` is documented to accept only 'equal', 'auto' or None;
          # the previous value True is not one of them. 'auto' lets the
          # cells stretch to fill the figure instead of forcing squares.
          aspect='auto',
          title='correlation coefficients between numerical columns')

In [22]:
# visualization: number of lectures vs. content duration with an OLS trend
# line, one panel per (level, subject) combination
fig = px.scatter(df,
                 x='num_lectures', y='content_duration',
                 color='subject',
                 facet_row='level', facet_col='subject',
                 trendline='ols',
                 height=950)
fig.show()

In [23]:
# visualization: number of subscribers vs. number of reviews with an OLS
# trend line, one panel per (level, subject) combination
fig = px.scatter(df,
                 x='num_subscribers', y='num_reviews',
                 color='subject',
                 facet_row='level', facet_col='subject',
                 trendline='ols',
                 height=950)
fig.show()

## ***insight 1***  
- Web Development has the highest number of courses, representing 32.6 % of all courses.
- Graphic Design has the lowest number of courses, representing 16.4 % of all courses.

In [24]:
# subject column univariate analysis: number of courses per subject,
# sorted from most to fewest
df['subject'].value_counts()

subject
Web Development        1198
Business Finance       1190
Musical Instruments     680
Graphic Design          602
Name: count, dtype: int64

In [25]:
# visualization: bar chart of the number of courses per subject
subject_counts = df['subject'].value_counts()
fig = px.bar(x=subject_counts.index,
             y=subject_counts.values,
             color=subject_counts.index,
             title='count of courses for each subject')

fig.update_xaxes(title='subject')
fig.update_yaxes(title='# courses')

fig.show()

In [26]:
# visualization: share of courses per subject as a pie chart
subject_counts = df['subject'].value_counts()
fig = px.pie(names=subject_counts.index,
             values=subject_counts.values,
             color=subject_counts.index,
             title='percentage of number of courses for each subject')

fig.show()

## ***insight 2***  
- "All Levels" courses have the highest count in every subject except Musical Instruments, where Beginner Level courses are the most common.
- Business Finance has more courses than Web Development at All Levels and Expert Level, while Web Development has more at Beginner Level and Intermediate Level.
- "All Levels" courses are more than 50 % of the courses in Web Development and Business Finance, and Beginner Level courses are around 30 %. This indicates that customers most often start at these levels, so I will check the number of subscribers for these subjects later.

In [27]:
# subject column with level column bivariate analysis:
# number of courses for every (subject, level) pair
courses_by_subject_level = df.pivot_table(values='course_id',
                                          index='subject',
                                          columns='level',
                                          aggfunc='count')
courses_by_subject_level

level,All Levels,Beginner Level,Expert Level,Intermediate Level
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business Finance,692,339,31,128
Graphic Design,298,242,5,57
Musical Instruments,276,296,7,101
Web Development,658,390,15,135


In [28]:
# visualization: grouped bars of course counts, one group per subject
# and one bar per level
courses_by_subject_level = df.pivot_table(values='course_id',
                                          index='subject',
                                          columns='level',
                                          aggfunc='count')
fig = px.bar(courses_by_subject_level,
             barmode='group',
             title='count of courses for each subject regarding to level')

fig.update_yaxes(title='# courses')

fig.show()

In [29]:
# visualization: one pie per subject showing the share of each level
px.pie(df,
       names='level',
       facet_col='subject',
       title='percentage of number of courses for each subject regarding to level')

## ***insight 3***  
- In absolute numbers, Business Finance has slightly more paid courses (1,094) than Web Development (1,065).
- In absolute numbers, Web Development has more free courses (133) than Business Finance (96).
- Note that free courses are 8.07 % of all Business Finance courses and 11.1 % of all Web Development courses, so Web Development also has the larger share of free courses relative to the number of courses in each subject.

In [30]:
# subject column with is_paid column bivariate analysis:
# number of free (False) and paid (True) courses per subject
courses_by_subject_paid = df.pivot_table(values='course_id',
                                         index='subject',
                                         columns='is_paid',
                                         aggfunc='count')
courses_by_subject_paid

is_paid,False,True
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
Business Finance,96,1094
Graphic Design,35,567
Musical Instruments,46,634
Web Development,133,1065


In [31]:
# visualization: grouped bars of free vs. paid course counts per subject
courses_by_subject_paid = df.pivot_table(values='course_id',
                                         index='subject',
                                         columns='is_paid',
                                         aggfunc='count')
fig = px.bar(courses_by_subject_paid,
             barmode='group',
             title='count of free and paied courses for each subject')

fig.update_yaxes(title='count')

fig.show()

In [32]:
# visualization: one pie per subject showing the free / paid split
px.pie(df,
       names='is_paid',
       facet_col='subject',
       title='percentage of free courses for each subject')

## ***insight 4***  
- Web Development has the highest total number of subscribers, 67.7 % of all subscribers, followed by Business Finance with 16 %.

In [33]:
# subject column with num_subscribers column bivariate analysis:
# total number of subscribers per subject
df.groupby('subject')[['num_subscribers']].sum()

Unnamed: 0_level_0,num_subscribers
subject,Unnamed: 1_level_1
Business Finance,1868711
Graphic Design,1063148
Musical Instruments,846689
Web Development,7919117


In [34]:
# visualization: bar chart of total subscribers per subject
# (the aggregation is computed once and reused for every argument)
subscribers_by_subject = df.groupby('subject').agg({'num_subscribers': 'sum'})
fig = px.bar(data_frame=subscribers_by_subject,
             x=subscribers_by_subject.index,
             y=subscribers_by_subject['num_subscribers'],
             color=subscribers_by_subject.index,
             title='sum of subscribers for each subject')

fig.update_xaxes(title='subject')
fig.update_yaxes(title='sum of subscribers')

fig.show()

In [35]:
# visualization: each subject's share of the total number of subscribers
px.pie(df,
       values='num_subscribers',
       names='subject',
       title='percentage of num of subscribers for each subject')

## ***insight 5***
- "All Levels" and Beginner Level courses in Web Development have the highest total number of subscribers.
- In every subject, "All Levels" courses have the most subscribers, followed by Beginner Level courses; the exception is Musical Instruments, where Beginner Level courses come first.

In [36]:
# subject column with level and num_subscribers columns multivariate analysis:
# total subscribers for every (subject, level) pair
subscribers_by_subject_level = df.pivot_table(values='num_subscribers',
                                              index='subject',
                                              columns='level',
                                              aggfunc='sum')
subscribers_by_subject_level

level,All Levels,Beginner Level,Expert Level,Intermediate Level
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business Finance,1047208,647007,30146,144350
Graphic Design,688332,315757,1008,58051
Musical Instruments,328170,429030,5431,84058
Web Development,4808081,2641879,13611,455546


In [37]:
# visualization: grouped bars of total subscribers, one group per subject
# and one bar per level
subscribers_by_subject_level = df.pivot_table(values='num_subscribers',
                                              index='subject',
                                              columns='level',
                                              aggfunc='sum')
fig = px.bar(subscribers_by_subject_level,
             barmode='group',
             title='sum of subscribers for each subject regarding to level')

fig.update_yaxes(title='sum of subscribers')

fig.show()

In [38]:
# visualization: one pie per subject showing each level's share of subscribers
px.pie(df,
       values='num_subscribers',
       names='level',
       facet_col='subject',
       title='percentage of num of subscribers for each subject regarding to level')

## ***insight 6***
- All price distributions are right-skewed and not uniform.
- Some courses are priced outside the normal price range (outliers).

In [39]:
# summary statistics of the course price across all subjects
df['price'].describe()

count    3670.000000
mean       66.079019
std        61.007770
min         0.000000
25%        20.000000
50%        45.000000
75%        95.000000
max       200.000000
Name: price, dtype: float64

In [40]:
# price distribution per subject as box plots, one row of panels per level
px.box(df,
       x='subject', y='price',
       color='subject',
       facet_row='level',
       boxmode='overlay',
       height=1500)

In [41]:
# price histograms, one panel per (level, subject) combination
px.histogram(df,
             x='price',
             nbins=20,
             color='subject',
             facet_row='level', facet_col='subject',
             height=1000)

In [42]:
# mean / median / max / min price per subject, rounded to 2 decimals
price_stats_by_subject = df.groupby('subject').agg({'price': ['mean', 'median', 'max', 'min']})
price_stats_by_subject.round(decimals=2)

Unnamed: 0_level_0,price,price,price,price
Unnamed: 0_level_1,mean,median,max,min
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Business Finance,68.74,47.5,200,0
Graphic Design,57.89,30.0,200,0
Musical Instruments,49.56,40.0,200,0
Web Development,76.93,50.0,200,0


## ***insight 7***
- Web Development makes more than 623 M in profit, which is 71 % of the total profit.
- In each subject, "All Levels" courses make the highest profit, followed by Beginner Level courses.

In [43]:
# business scenario: add a 'profit' column computed as price x number of
# subscribers. Strictly speaking this is gross revenue at the listed price,
# since no costs or discounts are subtracted.
df['profit'] = df['price'].mul(df['num_subscribers'])

In [44]:
# group df by subject and calculate the sum of profit for each subject
# (subject stays a regular column because as_index=False)
df.groupby('subject', as_index=False)['profit'].sum()

Unnamed: 0,subject,profit
0,Business Finance,123735315
1,Graphic Design,76983170
2,Musical Instruments,53359055
3,Web Development,623963400


In [45]:
# visualization: bar chart of total profit per subject
# (the aggregation is computed once and reused for every argument)
profit_by_subject = df.groupby('subject', as_index=False).agg({'profit': 'sum'})
fig = px.bar(data_frame=profit_by_subject,
             x=profit_by_subject['subject'],
             y=profit_by_subject['profit'],
             color=profit_by_subject['subject'],
             title='sum of profit for each subject')

fig.update_xaxes(title='subject')
fig.update_yaxes(title='sum of profit')

fig.show()

In [46]:
# visualization: each subject's share of the total profit
profit_by_subject = df.groupby('subject', as_index=False).agg({'profit': 'sum'})
px.pie(data_frame=profit_by_subject,
       names=profit_by_subject['subject'],
       values=profit_by_subject['profit'],
       title='percentage of profit of each subject from total profit')

In [47]:
# pivot table: sum of profit for every (subject, level) pair
profit_by_subject_level = df.pivot_table(values='profit',
                                         index='subject',
                                         columns='level',
                                         aggfunc='sum')
profit_by_subject_level

level,All Levels,Beginner Level,Expert Level,Intermediate Level
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business Finance,84084765,27250145,4029835,8370570
Graphic Design,47437070,26554730,25335,2966035
Musical Instruments,38547190,10427590,264175,4120100
Web Development,436724370,151811875,1149725,34277430


In [48]:
# visualization: grouped bars of total profit, one group per subject
# and one bar per level
profit_by_subject_level = df.pivot_table(values='profit',
                                         index='subject',
                                         columns='level',
                                         aggfunc='sum')
px.bar(profit_by_subject_level,
       barmode='group',
       title='sum of profit for each subject regarding to level')

In [49]:
# visualization: one pie per subject showing each level's share of profit
px.pie(df,
       values='profit',
       names='level',
       facet_col='subject',
       title='percentage of sum of profit for each subject regarding to level')

# Time Series Analysis

In [50]:
# peek at two random rows before switching to a datetime index
df.sample(n=2)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit
260,64739,Professional Risk Manager (PRM) Certification:...,https://www.udemy.com/professional-risk-manage...,True,50,116,10,51,All Levels,24.0,2013-07-23 20:18:35+00:00,Business Finance,5800
2344,1196538,Piano Building Blocks: Learn Chord Additions &...,https://www.udemy.com/pbbchordvariations/,False,0,375,0,9,Beginner Level,1.0,2017-07-02 21:55:41+00:00,Musical Instruments,0


In [51]:
# make the 'published_timestamp' column the index so the frame can be
# resampled / grouped by publication date
df = df.set_index('published_timestamp')

In [52]:
# check the change: the index should now be a DatetimeIndex
# named 'published_timestamp'
df.index

DatetimeIndex(['2017-01-18 20:58:58+00:00', '2017-03-09 16:34:20+00:00',
               '2016-12-19 19:26:30+00:00', '2017-05-30 20:07:24+00:00',
               '2016-12-13 14:57:18+00:00', '2014-05-02 15:13:30+00:00',
               '2016-02-21 18:23:12+00:00', '2015-01-30 22:13:03+00:00',
               '2015-05-28 00:14:03+00:00', '2017-04-18 18:13:32+00:00',
               ...
               '2014-08-10 20:19:10+00:00', '2015-11-29 22:02:02+00:00',
               '2014-12-19 21:38:54+00:00', '2015-11-19 17:22:47+00:00',
               '2016-10-10 22:00:32+00:00', '2016-06-14 17:36:46+00:00',
               '2017-03-10 22:24:30+00:00', '2015-12-30 16:41:42+00:00',
               '2016-08-11 19:06:15+00:00', '2014-09-28 19:51:11+00:00'],
              dtype='datetime64[ns, UTC]', name='published_timestamp', length=3670, freq=None)

## ***insight 8***
- Profit increased from 2011 to 2015, then decreased in 2016 and 2017 (years refer to the course publication date; the data ends in 2017).

In [53]:
# calculate the total profit for each year; rows are binned by the datetime
# index (publication timestamp), so each value is the profit of the courses
# published in that year
df[['profit']].resample('y').sum()

Unnamed: 0_level_0,profit
published_timestamp,Unnamed: 1_level_1
2011-12-31 00:00:00+00:00,11643420
2012-12-31 00:00:00+00:00,11773470
2013-12-31 00:00:00+00:00,65771820
2014-12-31 00:00:00+00:00,106939045
2015-12-31 00:00:00+00:00,314510395
2016-12-31 00:00:00+00:00,276633190
2017-12-31 00:00:00+00:00,90769600


In [54]:
# visualization: line chart of total profit per publication year
profit_per_year = df.resample('y').agg({'profit': 'sum'})
fig = px.line(profit_per_year,
              markers=True,
              title='sum of profit by year')

fig.update_xaxes(title='Year')
fig.update_yaxes(title='sum of profit')

fig.show()

## ***insight 9***
- The number of subscribers increased from 2011 to 2015, then decreased in 2016 and 2017, which mirrors the profit trend.

In [55]:
# calculate the total number of subscribers for each year; rows are binned
# by the datetime index (publication timestamp), so each value is the
# subscriber count of the courses published in that year
df[['num_subscribers']].resample('y').sum()

Unnamed: 0_level_0,num_subscribers
published_timestamp,Unnamed: 1_level_1
2011-12-31 00:00:00+00:00,119028
2012-12-31 00:00:00+00:00,555339
2013-12-31 00:00:00+00:00,1661983
2014-12-31 00:00:00+00:00,1930406
2015-12-31 00:00:00+00:00,3475324
2016-12-31 00:00:00+00:00,2966644
2017-12-31 00:00:00+00:00,988941


In [56]:
# visualization: line chart of total subscribers per publication year
subscribers_per_year = df.resample('y').agg({'num_subscribers': 'sum'})
fig = px.line(subscribers_per_year,
              markers=True,
              title='num of subscribers by year')

fig.update_xaxes(title='Year')
fig.update_yaxes(title='num of subscribers')

fig.show()

## ***insight 10***
- Web Development was the only subject that made a profit in 2011.
- Profit decreased in 2016 and 2017 for Business Finance and Web Development; Musical Instruments decreased in 2015, 2016 and 2017; Graphic Design rose slightly in 2016 and only decreased in 2017.

In [57]:
# pivot table: sum of profit for each subject by publication year
# (NaN means no course of that subject was published in that year)
profit_by_year_subject = df.pivot_table(values='profit',
                                        index=df.index.year,
                                        columns='subject',
                                        aggfunc='sum')
profit_by_year_subject

subject,Business Finance,Graphic Design,Musical Instruments,Web Development
published_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,,,,11643420.0
2012,190740.0,1329850.0,766405.0,9486475.0
2013,7298950.0,3085300.0,7479930.0,47907640.0
2014,35870820.0,8364490.0,20899910.0,41803825.0
2015,38702015.0,23273795.0,12363235.0,240171350.0
2016,30727750.0,23538210.0,7458615.0,214908615.0
2017,10945040.0,17391525.0,4390960.0,58042075.0


In [58]:
# visualization: yearly profit as a line chart, one panel per subject
profit_by_year_subject = df.pivot_table(values='profit',
                                        index=df.index.year,
                                        columns='subject',
                                        aggfunc='sum')
fig = px.line(profit_by_year_subject,
              facet_col='subject',
              markers=True,
              title='sum of profit for each subject by year')

fig.update_xaxes(title='Year')

fig.show()

## ***insight 11***
- Web Development was the only subject that had subscribers in 2011.
- In all subjects the number of subscribers decreased in 2016 and 2017; in addition, Graphic Design also decreased in 2013, and Web Development also dropped in 2014.

In [59]:
# pivot table: number of subscribers for each subject by publication year
# (NaN means no course of that subject was published in that year)
subscribers_by_year_subject = df.pivot_table(values='num_subscribers',
                                             index=df.index.year,
                                             columns='subject',
                                             aggfunc='sum')
subscribers_by_year_subject

subject,Business Finance,Graphic Design,Musical Instruments,Web Development
published_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,,,,119028.0
2012,3620.0,100649.0,133635.0,317435.0
2013,311664.0,50133.0,150224.0,1149962.0
2014,494623.0,174582.0,156152.0,1105049.0
2015,501858.0,352856.0,190368.0,2430242.0
2016,426647.0,229587.0,148748.0,2161662.0
2017,130299.0,155341.0,67562.0,635739.0


In [60]:
# visualization: yearly subscribers as a line chart, one panel per subject
subscribers_by_year_subject = df.pivot_table(values='num_subscribers',
                                             index=df.index.year,
                                             columns='subject',
                                             aggfunc='sum')
fig = px.line(subscribers_by_year_subject,
              facet_col='subject',
              markers=True,
              title='num of subscribers for each subject by year')

fig.update_xaxes(title='Year')

fig.show()