In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Types of data analysis
###### 1. Descriptive Analysis
###### 2. Statistical Analysis
###### 3. Diagnostic Analysis
###### 4. Predictive Analysis
###### 5. Prescriptive Analysis
###### 6. Exploratory Data Analysis (EDA)
###### 7. Inferential Analysis

In [29]:
# Load the catalogue of public datasets from the CSV next to this notebook.
datasets_path = 'datasets.csv'
data = pd.read_csv(datasets_path)

In [30]:
# read_csv already returns a DataFrame, so wrapping it in pd.DataFrame() again
# adds nothing. Take an explicit copy instead, so the working frame `df` is
# independent of the freshly loaded `data`.
df = data.copy()
# Preview the first five rows.
df.head()

Unnamed: 0,datasetName,about,link,categoryName,cloud,vintage
0,Microbiome Project,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,GloBI,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,CommonCraw 2012,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,Indiana Webclicks,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,


In [31]:
# Show the last 6 rows to check how the file ends.
df.tail(6)

Unnamed: 0,datasetName,about,link,categoryName,cloud,vintage
55,ClueWeb09 FACC,ClueWeb09 FACC,http://lemurproject.org/clueweb09/FACC1/,Natural Language,,2009.0
56,ClueWeb12 FACC,ClueWeb12 FACC,http://lemurproject.org/clueweb12/FACC1/,Natural Language,,2012.0
57,Google Ngrams,Google Books Ngrams (2.2TB),https://aws.amazon.com/datasets/google-books-n...,Natural Language,Amazon,
58,EDRM Enron,"EDRM Enron EMail of 151 users, hosted on S3",https://aws.amazon.com/datasets/enron-email-data/,Social Networks,Amazon,
59,GetGlue,GetGlue - users rating TV shows,http://getglue-data.s3.amazonaws.com/getglue_s...,Social Networks,,
60,Twitter RepLab,Twitter Data for Online Reputation Management,http://nlp.uned.es/replab2013/,Social Networks,,2013.0


In [32]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info() # info about the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datasetName   61 non-null     object 
 1   about         61 non-null     object 
 2   link          61 non-null     object 
 3   categoryName  61 non-null     object 
 4   cloud         24 non-null     object 
 5   vintage       31 non-null     float64
dtypes: float64(1), object(5)
memory usage: 3.0+ KB


In [33]:
# (number of rows, number of columns) of the frame.
df.shape

(61, 6)

In [34]:
# Data type of each column.
df.dtypes

datasetName      object
about            object
link             object
categoryName     object
cloud            object
vintage         float64
dtype: object

In [35]:
# Drop the short dataset name; the longer `about` description is kept.
# This is not a general "non-numerical columns" cleanup: about, link,
# categoryName and cloud are also object-typed and stay in the frame.
# Reassign rather than use inplace=True so the step reads as a plain transform.
df = df.drop(columns=['datasetName'])

In [36]:
# Display the frame again to confirm `datasetName` is gone.
df

Unnamed: 0,about,link,categoryName,cloud,vintage
0,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,
...,...,...,...,...,...
56,ClueWeb12 FACC,http://lemurproject.org/clueweb12/FACC1/,Natural Language,,2012.0
57,Google Books Ngrams (2.2TB),https://aws.amazon.com/datasets/google-books-n...,Natural Language,Amazon,
58,"EDRM Enron EMail of 151 users, hosted on S3",https://aws.amazon.com/datasets/enron-email-data/,Social Networks,Amazon,
59,GetGlue - users rating TV shows,http://getglue-data.s3.amazonaws.com/getglue_s...,Social Networks,,


In [37]:
# Only numeric columns are summarised by default, which here means just `vintage`.
df.describe() # statistical summary of numerical columns

Unnamed: 0,vintage
count,31.0
mean,2000.677419
std,24.43684
min,1920.0
25%,2006.5
50%,2009.0
75%,2012.0
max,2016.0


In [38]:
# Give the `vintage` column a more descriptive name, then show the result.
df = df.rename(columns={'vintage': 'Year'})
df

Unnamed: 0,about,link,categoryName,cloud,Year
0,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,
...,...,...,...,...,...
56,ClueWeb12 FACC,http://lemurproject.org/clueweb12/FACC1/,Natural Language,,2012.0
57,Google Books Ngrams (2.2TB),https://aws.amazon.com/datasets/google-books-n...,Natural Language,Amazon,
58,"EDRM Enron EMail of 151 users, hosted on S3",https://aws.amazon.com/datasets/enron-email-data/,Social Networks,Amazon,
59,GetGlue - users rating TV shows,http://getglue-data.s3.amazonaws.com/getglue_s...,Social Networks,,


In [39]:
# Collect the rows that repeat an earlier row in every column.
duplicate_rows_df = df[df.duplicated()]
# Report how many such rows exist. The message promises a count, so print the
# number of rows rather than the (rows, columns) shape tuple.
print("number of duplicate rows: ", len(duplicate_rows_df))

number of duplicate rows:  (0, 5)


In [40]:
# No duplicate rows were found; otherwise they would have been removed with:
#     df = df.drop_duplicates()
# Count the missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

about            0
link             0
categoryName     0
cloud           37
Year            30
dtype: int64


In [41]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): cloud has 37 and Year has 30 missing values out of 61 rows,
# so this leaves just 2 rows (see the output below). Confirm that discarding
# almost the whole dataset is intended.
df = df.dropna()
df

Unnamed: 0,about,link,categoryName,cloud,Year
32,FBI Hate Crime 2013 - aggregated data,https://github.com/emorisse/FBI-Hate-Crime-Sta...,Social Sciences,GitHub,2013.0
41,NYC Uber trip data April 2014 to September 2014,https://github.com/fivethirtyeight/uber-tlc-fo...,Transportation,GitHub,2014.0


In [42]:
# Switch to the second dataset: TV shows across streaming platforms.
# `data` is rebound here and no longer refers to the datasets catalogue.
tv_shows_path = 'TV_Shows.csv'
data = pd.read_csv(tv_shows_path)

In [43]:
# read_csv already returns a DataFrame, so wrapping it in pd.DataFrame() again
# adds nothing. Take an explicit copy instead, so the working frame `df` is
# independent of the freshly loaded `data`.
df = data.copy()
# Preview the first five rows.
# NOTE(review): the `Unnamed: 0` column looks like a row index saved into the
# CSV (it runs 0, 1, 2, ... alongside the frame's own index) - confirm.
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1


In [44]:
# Summarise the TV-shows frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5611 entries, 0 to 5610
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       5611 non-null   int64  
 1   Title            5611 non-null   object 
 2   Year             5611 non-null   int64  
 3   Age              3165 non-null   object 
 4   IMDb             4450 non-null   float64
 5   Rotten Tomatoes  1010 non-null   object 
 6   Netflix          5611 non-null   int64  
 7   Hulu             5611 non-null   int64  
 8   Prime Video      5611 non-null   int64  
 9   Disney+          5611 non-null   int64  
 10  type             5611 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 482.3+ KB


In [45]:
# Summary statistics for the numeric columns.
# In the recorded output `type` is constant (min = max = 1, std = 0).
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,IMDb,Netflix,Hulu,Prime Video,Disney+,type
count,5611.0,5611.0,4450.0,5611.0,5611.0,5611.0,5611.0,5611.0
mean,2805.0,2011.02103,7.113258,0.344145,0.3126,0.382107,0.03208,1.0
std,1619.900511,11.005116,1.13206,0.475131,0.463594,0.485946,0.176228,0.0
min,0.0,1901.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,1402.5,2010.0,6.6,0.0,0.0,0.0,0.0,1.0
50%,2805.0,2015.0,7.3,0.0,0.0,0.0,0.0,1.0
75%,4207.5,2017.0,7.9,1.0,1.0,1.0,0.0,1.0
max,5610.0,2020.0,9.6,1.0,1.0,1.0,1.0,1.0


In [46]:
# Collect the rows that repeat an earlier row in every column.
duplicate_rows_df = df[df.duplicated()]
# Report how many such rows exist. The message promises a count, so print the
# number of rows rather than the (rows, columns) shape tuple.
print("number of duplicate rows: ", len(duplicate_rows_df))

number of duplicate rows:  (0, 11)


In [47]:
# Keep only the shows that have a value in every column.
# NOTE(review): Rotten Tomatoes is populated for only 1010 of the 5611 rows
# (Age for 3165, IMDb for 4450), so most rows are discarded here. Confirm
# that this is intended.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
5509,5509,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,1
5517,5517,Encore!,2019,7+,7.4,68%,0,0,0,1,1
5522,5522,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1
5530,5530,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,1


In [48]:
# Re-inspect the frame after dropping incomplete rows: check the remaining
# row count and that every column is now fully non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, 0 to 5575
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       930 non-null    int64  
 1   Title            930 non-null    object 
 2   Year             930 non-null    int64  
 3   Age              930 non-null    object 
 4   IMDb             930 non-null    float64
 5   Rotten Tomatoes  930 non-null    object 
 6   Netflix          930 non-null    int64  
 7   Hulu             930 non-null    int64  
 8   Prime Video      930 non-null    int64  
 9   Disney+          930 non-null    int64  
 10  type             930 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 87.2+ KB


In [49]:
# The `Age` column holds ratings such as "18+" and "7+", so call it `Rated`.
df = df.rename(columns={'Age': 'Rated'})

In [50]:
# Display the frame to confirm the `Age` column now appears as `Rated`.
df

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
5509,5509,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,1
5517,5517,Encore!,2019,7+,7.4,68%,0,0,0,1,1
5522,5522,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1
5530,5530,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,1


In [51]:
# Give the unnamed column read from the CSV a readable name.
df = df.rename(columns={'Unnamed: 0': 'S.No'})

In [52]:
# Display the frame to confirm the `Unnamed: 0` column now appears as `S.No`.
df

Unnamed: 0,S.No,Title,Year,Rated,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
5509,5509,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,1
5517,5517,Encore!,2019,7+,7.4,68%,0,0,0,1,1
5522,5522,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1
5530,5530,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,1


In [53]:
# Final check of column names, non-null counts and dtypes after the renames.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, 0 to 5575
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   S.No             930 non-null    int64  
 1   Title            930 non-null    object 
 2   Year             930 non-null    int64  
 3   Rated            930 non-null    object 
 4   IMDb             930 non-null    float64
 5   Rotten Tomatoes  930 non-null    object 
 6   Netflix          930 non-null    int64  
 7   Hulu             930 non-null    int64  
 8   Prime Video      930 non-null    int64  
 9   Disney+          930 non-null    int64  
 10  type             930 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 87.2+ KB
