# Analysing my Netflix data

## 1. Load Viewing activity associated with my Netflix account
-  Import ViewingActivity.csv as a pandas dataframe
-  Filter data associated with my profile 'Priyanka'
#### Result: my_netflix_raw

In [1]:
import pandas as pd
all_netflix = pd.read_csv('f:\\Python_projects\\netflix_data\\source_data\\ViewingActivity.csv')
my_netflix_raw = all_netflix[all_netflix['Profile Name']=='Priyanka']

## 2. Cleaning data
#### Result: my_shows and my_movies


### 2.1 Initial clean
-  Remove supplemental video types
-  Removing rows containing NA values
-  Only keep Start Time, Duration, Title and Country columns
-  Convert duration to minutes
#### Result: my_netflix

In [13]:
#Remove other supplemental video types
my_netflix_video = my_netflix_raw[my_netflix_raw['Supplemental Video Type'].isna()]
#Remove data not from India and UK
my_netflix_video = my_netflix_video[~((my_netflix_video['Country'] == 'US (United States)') | (my_netflix_video['Country'] == 'CA (Canada)'))]

#Only select columns I need
my_netflix = my_netflix_video[['Start Time', 'Duration', 'Title', 'Country']].copy()

In [14]:
#Define function to convert duration to minutes
def duration_to_hr (dur_string):
    s = dur_string.split(':')
    hr = int(s[0])
    mn = int(s[1])
    sc = int(s[2])
    tot = round((hr + mn/60 + sc/3600),2)
    return tot
#Apply function to Duration column
my_netflix['duration'] = my_netflix['Duration'].apply(duration_to_hr)
#Remove old Duration column
my_netflix= my_netflix.drop('Duration', axis=1)

### 2.2 Create new dataframes for movies and TV shows
-  TV show titles appear to contain two semicolons (;) to indicate show: season: episode
-  Movie titles appear to contain up to one semicolon
#### Result: my_shows and my_movies

#### 2.2.1 TV Shows DataFrame

In [15]:
# Define a regular expression pattern to match strings with two ':'
pattern = r'.*?:.*?:.*'

# Extract rows where 'Title column contains two ':' = shows
my_shows=pd.DataFrame()
my_shows = my_netflix[my_netflix['Title'].str.contains(pattern, na=False)]

#Add a new column for show names

def remove_colon(name):
    # Function to remove string after the first semi-colon
    result = name.split(':',1)[0].strip()
    return result
my_shows['Show name'] = my_shows['Title'].apply(remove_colon)
my_shows.head(2)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Start Time,Title,Country,duration,Show name
16696,2023-10-30 17:59:30,The Disastrous Life of Saiki K.: Season 1: Epi...,GB (United Kingdom),0.37,The Disastrous Life of Saiki K.
16697,2023-10-27 10:42:07,The Disastrous Life of Saiki K.: Season 1: Epi...,GB (United Kingdom),0.35,The Disastrous Life of Saiki K.


#### 2.2.2 Movies DataFrame

In [16]:
# Extract rows where 'Title' column does not contain two ':' = movies
my_movies=pd.DataFrame()
my_movies = my_netflix[~my_netflix['Title'].str.contains(pattern, na=False)]
my_movies.head(2)


Unnamed: 0,Start Time,Title,Country,duration
16727,2023-09-01 11:10:01,Flavors of Youth: International Version,GB (United Kingdom),0.31
16728,2023-08-31 17:52:10,Flavors of Youth: International Version,GB (United Kingdom),0.46


## 3. Plot most viewed TV show and Movie

### 3.1 Plot most watched TV Shows

In [103]:
import plotly.express as px

#### 3.1.1 Get a dataset of top 10 shows

In [53]:
#Group shows by show name, sort in descending order and convert to dataframe
my_shows_view  = my_shows.groupby('Show name').sum().sort_values(by='duration', ascending= False).reset_index()
#Keep only relevant columns
my_shows_view = my_shows_view[['Show name', 'duration']]

#Get a dataframe of just the first 10 most watched shows
top_10_shows = my_shows_view.iloc[0:10,].sort_values(by='duration', ascending=True)
#somehow, round has to be applied again
top_10_shows['duration']= round(top_10_shows['duration'],2)



#### 3.1.2 Plot a bar graph

In [104]:
fig = px.bar(top_10_shows,  x ='duration', y= 'Show name' ,title= 'My top 10 TV shows',
             color = 'duration' , color_continuous_scale=['white','purple','black'], color_continuous_midpoint=67,
             text='duration')

#Axis titles
fig.update_layout(
    xaxis_title="No. of hours watched",
    yaxis_title="TV Show",
)

#remove colorbar
fig.update_coloraxes(showscale=False)

fig.show()

### 3.2 Plot most watched movies

#### 3.2.1 Get a dataset of top 10 movies 

In [85]:
#Group shows by show name, sort in descending order and convert to dataframe
my_movies_view  = my_movies.groupby('Title').sum().sort_values(by='duration', ascending= False).reset_index()
#Keep only relevant columns
my_movies_view = my_movies_view[['Title', 'duration']]

#Get a dataframe of just the first 10 most watched shows
top_10_movies = my_movies_view.iloc[0:10,].sort_values(by='duration', ascending=True)
#somehow, round has to be applied again
top_10_movies ['duration']= round(top_10_movies['duration'],2)

#### Plot a bar graph

In [92]:
fig = px.bar(top_10_movies,  x ='duration', y= 'Title' ,title= 'My top 10 Movies',
             color = 'duration' , color_continuous_scale=['white','purple'], 
             text='duration')

#Axis titles
fig.update_layout(
    xaxis_title="No. of hours watched",
    yaxis_title="Movies",
)

#remove colorbar
fig.update_coloraxes(showscale=False)

fig.show()

# in progress