# Analysing my Netflix data

## 1. Load Viewing activity associated with my Netflix account
-  Import ViewingActivity.csv as a pandas dataframe
-  Filter data associated with my profile 'Priyanka'
#### Result: my_netflix_raw

In [1]:
import pandas as pd
all_netflix = pd.read_csv('f:\\Python_projects\\netflix_data\\source_data\\ViewingActivity.csv')
#Only take rows corresponding to my profile
my_netflix_raw = all_netflix[all_netflix['Profile Name']=='Priyanka']
#remove dataframew with all data
%reset_selective -f all_netflix

## 2. Cleaning data
#### Result: my_netflix


### 2.1 Initial clean
-  Remove supplemental video types + remove column
-  Only keep viewing activity from India and UK + simplify to IN or GB
-  Only keep Start Time, Duration, Title and Country columns
-  Convert duration to hours + remove old Duration column
-  Removing rows containing NA values
-  Remove variables not needed further
#### Result: my_netflix_clean

In [3]:
#Remove other supplemental video types (trailers, recaps, ...): real viewing rows have no value in this column
my_netflix_video = my_netflix_raw[my_netflix_raw['Supplemental Video Type'].isna()]
my_netflix_video = my_netflix_video.drop(labels='Supplemental Video Type', axis=1)

#Remove data not from India and UK
#.copy() makes the filtered result an independent dataframe, so the column
#assignment below does not raise pandas' SettingWithCopyWarning
countries_to_keep = ['IN (India)', 'GB (United Kingdom)']
my_netflix_video = my_netflix_video[my_netflix_video['Country'].isin(countries_to_keep)].copy()
#Simplify the country label to its two-letter code (IN or GB)
my_netflix_video['Country'] = my_netflix_video['Country'].str[:2]

#Only select columns I need
my_netflix_clean = my_netflix_video[['Start Time', 'Duration', 'Title', 'Country']].copy()

#Define function to convert duration to hours
def duration_to_hr (dur_string):
    """Convert a duration string 'H:M:S' into hours, rounded to 2 decimals.

    Parameters
    ----------
    dur_string : str
        Duration with colon-separated hours, minutes and seconds,
        e.g. '00:22:12'.

    Returns
    -------
    float
        The duration in hours rounded to two decimal places, or NaN when
        dur_string is missing (None / NaN).

    Raises
    ------
    ValueError
        If one of the first three parts is not an integer.
    IndexError
        If the string has fewer than three colon-separated parts.
    """
    #Missing values arrive as None/NaN rather than strings and have no .split().
    #Return NaN so the later dropna() can remove the row instead of crashing here.
    if pd.isna(dur_string):
        return float('nan')
    s = dur_string.split(':')
    hr = int(s[0])
    mn = int(s[1])
    sc = int(s[2])
    return round(hr + mn/60 + sc/3600, 2)

#Apply function to Duration column
my_netflix_clean['duration'] = my_netflix_clean['Duration'].apply(duration_to_hr)
#Remove old Duration column
my_netflix_clean= my_netflix_clean.drop('Duration', axis=1)

#Remove rows with NA values
my_netflix_clean = my_netflix_clean.dropna()

#Remove non-required variables and functions
%reset_selective -f my_netflix_video
%reset_selective -f duration_to_hr

my_netflix_clean.head(2)

Unnamed: 0,Start Time,Title,Country,duration
16696,2023-10-30 17:59:30,The Disastrous Life of Saiki K.: Season 1: Epi...,GB,0.37
16697,2023-10-27 10:42:07,The Disastrous Life of Saiki K.: Season 1: Epi...,GB,0.35


### 2.2 TV Shows and Movies categorization
-  Define functions to categorize TV show or movie
-  Define function to extract TV show name
-  New column to categorize TV shows and Movies
-  New column with movie titles + TV show names
-  Remove old Title column
#### Basis: 
-  TV show titles appear to contain two colons (:) to indicate show: season: episode
-  Movie titles appear to contain up to one colon
#### Result: my_netflix

In [5]:
#Create a copy of the cleaned dataframe to work with, so the columns added below do not touch my_netflix_clean
my_netflix = my_netflix_clean.copy()

#Define function to classify as movie or TV show
def tv_or_movie(name):
    """Classify a title as 'tv show' or 'movie' by counting its colons.

    A title with two or more colons (show: season: episode) is treated as
    a TV show; anything with fewer colons is treated as a movie.
    """
    return 'tv show' if name.count(':') >= 2 else 'movie'

#Function to get show name
def get_show_name(name):
    """Return the show name for a TV show title, or the title itself for a movie.

    For titles classified as 'tv show' by tv_or_movie, the text before the
    first colon is returned with surrounding whitespace stripped. Any other
    title is returned unchanged.
    """
    #Movies keep their full title
    if tv_or_movie(name) != 'tv show':
        return name
    #Everything before the first colon is the show name
    show_part, _, _ = name.partition(':')
    return show_part.strip()

#Use functions to create new columns
##Column to categorize as TV show or movie
my_netflix['category'] = my_netflix['Title'].apply(tv_or_movie)

##Column to save TV show and movies name
my_netflix['name'] = my_netflix['Title'].apply(get_show_name)

#Remove Title column
my_netflix = my_netflix.drop(labels= 'Title', axis=1)

#Remove non-required functions
%reset_selective -f tv_or_movie
%reset_selective -f get_show_name

my_netflix.head(2)


Unnamed: 0,Start Time,Country,duration,category,name
16696,2023-10-30 17:59:30,GB,0.37,tv show,The Disastrous Life of Saiki K.
16697,2023-10-27 10:42:07,GB,0.35,tv show,The Disastrous Life of Saiki K.


## 3. Data Visualization

In [7]:
import plotly.express as px

### 3.1 Bar graph showing most watched TV Shows and Movies
#### Define a function that: 
-  Gets dataframe of just the TV shows or movies
-  Groups by show name + sorts in descending order of total watch duration
-  Plots a bar graph

In [20]:
def top_10(my_df, col, type):
    """Show a horizontal bar chart of the 10 most-watched names in one category.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Viewing data; must contain the columns `col`, 'name' and 'duration'.
    col : str
        Name of the column that holds the category labels.
    type : str
        Category value to keep, e.g. 'movie' or 'tv show'.
        (The name shadows the builtin `type`; it is kept so existing
        keyword calls keep working.)

    Returns
    -------
    None
        The figure is displayed with fig.show().
    """
    #Label used in the chart title and on the y-axis, so a TV show chart is not labelled "Movies".
    #Unknown categories fall back to the raw value.
    type_labels = {'movie': 'Movies', 'tv show': 'TV Shows'}
    type_label = type_labels.get(type, str(type))

    #Get dataframe of just type
    my_type = my_df[my_df[col]==type]

    #Total watch time per name, sorted in descending order and converted to a dataframe.
    #Only 'duration' is summed: summing the whole frame would also "add up" the text columns.
    my_type = (
        my_type.groupby('name')['duration']
        .sum()
        .sort_values(ascending=False)
        .reset_index()
    )

    #Get the 10 most watched names, re-sorted ascending (order kept from the original code)
    top_10_type = my_type.iloc[0:10].sort_values(by='duration', ascending=True)
    #Summing the already-rounded values brings back floating point noise, so round again for the bar labels
    top_10_type['duration'] = top_10_type['duration'].round(2)

    fig_type = px.bar(top_10_type, x='duration', y='name', title='My top 10 ' + type_label,
                      color='duration', color_continuous_scale=['red','black'], text='duration')
    #Axis titles
    fig_type.update_layout(xaxis_title="No. of hours watched", yaxis_title=type_label)

    #Remove colorbar
    fig_type.update_coloraxes(showscale=False)

    fig_type.show()

#### Top 10 movies

In [21]:
#Plot the top 10 for rows whose 'category' is 'movie'
top_10(my_netflix, col='category', type= 'movie')

#### Top 10 TV shows

In [22]:
#Plot the top 10 for rows whose 'category' is 'tv show'
top_10(my_netflix, col='category', type= 'tv show')

# Next step: do a bar graph by genre. Which genres have I spent the most time watching? Use a different colour for movies and TV shows.

# 4. Plot TV shows vs Movies

In [None]:
#Preview the first two rows of my_netflix
my_netflix.head(2)

In [None]:
#Share of total watch time (hours) split by category (TV shows vs movies).
#'names' must be an existing column; the original empty string is rejected by plotly express.
fig = px.pie(my_netflix, values='duration', names='category')
fig.show()

# 5. Plot 
#### Has my viewing pattern changed since I moved to the UK?