# **Import Libraries**

In [None]:
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from google.colab import drive


# Set default values for graphs: width, height and template applied to the
# Plotly Express figures created in this notebook.
px.defaults.width = 900
px.defaults.height = 400
px.defaults.template = "plotly_dark"

In [None]:
# Mount Google Drive at /content/gdrive; the dataset is read from this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **Import Dataset**

In [None]:
# Location of the dataset CSV on the mounted Drive.
DATA_PATH = "/content/gdrive/MyDrive/kovai.co assignment/prime.csv"

# Load the dataset and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# **Data Preparation**

**Check for missing values**

In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


**Impute missing values using mode**

In [None]:
# Impute missing values in categorical (object-dtype) columns with the column mode.
#
# The filled column is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place call acts on an
# intermediate object, is deprecated in pandas 2.x and has no effect on `df`
# once Copy-on-Write is enabled.
#
# NOTE(review): mode imputation gives every missing entry the single most
# frequent value, so for high-cardinality columns such as `director` and
# `cast` it attributes thousands of titles to one person. Consider a sentinel
# such as "Unknown" for those columns instead.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    modes = df[col].mode()
    # mode() is empty when the column has no non-null values: nothing to impute with.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Re-check after imputation: every column should now report zero missing values.
print(df.isnull().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


# **Data Analysis & EDA**

## **Descriptive statistics**

**Shape of the dataset**

In [None]:
# (number of rows, number of columns)
df.shape

(8807, 12)

**Info regarding dataset**

In [None]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


**Summary of the numerical features**

In [None]:
# describe() summarises numeric columns only by default; here that is just release_year.
print(df.describe())

       release_year
count   8807.000000
mean    2014.180198
std        8.819312
min     1925.000000
25%     2013.000000
50%     2017.000000
75%     2019.000000
max     2021.000000


**Count the number of unique values for each feature**

In [None]:
# Number of distinct values in every column.
print(df.nunique())

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64


**Total number of unique values in each column**

In [None]:
# Report the number of distinct values for every text (object-dtype) column;
# columns of any other dtype are skipped.
for column in df.columns:
    if df[column].dtype != 'object':
        continue
    unique_vals = df[column].value_counts().size
    print(f"Column name: {column} | Total unique values: {unique_vals}")

Column name: show_id | Total unique values: 8807
Column name: type | Total unique values: 2
Column name: title | Total unique values: 8807
Column name: director | Total unique values: 4528
Column name: cast | Total unique values: 7692
Column name: country | Total unique values: 748
Column name: date_added | Total unique values: 1767
Column name: rating | Total unique values: 17
Column name: duration | Total unique values: 220
Column name: listed_in | Total unique values: 514
Column name: description | Total unique values: 8775


**5 most common values for the 'listed_in' column**

In [None]:
# value_counts() counts whole 'listed_in' strings, i.e. genre combinations
# such as "Dramas, International Movies", not individual genres.
print(df['listed_in'].value_counts().nlargest(5))

Dramas, International Movies                        362
Documentaries                                       359
Stand-Up Comedy                                     334
Comedies, Dramas, International Movies              274
Dramas, Independent Movies, International Movies    252
Name: listed_in, dtype: int64


**Top 5 directors with the most number of shows**

In [None]:
# NOTE: these counts include the rows whose missing director was filled with
# the column mode during imputation, which inflates the most frequent name.
print(df['director'].value_counts().nlargest(5))

Rajiv Chilaka             2653
Raúl Campos, Jan Suter      18
Suhas Kadav                 16
Marcus Raboy                16
Jay Karas                   14
Name: director, dtype: int64


**Most Frequent Rating Category by Release Year**

In [None]:
# Store ratings as a categorical column.
df['rating'] = df['rating'].astype('category')


def _most_frequent(values):
    """Return the first mode of ``values``."""
    return values.mode().values[0]


# Most frequent rating for every release year.
rating_by_year = df.groupby('release_year')['rating']
print(rating_by_year.apply(_most_frequent))

release_year
1925    TV-14
1942    TV-14
1943    TV-PG
1944    TV-14
1945    TV-14
        ...  
2017    TV-MA
2018    TV-MA
2019    TV-MA
2020    TV-MA
2021    TV-MA
Name: rating, Length: 74, dtype: object


**Genre Statistics for the Dataset**

In [None]:
# Split every 'listed_in' entry into its individual genres (one list per title).
df_genres = [x.split(",") for x in df["listed_in"]]
print()

# Flatten to one entry per (title, genre) pair, trimming the space after each comma.
df_all_genres = [gen.strip() for row in df_genres for gen in row]

# Count every genre in a single pass. Calling list.count() once per element
# rescans the whole list each time (quadratic); Counter gives the same
# mapping, in the same first-seen key order, in linear time.
top_genres = dict(Counter(df_all_genres))

# The N genres with the highest counts (ties keep first-seen order).
N = 6
top_6 = dict(sorted(top_genres.items(), key=itemgetter(1), reverse=True)[:N])


print("Recollected genres from {} rows. All genres listed on the dataset: {}\n".format(len(df_genres), len(df_all_genres)) )
print("All unique values of genres on this dataset: {}\n\n{}\n".format(len(set(df_all_genres)), set(df_all_genres)) )
# Note: this mapping is printed in alphabetical genre order, not by frequency.
print("Most common genres: {}\n\n".format( dict(sorted(top_genres.items())) ) )
print("The top 6 most common genres are: " + str(top_6))


Recollected genres from 8807 rows. All genres listed on the dataset: 19323

All unique values of genres on this dataset: 42

{'TV Thrillers', 'Action & Adventure', 'Documentaries', 'Independent Movies', 'TV Horror', 'Music & Musicals', 'Stand-Up Comedy', 'Anime Series', 'International TV Shows', 'Reality TV', 'Classic Movies', 'Anime Features', 'TV Comedies', 'Thrillers', 'Comedies', 'LGBTQ Movies', 'TV Mysteries', 'Sports Movies', 'TV Sci-Fi & Fantasy', 'International Movies', 'Romantic Movies', 'Crime TV Shows', 'Stand-Up Comedy & Talk Shows', 'Faith & Spirituality', 'Science & Nature TV', 'Horror Movies', 'TV Dramas', 'Docuseries', 'Classic & Cult TV', 'Children & Family Movies', 'TV Shows', 'Cult Movies', 'Sci-Fi & Fantasy', 'Teen TV Shows', "Kids' TV", 'British TV Shows', 'Dramas', 'Romantic TV Shows', 'Korean TV Shows', 'TV Action & Adventure', 'Movies', 'Spanish-Language TV Shows'}

Most common genres: {'Action & Adventure': 859, 'Anime Features': 71, 'Anime Series': 176, 'Brit

### **Insights**
1. The release_year column has a minimum value of 1925 and a maximum value of 2021, indicating that the data includes movies and TV shows released over a 96-year period. The average release year is 2014, with a standard deviation of 8.82.


2. The data has several missing values, with the director column having the highest number of missing values (2634). Other columns with missing values include cast, country, date_added, rating, and duration.


3. The show_id and title columns have 8807 unique values, which is the same number of rows in the data, indicating that these columns can be used as a unique identifier for each show. Other columns with a high number of unique values include director, cast, and listed_in.


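4. The listed_in column is a categorical variable whose values are comma-separated genre combinations. The most common value is the combination "Dramas, International Movies" (362 titles), followed by "Documentaries" (359) and "Stand-Up Comedy" (334).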


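5. The top 5 directors with the most shows in the data are Rajiv Chilaka, the duo Raúl Campos and Jan Suter, Suhas Kadav, Marcus Raboy, and Jay Karas. Note that Rajiv Chilaka's count of 2653 is inflated by the mode imputation above: 2634 titles had no director and were all assigned the most frequent name, so only 19 titles are genuinely credited to him. With that caveat, this information could be useful in identifying which directors are popular among the users of the Amazon Prime service.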

6. There were 8807 rows in the dataset and 19323 genres listed. Out of all the listed genres, there are 42 unique values. The most common genres are "International Movies" with 2752 occurrences, "Dramas" with 2427 occurrences, and "Comedies" with 1674 occurrences. The top 6 most common genres are "International Movies", "Dramas", "Comedies", "International TV Shows", "Documentaries", and "Action & Adventure".



## **Visualisations**

**Bar plot to visualise type**

In [None]:
# Number of titles for each value of the 'type' column.
type_sizes = df.groupby('type').size()
grouped = type_sizes.reset_index(name='counts')

# Bar chart of those counts.
fig = px.bar(grouped, x='type', y='counts')
fig.show()

The dataset has more movies than TV shows. This could be because movies tend to be more popular among viewers and therefore, more of them are produced and added to the streaming platform.

**Histogram of release_year**

In [None]:
# Create a histogram of the "release_year" column, limited to at most 50 bins
fig = px.histogram(df, x="release_year", nbins=50)
# Show the figure
fig.show()

From the histogram, we can see that there has been a significant increase in the number of movies being made available on the streaming platform, starting from the year 1992. This could be due to the increasing popularity and acceptance of streaming platforms, as well as the advent of new technologies and advancements in the film industry.

**Box plot for duration column (movies)**

In [None]:
# Filter the data to only show movies (copy so a column can be added safely).
movies_df = df[df["type"] == "Movie"].copy()

# 'duration' is text such as "90 min". Plotted as-is, the values are not
# numeric, so the box statistics are meaningless. Extract the minutes as
# numbers; entries that are not of the form "<n> min" become NaN and are left
# out of the plot.
movies_df["duration_min"] = (
    movies_df["duration"].str.extract(r"(\d+)\s*min", expand=False).astype(float)
)

# Make the box plot of the numeric durations.
fig = px.box(movies_df, y="duration_min",
             labels={"duration_min": "Duration (minutes)"})
fig.show()

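The box plot shows us the duration of movies on the streaming platform, with the median being 92 minutes. The upper fence, which is the maximum value still within the range of normal data, is 26 minutes, while the maximum duration of a movie on the platform is 191 minutes. On the other hand, the minimum duration of a movie is 90 minutes, with the first quartile (Q1) being 116 minutes, and the third quartile (Q3) being 140 minutes. This suggests that most movies on the platform tend to be around the same length, with a few outliers on either end. Note: the figures quoted above are not mutually consistent (for example, the median of 92 minutes is below the stated first quartile of 116 minutes, and the upper fence of 26 minutes is below the minimum of 90 minutes). This is most likely because the `duration` column holds text such as "90 min" rather than numbers; the statistics should be recomputed after converting the durations to numeric minutes.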

**Histogram for duration of tv series**

In [None]:
# Filter the data to only show TV shows
shows_df = df[df["type"] == "TV Show"]

# Make the histogram of the 'duration' values (text such as "1 Season"), placed on the y-axis
fig = px.histogram(shows_df, y="duration")
fig.show()

The histogram of TV show durations shows that a large number of TV shows only last for 1 or 2 seasons. This could be due to the fact that many TV shows don't gain enough popularity or viewership to continue for a longer period.

**Top 6 genres**

In [None]:
# Bar chart of the six most frequent genres and their counts.
genre_names = list(top_6)
genre_counts = list(top_6.values())
axis_labels = {"x": "Genres", "y": "Count"}
fig = px.bar(x=genre_names, y=genre_counts, labels=axis_labels)
fig.show()

The top 6 genres in the dataset are: International Movies, Dramas, Comedies, International TV Shows, Documentaries, and Action & Adventure. This could be because these genres are popular among viewers and therefore, more content is produced in these genres.

**Evolution of the Top 6 Genres starting from 1995**

In [None]:
# Build one row per (title, genre) for titles released from 1995 onwards.
# 'listed_in' holds comma-separated genre lists, so testing the raw string
# with isin() would only match titles tagged with exactly one genre and would
# drop every multi-genre title from the counts.
genres_by_year = df.loc[df['release_year'] >= 1995, ['release_year', 'listed_in']].copy()
genres_by_year['listed_in'] = genres_by_year['listed_in'].str.split(',')
genres_by_year = genres_by_year.explode('listed_in')
genres_by_year['listed_in'] = genres_by_year['listed_in'].str.strip()

# Group the data by year and genre, keeping only the top 6 genres
grouped = (
    genres_by_year[genres_by_year['listed_in'].isin(list(top_6.keys()))]
    .groupby(['release_year', 'listed_in'])
    .size()
    .reset_index(name='counts')
)

# Pivot the data to get the count of shows for each genre in each year;
# a year without any title of a genre is shown as 0 rather than a gap.
pivot_df = grouped.pivot(index='release_year', columns='listed_in', values='counts').fillna(0)

# Make the line chart
fig = px.line(pivot_df, title='Evolution of the Top 6 Genres starting from 1995')
fig.show()

The line graph of the top 6 genres' evolution over the years shows that the number of documentaries peaked in 2017, while comedies, adventure and dramas have had a slow and steady increase. This could be due to the changing preferences and demands of viewers over time. Additionally, the increasing number of documentaries being produced could also be a result of the growing interest in non-fiction content.

# **Sentiment Analysis on Description Column**

Sentiment analysis of the description column can provide valuable insights into the tone and emotions conveyed by the content descriptions. By understanding the sentiment of the descriptions, it is possible to determine which genres are typically associated with positive, negative, or neutral sentiments. For example, comedies may be more likely to have positive descriptions, while horror movies may be more likely to have negative descriptions. Additionally, by analyzing the sentiment of descriptions over time, we can determine if there have been any shifts in the overall sentiment of content offerings. This information can be useful for streaming platforms as it can help them understand what their customers are looking for and tailor their content offerings accordingly. It can also provide valuable insights for content creators and help them understand what kind of content is well received by audiences.

**Deriving sentiment categories from description column**

In [None]:
# Importing the TextBlob library to perform sentiment analysis
from textblob import TextBlob

# Defining a function to categorize the sentiment of the text
def sentiment_category(text):
    """Label ``text`` as "Negative", "Neutral" or "Positive".

    The label is chosen from the TextBlob sentiment polarity of ``text``:
    a polarity below zero is "Negative", a polarity of exactly zero is
    "Neutral", and anything else is "Positive".
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity < 0:
        return "Negative"
    if polarity == 0:
        return "Neutral"
    return "Positive"

# Applying the sentiment_category function to every value of the 'description'
# column and storing the labels in a new 'sentiment_category' column
df["sentiment_category"] = df["description"].apply(sentiment_category)

In [None]:
# Preview the frame, now including the sentiment_category column.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,sentiment_category
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,David Attenborough,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Positive
1,s2,TV Show,Blood & Water,Rajiv Chilaka,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Neutral
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Negative
3,s4,TV Show,Jailbirds New Orleans,Rajiv Chilaka,David Attenborough,United States,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Negative
4,s5,TV Show,Kota Factory,Rajiv Chilaka,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Neutral


**Visualising the occurrences in each category**

In [None]:
# Tally how many descriptions fall into each sentiment category.
sentiment_counts = df['sentiment_category'].value_counts()

# Bar chart of the tallies: categories on x, counts on y.
category_names = sentiment_counts.index
category_totals = sentiment_counts.values
fig = px.bar(sentiment_counts, x=category_names, y=category_totals)

# Label the x-axis, then render.
fig.update_layout(xaxis_title="Sentiment Category")
fig.show()

**Distribution of Sentiments over the Years (Starting from 1995)**

In [None]:
# Keep titles with a release year of 1995 or later.
is_recent = df["release_year"] >= 1995
df_filtered = df[is_recent]

# Count titles for every (release year, sentiment) pair.
grouped = (
    df_filtered
    .groupby(['release_year', 'sentiment_category'])
    .size()
    .reset_index(name='counts')
)

# Reshape to one row per year and one column per sentiment.
pivot_df = grouped.pivot(
    index='release_year',
    columns='sentiment_category',
    values='counts',
)

# Line chart with one line per sentiment category.
fig = px.line(pivot_df, title='Distribution of Sentiments over the Years (Starting from 1995)')
fig.show()

In [None]:
# Build one row per (title, genre). 'listed_in' holds comma-separated genre
# lists, so testing the raw string with isin() would only keep titles tagged
# with exactly one genre and would ignore every multi-genre title.
exploded = df.assign(listed_in=df['listed_in'].str.split(',')).explode('listed_in')
exploded['listed_in'] = exploded['listed_in'].str.strip()

# Get only the rows with the top 6 genres
df_top_6 = exploded[exploded['listed_in'].isin(list(top_6.keys()))]

# Group the data by genre and sentiment
grouped = df_top_6.groupby(['listed_in', 'sentiment_category']).size().reset_index(name='counts')

# Pivot the data to get the count of shows for each sentiment in each genre;
# a missing genre/sentiment combination counts as 0.
pivot_df = grouped.pivot(index='listed_in', columns='sentiment_category', values='counts').fillna(0)

# Make the bar chart
fig = px.bar(pivot_df, title='Distribution of Sentiments across the Top 6 Genres')
fig.show()

In [None]:
# Rows whose genre list mentions "International" (movies and TV shows alike).
is_international = df["listed_in"].str.contains("International")

# Keep just the description and its sentiment label for those rows.
df_intl = df.loc[is_international, ["description", "sentiment_category"]]

# Display the filtered data
print(df_intl)

                                            description sentiment_category
1     After crossing paths at a party, a Cape Town t...            Neutral
2     To protect his family from a powerful drug lor...           Negative
4     In a city of coaching centers known to train I...            Neutral
7     On a photo shoot in Ghana, an American model s...           Negative
10    Sicily boasts a bold "Anti-Mafia" coalition. B...           Positive
...                                                 ...                ...
8798  A philandering small-town mechanic's political...           Positive
8799  A change in the leadership of a political part...           Negative
8800  Strong-willed, middle-class Kashaf and carefre...           Positive
8801  Recovering alcoholic Talal wakes up inside a s...           Negative
8806  A scrappy but poor boy worms his way into a ty...           Negative

[4103 rows x 2 columns]


In [None]:
# Build one row per (title, genre) by splitting the comma-separated
# 'listed_in' value and exploding the resulting lists.
#
# Matching genres with `str.contains(genre)` is a regex substring search, so
# "Dramas" would also count every "TV Dramas" title and "Movies" every
# "International Movies" title. Exploding assigns each title only to the
# genres it is actually tagged with, and avoids writing to a filtered slice
# (the source of the SettingWithCopyWarning).
final_df = (
    df.assign(genre=df['listed_in'].str.split(','))
    .explode('genre')
    .assign(genre=lambda frame: frame['genre'].str.strip())
)

# Flat list of genre occurrences and the set of distinct genres.
genres = final_df['genre'].tolist()
unique_genres = set(genres)

# Number of titles for every (genre, sentiment) pair.
sentiment_counts = final_df.groupby(['genre', 'sentiment_category']).size().reset_index(name='counts')

# Find the sentiment category with the highest count for each genre
result = sentiment_counts.loc[sentiment_counts.groupby('genre')['counts'].idxmax()]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Number of titles for every (genre, sentiment) pair.
sentiment_counts = final_df.groupby(['genre', 'sentiment_category']).size().reset_index(name='counts')

# One row per genre, one column per sentiment; absent combinations count as 0.
pivot_table = sentiment_counts.pivot(index='genre', columns='sentiment_category', values='counts')
pivot_table = pivot_table.fillna(0)

# Formatters are looked up by exact column name. Build the mapping from the
# actual columns so the thousands-separated, zero-decimal format is applied
# (lower-case keys such as 'negative' never match 'Negative').
thousands = '{:,.0f}'.format
print(pivot_table.to_string(formatters={column: thousands for column in pivot_table.columns}))

sentiment_category            Negative  Neutral  Positive
genre                                                    
Action & Adventure                 409      205       413
Anime Features                      23       16        32
Anime Series                        60       32        84
British TV Shows                    73       39       141
Children & Family Movies           175      114       352
Classic & Cult TV                    8        3        17
Classic Movies                      35       29        52
Comedies                           650      383      1222
Crime TV Shows                     194       83       193
Cult Movies                         24       14        33
Documentaries                      171      234       464
Docuseries                          89       78       228
Dramas                            1016      589      1585
Faith & Spirituality                12       21        32
Horror Movies                      198       48       111
Independent Mo

In [None]:
# `result` already holds exactly one row per genre (the sentiment with the
# highest count), so no further groupby/apply is needed. The extra
# groupby().apply() relied on the grouping column being passed to the applied
# function, which newer pandas versions deprecate.
grouped_data = result.sort_values('genre')

# Print each genre with its dominant sentiment category
for genre, sentiment in zip(grouped_data['genre'], grouped_data['sentiment_category']):
    print(f"{genre}  :  {sentiment}")

Action & Adventure  :  Positive
Anime Features  :  Positive
Anime Series  :  Positive
British TV Shows  :  Positive
Children & Family Movies  :  Positive
Classic & Cult TV  :  Positive
Classic Movies  :  Positive
Comedies  :  Positive
Crime TV Shows  :  Negative
Cult Movies  :  Positive
Documentaries  :  Positive
Docuseries  :  Positive
Dramas  :  Positive
Faith & Spirituality  :  Positive
Horror Movies  :  Negative
Independent Movies  :  Positive
International Movies  :  Positive
International TV Shows  :  Positive
Kids' TV  :  Positive
Korean TV Shows  :  Positive
LGBTQ Movies  :  Positive
Movies  :  Positive
Music & Musicals  :  Positive
Reality TV  :  Positive
Romantic Movies  :  Positive
Romantic TV Shows  :  Positive
Sci-Fi & Fantasy  :  Negative
Science & Nature TV  :  Positive
Spanish-Language TV Shows  :  Positive
Sports Movies  :  Positive
Stand-Up Comedy  :  Positive
Stand-Up Comedy & Talk Shows  :  Positive
TV Action & Adventure  :  Positive
TV Comedies  :  Positive
TV Dram

## **Insights**

1. There are 4551 positive descriptions, 2658 negative and 1598 neutral ones. The majority of the descriptions have a positive sentiment, with positive descriptions outnumbering negative and neutral ones.

2. In the line graph, it can be observed that the positive, neutral and negative sentiment categories always maintain an equal distance between them. The equal distance between the lines could indicate that the descriptions have a balanced distribution of sentiments across the polarity scale.

3. The sentiment category for each genre has been listed. It can be seen that the majority of the genres have a predominantly positive sentiment, with a few exceptions such as "Crime TV Shows", "Horror Movies", "Sci-Fi & Fantasy" and "Thrillers", which have a predominantly negative sentiment. This result could indicate that the descriptions of these genres tend to use more negatively charged language.
