# Netflix Analysis using Movies Dataset from Kaggle

In [None]:
# Import libraries
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the Netflix titles CSV into a dataframe
file = "netflix_titles.csv"
data = pd.read_csv(file)

# The raw headers are terse snake_case identifiers, so give every column a
# readable display name (note that "listed_in" is relabelled "Genre")
rename_data = data.rename(
    columns=dict(
        show_id="Show ID",
        type="Type",
        title="Title",
        director="Director",
        cast="Cast",
        country="Country",
        date_added="Date Added",
        release_year="Release Year",
        rating="Rating",
        duration="Duration",
        listed_in="Genre",
        description="Description",
    )
)

In [None]:
# Some titles have missing fields (NaN), so drop every row that contains one.
# NOTE: `replace(np.NaN)` without a replacement value is not a deletion. Older
# pandas releases treat it as a forward fill (a title with no country/genre
# silently inherits the previous row's value), newer pandas deprecates the
# call, and NumPy 2.0 removed the `np.NaN` alias. `dropna()` does what the
# analysis intends. Pass `subset=[...]` here if only some columns must be
# complete.
clean_data = rename_data.dropna()

In [None]:
# Keep only the rows whose "Type" is 'Movie'. The bare name on the last line
# makes the notebook render the resulting dataframe.
movies_country = clean_data.loc[clean_data["Type"].eq('Movie')]
movies_country

In [None]:
# Compare the size of the cleaned dataset with the size of the movie-only subset.
# (The author's original run reported 6234 titles in total and 4265 movies; the
# exact figures depend on the dataset version and on the cleaning step.)
print('Length of Total Dataset: ' + str(len(clean_data)))
# This subset holds movies, so label it as such (it used to say "TV Shows")
print('Length of Movies Dataset: ' + str(len(movies_country)))

In [None]:
# Tally how many movies carry each genre. A title may list several
# comma-separated genres, and each of them gets counted.
genre_count_movie = {}  # genre -> number of movies listing it

for genre_cell in movies_country['Genre']:
    for raw_genre in genre_cell.split(','):
        genre = raw_genre.strip()  # trim the whitespace left around each piece by the split
        genre_count_movie[genre] = genre_count_movie.get(genre, 0) + 1

# Dicts preserve insertion order, so the keys are the distinct genres in the
# order in which they were first encountered
genre_list_movie = list(genre_count_movie)

In [None]:
# Bar chart of the number of movies per genre, most frequent genre first
# Counting dictionary -> one-column dataframe indexed by genre name
df_genres_movies = pd.DataFrame.from_dict(genre_count_movie, orient='index', columns=['Total Data'])
sorted_genres_movie = df_genres_movies.sort_values(by='Total Data', ascending=False)
sorted_genres_movie.head(10)  # not the last expression of the cell, so by default nothing is rendered

# One bar per genre at integer x positions 0..n-1, labelled with the genre names
genre_totals = sorted_genres_movie['Total Data']
x_genres_movies = np.arange(len(genre_totals))
tick_gmlocations = list(x_genres_movies)
width = 0.55
plt.bar(x_genres_movies, genre_totals, width=width, color='indigo', alpha=0.5)
plt.xticks(tick_gmlocations, sorted_genres_movie.index, rotation="vertical", fontsize=6)
plt.autoscale(tight=True)
plt.xlabel("Genres")
plt.ylabel("Total data")
plt.title("Show Total Genres")
plt.tight_layout()

# Write each total just above its bar
for bar_pos, total in enumerate(genre_totals):
    plt.text(bar_pos, total, str(total), color='k', ha='center', fontsize=7)

plt.savefig("TotalGenresMovie.png")

In [None]:
# Tally how many movies each country took part in. A co-production lists
# several comma-separated countries, and each of them gets counted.
country_count_movie = {}  # country -> number of movies listing it

for country_cell in movies_country['Country']:
    for raw_country in country_cell.split(','):
        country = raw_country.strip()  # trim the whitespace left around each piece by the split
        country_count_movie[country] = country_count_movie.get(country, 0) + 1

# Dicts preserve insertion order, so the keys are the distinct countries in
# the order in which they were first encountered
country_list_movie = list(country_count_movie)

In [None]:
# Cross-tabulate countries against genres: matrizMov[r, c] is the number of
# movies that list both country_list_movie[r] and genre_list_movie[c].
# A movie with several countries and/or genres adds 1 to every
# (country, genre) pair it mentions.
matrizMov = np.zeros((len(country_list_movie), len(genre_list_movie)))

# Row/column position of every name, built once so the loop below does not
# rescan the lists for each cell update
country_row = {name: row for row, name in enumerate(country_list_movie)}
genre_col = {name: col for col, name in enumerate(genre_list_movie)}

# Raw comma-separated cells; the same position in both arrays is the same movie
ctemp = movies_country['Country'].values
gtemp = movies_country['Genre'].values

for country_cell, genre_cell in zip(ctemp, gtemp):
    # Split each cell once per movie (the genres are the same for every country of that movie)
    countries = [name.strip() for name in country_cell.split(',')]
    genres = [name.strip() for name in genre_cell.split(',')]
    for country in countries:
        for genre in genres:
            try:
                matrizMov[country_row[country], genre_col[genre]] += 1
            except KeyError:
                # The country or genre is absent from the lookup lists: report the
                # skipped pair. Only this case is caught; anything else propagates
                # instead of being hidden behind a bare `except`.
                print('Error')

In [None]:
# Wrap the count matrix in a labelled dataframe (one row per country, one
# column per genre) and render it as the cell output
df_movies = pd.DataFrame(matrizMov, index=country_list_movie, columns=genre_list_movie)
df_movies

In [None]:
# Rank the countries by number of movies and show the 10 largest.
# The per-country counting dictionary becomes a one-column dataframe indexed
# by country, which is then sorted in descending order.
country_df_movie = pd.DataFrame.from_dict(country_count_movie, orient='index', columns=['Total Data'])
sorted_countries_movie = country_df_movie.sort_values(by='Total Data', ascending=False)
sorted_countries_movie.head(10)

In [None]:
# For each of the 10 countries with the most movies, draw a bar chart of its
# number of movies per genre (genres ordered from most to least frequent) and
# save it to its own PNG file.
# head(10) also copes with a dataset that holds fewer than 10 countries.
width = 0.55
for rank, select_country in enumerate(sorted_countries_movie.head(10).index):
    # Row of the country/genre matrix that belongs to this country
    country_genres = df_movies.loc[df_movies.index == select_country, :]
    # Order the genre columns by their count in that row
    df_sorted_genre = country_genres.sort_values(by=select_country, axis=1, ascending=False)
    # Transpose so genres become the rows, i.e. one bar per genre
    df_sorted_genre_T = df_sorted_genre.T

    # Bar plot
    ax_bar = df_sorted_genre_T.plot(kind='bar', facecolor='slateblue', width=width, legend=None)
    plt.title(select_country + ' Movies per Genre')
    plt.ylabel('Number of Titles')
    plt.xlabel('Genres')
    plt.tight_layout()

    # Write each count just above its bar. The loop variables must not reuse
    # the outer loop's name: doing so made every figure share one filename.
    for bar_pos, count in enumerate(df_sorted_genre_T[select_country]):
        ax_bar.text(bar_pos, count, str(count), color='k', ha='center', fontsize=7)

    # One file per country, numbered by its position in the ranking (0 = most movies)
    plt.savefig('Movies_Country' + str(rank) + '.png')

In [None]:
# Export the movie subset and the movies-per-genre-per-country matrix to CSV,
# writing both the index and the header row in each file
for frame, target in (
    (movies_country, "Netflix_Movies_Data.csv"),
    (df_movies, 'Netflix_Movies_perCountry.csv'),
):
    frame.to_csv(target, index=True, header=True)