In [None]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def catigorize_col(df, col, labels, include_lowest=True):
    """
    Categorizes a column into bins delimited by its quartiles.

    The bin edges are the column's min, 25%, 50%, 75% and max as reported by
    ``Series.describe()``. The column is overwritten on ``df`` itself with
    the label of the bin each value falls into.

    Args:
        (df)             df   - dataframe being processed (modified in place)
        (col)            str  - name of the numeric column to categorize
        (labels)         list - labels for the bins, ordered from min to max
        (include_lowest) bool - when True (default) the first bin also
                                contains the column minimum; pass False to
                                leave rows equal to the minimum as NaN

    Returns:
        (df)     df   - the same dataframe object, with ``col`` categorized
    """

    # compute the summary once and pick the five edges out of it
    stats = df[col].describe()
    edges = stats[['min', '25%', '50%', '75%', 'max']].tolist()

    # pd.cut bins are right-closed, so without include_lowest the minimum
    # value sits on the open left edge of the first bin and becomes NaN.
    # duplicates='drop' merges coinciding edges; labels must then still match
    # the number of remaining bins or pd.cut raises ValueError.
    df[col] = pd.cut(df[col], edges, labels=labels,
                     include_lowest=include_lowest, duplicates='drop')
    return df

In [None]:
# Load the movie dataset from the CSV file and preview its first 5 rows.
# lineterminator='\n' makes the parser treat only '\n' as the row separator
# (presumably because some text fields contain other line-break characters
# — TODO confirm against the raw file).
df = pd.read_csv('mymoviedb.csv', lineterminator='\n')
df.head()

In [None]:
# Summarize the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Peek at the raw Genre values (they are split on ', ' further down)
df['Genre'].head()

In [None]:
# Count rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

In [None]:
# Parse the Release_Date column into pandas datetimes
df['Release_Date'] = pd.to_datetime(df['Release_Date'])

# confirm the new dtype
print(df['Release_Date'].dtypes)

In [None]:
# Keep only the year of each release date. The column keeps the name
# 'Release_Date' but holds years from here on.
# NOTE(review): this cell is not re-runnable on its own — once the column
# holds years the .dt accessor no longer applies; re-run from the load cell.
df['Release_Date'] = df['Release_Date'].dt.year
df['Release_Date'].dtypes

In [None]:
# Preview the frame after the Release_Date conversion
df.head()

In [None]:
# names of the columns to discard
cols = ['Overview', 'Original_Language', 'Poster_Url']

# remove them from df in place, then list the columns that remain
df.drop(columns=cols, inplace=True)
df.columns

In [None]:
# Labels for the quartile bins, ordered from the lowest to the highest bin
labels = ['not_popular', 'below_avg', 'average', 'popular']

# Replace the numeric Vote_Average with its quartile label.
# catigorize_col assigns the binned column back onto df itself, so the
# returned frame does not need to be captured.
catigorize_col(df, 'Vote_Average', labels)

# confirm the column now holds the labels
df['Vote_Average'].unique()

In [None]:
# Number of rows per Vote_Average label
df['Vote_Average'].value_counts()

In [None]:
# Drop, in place, every row that has a missing value in any column
df.dropna(inplace = True)

# confirm: count of remaining missing values per column
df.isna().sum()

In [None]:
# Turn each ', '-separated Genre string into a list, then give every genre
# its own row (the other columns are repeated) and renumber the index.
df = (
    df.assign(Genre=df['Genre'].str.split(', '))
      .explode('Genre')
      .reset_index(drop=True)
)
df.head()

In [None]:
# Store Genre with pandas' categorical dtype
df['Genre'] = df['Genre'].astype('category')

# confirm the new dtype
df['Genre'].dtypes

In [None]:
# Re-check columns, non-null counts and dtypes after the cleaning steps
df.info()

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Use seaborn's 'whitegrid' style for the plots that follow
sns.set_style('whitegrid') 

In [None]:
# Summary statistics of the Genre column
df['Genre'].describe()

In [None]:
# Horizontal count plot of Genre, bars ordered by descending frequency
genre_order = df['Genre'].value_counts().index
sns.catplot(data=df, y='Genre', kind='count',
            order=genre_order, color='#4287f5')
plt.title('genre column distribution')
plt.show()

In [None]:
# Count plot of the Vote_Average labels, one observation per movie.
# df was exploded on Genre above, so every movie is repeated once per genre;
# plotting df directly would weight each movie by its number of genres.
# Dropping Genre and collapsing identical rows restores one row per movie
# (assumes the remaining columns together identify a movie — TODO confirm).
movies = df.drop(columns='Genre').drop_duplicates()
vote_order = movies['Vote_Average'].value_counts().index
sns.catplot(y='Vote_Average', data=movies, kind='count',
            order=vote_order, color='#4287f5')
plt.title('votes destribution')
plt.show()

In [None]:
# Separate the rows whose Vote_Average label is 'popular'
is_popular = df['Vote_Average'] == 'popular'
popular_movies = df[is_popular]
print(popular_movies.shape)
popular_movies.head()

In [None]:
# Count plot of Genre among the 'popular' rows, bars ordered by descending
# frequency; x tick labels are rotated so the genre names stay readable
popular_genre_order = popular_movies['Genre'].value_counts().index
sns.catplot(data=popular_movies, x='Genre', kind='count',
            order=popular_genre_order, color='#4287f5')
plt.title('popular genres distribution')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Rows whose Popularity equals the dataset maximum
# (df is exploded on Genre, so a movie shows up once per genre)
top_popularity = df['Popularity'].max()
df[df['Popularity'] == top_popularity]

In [None]:
# Histogram of release years, one observation per movie.
# df was exploded on Genre above, so every movie is repeated once per genre;
# a histogram of df['Release_Date'] would count multi-genre movies several
# times. Dropping Genre and collapsing identical rows restores one row per
# movie (assumes the remaining columns together identify a movie — TODO confirm).
movies_by_year = df.drop(columns='Genre').drop_duplicates()
movies_by_year['Release_Date'].hist()
plt.title('Release_Date column distribution')
plt.show()