In [None]:
#Import Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
from datetime import date
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load and preview the data
# Dataset: https://www.kaggle.com/datasets/luiscorter/netflix-original-films-imdb-scores

import os
os.listdir('/kaggle/input/')

# The file is read with latin-1 encoding; the first rows are shown as the cell output.
df = pd.read_csv(
    '../input/netflix-original-films-imdb-scores/NetflixOriginals.csv',
    encoding='latin-1',
)
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Show the dtype of every column.
df.dtypes

# Q1: According to the dataset, in which languages were the long-running films made? (Films with a runtime of 120 minutes or more are considered long-running.)

In [None]:
# Make sure "Runtime" is numeric; unparseable values raise (the default behaviour).
df["Runtime"] = pd.to_numeric(df["Runtime"], errors="raise")

In [None]:
# Longest runtime found for each language, as a two-column frame.
d1 = df.groupby("Language")["Runtime"].max().reset_index()

In [None]:
# Keep the languages whose longest film runs 120 minutes or more.
df1 = d1[d1["Runtime"].ge(120)]

In [None]:
# Horizontal bars: one bar per language, bar length = longest runtime.
(
    sns.barplot(data=df1, x="Runtime", y="Language", palette="pastel")
    .set_title("Long-running films by language")
);

# Q2: Find the IMDB scores of the 'Documentary' movies that premiered between January 2019 and June 2020.

In [None]:
# Parse the premiere dates once instead of once per comparison.
premiere_dates = pd.DatetimeIndex(df["Premiere"])

# Documentaries that premiered from January 2019 through June 2020.
# The bounds are inclusive of both months: ">=" keeps 2019-01-01 and
# "< 2020-07-01" keeps every day of June 2020.
d2 = df.loc[
    (premiere_dates >= "2019-01-01")
    & (premiere_dates < "2020-07-01")
    & (df["Genre"] == "Documentary")
]

In [None]:
# Only the first five matching rows (d2.head()) are drawn.
sns.barplot(
    data=d2.head(), x="IMDB Score", y="Title", palette="Set2"
).set_title("IMDB Score of Documentary Genre Between January 2019 and June 2020");

# Q3: Which genre has the highest IMDB rating among movies filmed only in English?

In [None]:
# English-language films ordered from highest to lowest IMDB score.
# reset_index returns a new frame, so its result has to be kept; calling it
# as a bare statement leaves d3 unchanged.
d3 = (
    df.loc[df["Language"] == "English"]
    .sort_values(["IMDB Score"], ascending=False)
    .reset_index(drop=True)
)
# Genre and score of the top-rated English film.
d3[["Genre", "IMDB Score"]].head(1)

# Q4: What is the average 'runtime' of movies filmed in 'Hindi'?

In [None]:
# Mean runtime of the Hindi films.
# The string alias "mean" is used because passing np.mean to groupby.agg
# raises a FutureWarning in pandas 2.x.
(
    df.loc[df["Language"] == "Hindi", ["Language", "Runtime"]]
    .groupby(["Language"])
    .agg({"Runtime": "mean"})
)

# Q5: How many categories does the 'Genre' Column have and what are those categories?

In [None]:
# Count the films in every genre (value_counts sorts from most to least frequent).
genre_counts = df["Genre"].value_counts()

# Answer the question directly: how many categories exist and what they are.
print(f"The 'Genre' column has {genre_counts.size} categories:")
print(genre_counts.index.tolist())

# The five most frequent genres, used for the plot.
d5 = genre_counts.head()

In [None]:
# Film count (x) against genre name (y) for the genres held in d5.
sns.lineplot(x=d5.values, y=d5.index).set_title("Genre Categories")
plt.show()

# Q6: What are the three most used languages in movies?

In [None]:
# Three most frequent languages.
# value_counts counts rows per language and sorts in descending order, so no
# helper "count" column has to be added to the shared df.
d6 = df["Language"].value_counts().head(3)

In [None]:
# One bar per language in d6, bar height = number of films.
(
    sns.barplot(x=d6.index, y=d6.values, palette="ocean")
    .set_title("Most Used Languages In Movies")
);

# Q7: What are the top 10 movies with the highest IMDB rating?

In [None]:
# Ten highest-rated titles, re-indexed from 0.
(
    df[["Title", "IMDB Score"]]
    .sort_values(by="IMDB Score", ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# Q8: What is the correlation between IMDB score and 'Runtime'? Examine and visualize.

In [None]:
# Pearson correlation between score and runtime; element 0 is the coefficient.
pear = scipy.stats.pearsonr(x=df["IMDB Score"], y=df["Runtime"])

In [None]:
# Spearman rank correlation between score and runtime; element 0 is the coefficient.
spear = scipy.stats.spearmanr(a=df["IMDB Score"], b=df["Runtime"])

In [None]:
# Scatter of runtime against score, followed by both correlation coefficients.
sns.scatterplot(data=df, x="IMDB Score", y="Runtime", color="green").set_title(
    "Correlation Between 'IMDB Score' and 'Runtime"
);
print(f"Correlation of Pearson: {pear[0]}")
print(f"Correlation of Spearman: {spear[0]}")
print("👎There is no correlation between Runtime and IMDB Score because the correlation coefficient is too low.")

# Q9: What are the top 10 'Genre' with the highest IMDB Score? Visualize it.

In [None]:
# Best (maximum) IMDB score reached in each genre; keep the ten highest.
d9 = (
    df.groupby("Genre")
    .agg({"IMDB Score": "max"})
    .sort_values("IMDB Score", ascending=False)
    .head(10)
    .reset_index()
)

In [None]:
# Point plot of the per-genre maximum scores held in d9.
sns.catplot(
    data=d9,
    x="IMDB Score",
    y="Genre",
    kind="point",
    color="purple",
);

# Q10: What are the top 10 movies with the highest 'runtime'? Visualize it.

In [None]:
# Ten longest films, re-indexed from 0.
d10 = (
    df[["Title", "Runtime"]]
    .sort_values(by="Runtime", ascending=False)
    .head(10)
    .reset_index(drop=True)
)

In [None]:
# Horizontal bars: one bar per title in d10, bar length = runtime.
(
    sns.barplot(data=d10, x="Runtime", y="Title", errcolor="red")
    .set_title("Top 10 Movies With The Highest Runtime")
);

# Q11: In which year were the most movies released? Visualize it.

In [None]:
# Extract the premiere year into its own integer column ...
df["year"] = pd.DatetimeIndex(df["Premiere"]).year.astype(int)
# ... and count the films released in each year.
year = df["year"].value_counts()

In [None]:
# Bar chart of films per year, built from the value counts in `year`.
fig = px.bar(
    df,
    x=year.index,
    y=year.values,
    labels={"y": "Counts of Films", "x": "Year"},
)
fig.update_traces(marker_color='#aea1eb')
fig.show()

# Q12: Which languages' movies have the lowest average IMDB rating? Visualize it.

In [None]:
# Mean IMDB score per language, lowest first; keep the five lowest.
# The string alias "mean" is used because passing np.mean to groupby.agg
# raises a FutureWarning in pandas 2.x.
d12 = (
    df.groupby(["Language"])
    .agg({"IMDB Score": "mean"})
    .reset_index()
    .sort_values("IMDB Score")
    .head()
)

In [None]:
# Bar chart of the mean IMDB score for the languages held in d12.
fig = px.bar(
    d12,
    x=d12["IMDB Score"],
    y=d12["Language"],
    labels={"y": "Language", "x": "IMDB Score"},
)
fig.update_traces(marker_color='#ffb3ba')
fig.show()

# Q13: Which year has the greatest total runtime?

In [None]:
# Total runtime per year; the single largest total is shown.
(
    df.groupby("year")[["Runtime"]]
    .sum()
    .sort_values(by="Runtime", ascending=False)
    .head(1)
)

# Q14: For each language, which "Genre" is the most common?

In [None]:
d14 = (
    df.groupby(["Language", "Genre"])
    .size()                                # films per (language, genre) pair
    .sort_values(ascending=False)          # most frequent pairs first
    .reset_index(name='Count')
    .drop_duplicates(subset='Language')    # keep the first, i.e. top, row per language
)
d14.head(10)

# Q15: Are there any outliers in the data?

In [None]:
def find_outlier(data):
    """Draw a box plot of ``data`` and print its IQR-based outliers.

    A value is reported as an outlier when it lies more than 1.5 * IQR below
    the first quartile or more than 1.5 * IQR above the third quartile.

    Parameters
    ----------
    data : pandas.Series
        Numeric series to inspect. Its ``name`` appears in the printed
        report. Missing values are ignored when the quartiles are computed.

    Returns
    -------
    None
        The box plot and the printed report are the only results.
    """
    sns.boxplot(x=data);
    # nanpercentile skips missing values; plain np.percentile would return NaN
    # quartiles and the report would then silently show zero outliers.
    q1, q3 = np.nanpercentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    # Select the outliers once and reuse the selection for the count and the listing.
    data_outlier = data[(data < lower_bound) | (data > upper_bound)]
    count_of_outliers = data_outlier.count()
    print(f"Info about outlier data for {data.name}:")
    print(f"{count_of_outliers} outlier datas.")
    print("Outlier datas: ", data_outlier, sep="\n")
  

In [None]:
# Columns to check for outliers: m = IMDB score, n = runtime.
m, n = df["IMDB Score"], df["Runtime"]

In [None]:
# Box plot and printed outlier report for m.
find_outlier(m)

In [None]:
# Box plot and printed outlier report for n.
find_outlier(n)