In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv("netflix_users.csv")
df.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre
0,1,James Martinez,18.0,France,Premium,80.26,Drama
1,2,John Miller,23.0,USA,Premium,321.75,Sci-Fi
2,3,Emma Davis,60.0,UK,Basic,35.89,Comedy
3,4,Emma Miller,44.0,USA,Premium,261.56,Documentary
4,5,Jane Smith,68.0,USA,Standard,909.3,Drama


In [2]:
df.tail()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre
24995,24996,David Miller,18.0,Australia,Premium,183.88,Horror
24996,24997,Jane Miller,17.0,USA,Premium,112.37,Drama
24997,24998,Sarah Miller,14.0,USA,Premium,351.8,Action
24998,24999,Michael Williams,71.0,USA,Basic,655.89,Action
24999,25000,James Jones,19.0,Germany,Premium,207.06,Documentary


In [3]:
print(df.shape)

(25000, 7)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   User_ID            25000 non-null  int64  
 1   Name               24983 non-null  object 
 2   Age                24992 non-null  float64
 3   Country            25000 non-null  object 
 4   Subscription_Type  24980 non-null  object 
 5   Watch_Time_Hours   24976 non-null  float64
 6   Favorite_Genre     25000 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 1.3+ MB


In [5]:
df.describe()

Unnamed: 0,User_ID,Age,Watch_Time_Hours
count,25000.0,24992.0,24976.0
mean,12500.5,46.484475,500.389758
std,7217.022701,19.59496,286.4062
min,1.0,13.0,0.0
25%,6250.75,29.0,256.4675
50%,12500.5,46.0,501.38
75%,18750.25,63.0,745.7075
max,25000.0,80.0,999.99


In [6]:
df1=df.copy(deep=True)

Finding Null Values

In [7]:
df.isna().sum()

User_ID               0
Name                 17
Age                   8
Country               0
Subscription_Type    20
Watch_Time_Hours     24
Favorite_Genre        0
dtype: int64

Eliminating Null Values

In [8]:
med=df1["Age"].median()
df1["Age"]=df1["Age"].fillna(med)
df1.isna().sum()

User_ID               0
Name                 17
Age                   0
Country               0
Subscription_Type    20
Watch_Time_Hours     24
Favorite_Genre        0
dtype: int64

In [9]:
me=df1["Watch_Time_Hours"].mean()
df1["Watch_Time_Hours"]=df1["Watch_Time_Hours"].fillna(me)
df1.isna().sum()

User_ID               0
Name                 17
Age                   0
Country               0
Subscription_Type    20
Watch_Time_Hours      0
Favorite_Genre        0
dtype: int64

In [10]:
df1["Subscription_Type"].fillna("Basic",inplace=True)
df1.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1["Subscription_Type"].fillna("Basic",inplace=True)


User_ID               0
Name                 17
Age                   0
Country               0
Subscription_Type     0
Watch_Time_Hours      0
Favorite_Genre        0
dtype: int64

In [11]:
df1.dropna(inplace=True)
df1.isna().sum()

User_ID              0
Name                 0
Age                  0
Country              0
Subscription_Type    0
Watch_Time_Hours     0
Favorite_Genre       0
dtype: int64

In [12]:
df1["Subscription_Type"].value_counts()

Subscription_Type
Premium     8394
Basic       8355
Standard    8234
Name: count, dtype: int64

In [13]:
df1["Favorite_Genre"].value_counts()

Favorite_Genre
Horror         3652
Documentary    3632
Action         3588
Romance        3570
Comedy         3560
Drama          3529
Sci-Fi         3452
Name: count, dtype: int64

In [14]:
df1["Country"].value_counts()

Country
UK           2588
Germany      2544
India        2505
USA          2503
Brazil       2502
Mexico       2491
Canada       2490
France       2471
Japan        2453
Australia    2436
Name: count, dtype: int64

Categorical to Numerical

In [15]:
mapping={"Basic":100,"Standard":200,"Premium":300}
df1["Subscription_Type"]=df1["Subscription_Type"].map(mapping)
df1.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre
0,1,James Martinez,18.0,France,300,80.26,Drama
1,2,John Miller,23.0,USA,300,321.75,Sci-Fi
2,3,Emma Davis,60.0,UK,100,35.89,Comedy
3,4,Emma Miller,44.0,USA,300,261.56,Documentary
4,5,Jane Smith,68.0,USA,200,909.3,Drama


In [16]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
cols=["Country","Favorite_Genre"]
for i in cols:
    df1[i]=le.fit_transform(df1[i])
df1.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre
0,1,James Martinez,18.0,3,300,80.26,3
1,2,John Miller,23.0,9,300,321.75,6
2,3,Emma Davis,60.0,8,100,35.89,1
3,4,Emma Miller,44.0,9,300,261.56,2
4,5,Jane Smith,68.0,9,200,909.3,3


Scaling

In [17]:
x=df1[["Age","Country","Watch_Time_Hours","Subscription_Type"]]
y=df1.Favorite_Genre

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

K Nearest Neighbour Algorithm

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=3,metric="euclidean")
knn.fit(x_train,y_train)

In [20]:
ypred=knn.predict(x_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,ypred))

0.14248549129477686


Decision Tree Algorithm

In [21]:
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
dtc.fit(x,y)

In [22]:
ypred=dtc.predict(x)
from sklearn.metrics import confusion_matrix
confusion_matrix(y,ypred)

array([[3588,    0,    0,    0,    0,    0,    0],
       [   0, 3560,    0,    0,    0,    0,    0],
       [   0,    0, 3632,    0,    0,    0,    0],
       [   0,    0,    0, 3529,    0,    0,    0],
       [   0,    0,    0,    0, 3652,    0,    0],
       [   0,    0,    0,    0,    1, 3569,    0],
       [   0,    0,    0,    0,    0,    0, 3452]], dtype=int64)

In [None]:
from sklearn.tree import plot_tree
plot_tree(dtc,feature_names=["Age","Country","Watch_Time_Hours","Subscription_Type"],class_names=["Horror","Documentary","Action","Romance","Comedy","Drama","Sci-Fi"])

[Text(0.600067336553765, 0.990909090909091, 'Age <= 67.5\ngini = 0.857\nsamples = 24983\nvalue = [3588.0, 3560.0, 3632.0, 3529.0, 3652.0, 3570.0, 3452.0]\nclass = Comedy'),
 Text(0.3043820066555807, 0.9727272727272728, 'Subscription_Type <= 150.0\ngini = 0.857\nsamples = 20266\nvalue = [2859.0, 2878.0, 2916.0, 2850.0, 3012.0, 2945.0, 2806.0]\nclass = Comedy'),
 Text(0.45222467160467283, 0.9818181818181819, 'True  '),
 Text(0.1213128776168396, 0.9545454545454546, 'Watch_Time_Hours <= 474.495\ngini = 0.857\nsamples = 6758\nvalue = [978.0, 970.0, 961.0, 966.0, 988.0, 1035.0, 860.0]\nclass = Drama'),
 Text(0.08012726696634212, 0.9363636363636364, 'Watch_Time_Hours <= 471.855\ngini = 0.857\nsamples = 3175\nvalue = [501.0, 445.0, 442.0, 466.0, 479.0, 438.0, 404.0]\nclass = Horror'),
 Text(0.04829608663587156, 0.9181818181818182, 'Watch_Time_Hours <= 7.36\ngini = 0.857\nsamples = 3157\nvalue = [495.0, 445.0, 439.0, 459.0, 479.0, 438.0, 402.0]\nclass = Horror'),
 Text(0.0010527992388651487, 0.