### Loading Libraries

In [None]:
# Basic Libraries
import itertools as it
import pickle
import sys
from collections import Counter

import numpy as np
import pandas as pd

# NLP libraries
import re
import string

import nltk

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn / scipy / xgboost libraries
import scipy.cluster.hierarchy as shc
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

# tensorflow libraries
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
def embedding(x_train, x_test, method, max_words=20000, max_len=200, embedding_dim=200):
    """Convert the train/test texts into features for the requested embedding method.

    Parameters
    ----------
    x_train, x_test
        Text splits. For 'GLOVE' / 'W2V' ``x_train`` must expose ``.values``
        (e.g. a pandas Series) because the tokenizer is fitted on it.
        For the '*_ML' methods they are handed unchanged to the
        ``w2v_encoding`` / ``glove_encoding`` helpers defined elsewhere.
    method : str
        One of 'GLOVE', 'W2V', 'W2V_ML', 'GLOVE_ML'.
    max_words : int, default 20000
        ``num_words`` passed to the Keras ``Tokenizer``.
    max_len : int, default 200
        Length every token sequence is padded / truncated to.
    embedding_dim : int, default 200
        Width of the embedding matrix; must match the vectors stored in
        ``glove_dictionary`` / ``w2v_dictionary``.

    Returns
    -------
    tuple
        ``(x_train_features, x_test_features, embedding_matrix)``.
        For 'GLOVE' / 'W2V' the features are padded index sequences and the
        matrix holds one row per tokenizer index. For the '*_ML' methods the
        features are lists of mean vectors and the third item is ``[]``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method in ('GLOVE', 'W2V'):
        # Pick the lookup table lazily so only the one actually requested
        # needs to exist in the notebook namespace.
        vectors = glove_dictionary if method == 'GLOVE' else w2v_dictionary

        tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
        tokenizer.fit_on_texts(x_train.values)
        word_index = tokenizer.word_index

        x_train_seq = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)
        x_test_seq = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_len)

        # Tokenizer indices start at 1 (0 is reserved for padding), so the
        # matrix needs len(word_index) + 1 rows to hold the highest index.
        embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
        for word, index in word_index.items():
            try:
                embedding_matrix[index, :] = vectors[word]
            except KeyError:
                # Word has no pretrained vector: its row stays all-zero.
                continue

        # Three values, matching how preprocess_data / run_model unpack the result.
        return x_train_seq, x_test_seq, embedding_matrix

    if method in ('W2V_ML', 'GLOVE_ML'):
        encode = w2v_encoding if method == 'W2V_ML' else glove_encoding
        # Average the per-token vectors so each text becomes one fixed-size vector.
        x_train_embeds = [np.array(vecs).mean(axis=0) for vecs in encode(x_train)]
        x_test_embeds = [np.array(vecs).mean(axis=0) for vecs in encode(x_test)]
        return x_train_embeds, x_test_embeds, []

    raise ValueError(
        "Unknown embedding method %r; expected one of "
        "'GLOVE', 'W2V', 'W2V_ML', 'GLOVE_ML'" % (method,)
    )
        



In [None]:
def preprocess_data(data, embedding_method, target_encoding):
    """Clean the descriptions, split into train/test and embed the texts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'description' column. Column 0 is used as the text
        feature and column 1 as the target.
    embedding_method : str
        Forwarded to ``embedding`` to choose the text representation.
    target_encoding
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(x_train_embeds, x_test_embeds, y_train, y_test)``.
    """
    # Work on a private copy so the caller's frame keeps its raw text
    # (run_model does the same).
    data = data.copy(deep=True)

    print("\n### CLEANING DATASET ###")
    print('Shape of dataset before cleaning', data.shape)

    data['description'] = data['description'].apply(clean_text)
    print('Shape of dataset after cleaning', data.shape)

    print("\n### TRAIN - TEST SPLIT ###")

    x = data.iloc[:, 0]
    y = data.iloc[:, 1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=100)
    print('Shape of x_train', x_train.shape)
    print('Shape of x_test', x_test.shape)
    print('Shape of y_train', y_train.shape)
    print('Shape of y_test', y_test.shape)

    print("\n### EMBEDDING TEXTS ###")

    # The embedding matrix is not part of this function's return value.
    x_train_embeds, x_test_embeds, embedding_matrix = embedding(x_train, x_test, embedding_method)

    print("\n ### Shape after embeddings ###")

    print('Shape of x_train', np.array(x_train_embeds).shape)
    print('Shape of x_test', np.array(x_test_embeds).shape)
    print('Shape of y_train', np.array(y_train).shape)
    print('Shape of y_test', np.array(y_test).shape)

    print("\n ### Shape of target after encoding ###")
    print("size of y_train: ", len(y_train))
    print("size of y_test: ", len(y_test))

    return x_train_embeds, x_test_embeds, y_train, y_test

In [None]:
def run_model(data, embedding_method, target_encoding, model):
    """Clean, split and embed the texts, then train and evaluate one classifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'description' column. Column 0 is used as the text
        feature and column 1 as the target. The caller's frame is not
        modified; a deep copy is cleaned instead.
    embedding_method : str
        Forwarded to ``embedding`` to choose the text representation.
    target_encoding
        Currently unused (the target-encoding step is disabled).
    model : str
        'Gaussian NB' or 'XGboost'.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name. This is checked up front so an
        unknown name fails before any cleaning or embedding work is done.
    """
    supported_models = ('Gaussian NB', 'XGboost')
    if model not in supported_models:
        raise ValueError(
            "Unknown model %r; expected one of %s" % (model, ", ".join(supported_models))
        )

    data = data.copy(deep=True)
    print("\n### CLEANING DATASET ###")
    print('Shape of dataset before cleaning', data.shape)
    # Positional access: label 0 may be missing if rows were dropped upstream.
    print(data['description'].iloc[0])
    data['description'] = data['description'].apply(clean_text)
    print('Shape of dataset after cleaning', data.shape)
    print(data['description'].iloc[0])
    print("\n### TRAIN - TEST SPLIT ###")

    x = data.iloc[:, 0]
    y = data.iloc[:, 1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=100)
    print('Shape of x_train', x_train.shape)
    print('Shape of x_test', x_test.shape)
    print('Shape of y_train', y_train.shape)
    print('Shape of y_test', y_test.shape)

    print("\n### EMBEDDING TEXTS ###")

    x_train_embeds, x_test_embeds, embedding_matrix = embedding(x_train, x_test, embedding_method)

    train_features = np.array(x_train_embeds)
    test_features = np.array(x_test_embeds)

    print("\n ### Shape after embeddings ###")

    print('Shape of x_train', train_features.shape)
    print('Shape of x_test', test_features.shape)
    print('Shape of y_train', np.array(y_train).shape)
    print('Shape of y_test', np.array(y_test).shape)

    print("\n ### Shape of target after encoding ###")
    print("size of y_train: ", len(y_train))
    print("size of y_test: ", len(y_test))

    # Only the banner and the estimator differ between the two models;
    # fitting, predicting and reporting are shared.
    if model == 'Gaussian NB':
        print("\n## TRAINING GAUSSIAN NAIVE BAYES ##")
        classifier = GaussianNB()
    else:
        print("\n## TRAINING XGboost ##")
        classifier = XGBClassifier()

    classifier.fit(train_features, np.array(y_train))

    print("## PREDICTING ##")
    preds = classifier.predict(test_features)

    print('### CLASSIFICATION REPORT ###')
    print(classification_report(y_test, preds))

        

### Loading Dataset

In [None]:
# Read the titles CSV and discard rows that have no description text.
data_org = pd.read_csv("netflix_titles_bert_HC_2.csv").dropna(subset=['description'])
print(data_org.shape)
data_org.head()

(8807, 14)


Unnamed: 0.1,Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,cluster
0,0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",1
1,1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",0
2,2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,13
3,3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",0
4,4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,5


In [None]:
# Keep only the text feature and its cluster label.
# .copy() detaches the result from data_org so later column assignments
# (e.g. cleaning 'description') do not raise SettingWithCopyWarning.
data = data_org[['description','cluster']].copy()
print(data.shape)
data.head()

(8807, 2)


Unnamed: 0,description,cluster
0,"As her father nears the end of his life, filmm...",1
1,"After crossing paths at a party, a Cape Town t...",0
2,To protect his family from a powerful drug lor...,13
3,"Feuds, flirtations and toilet talk go down amo...",0
4,In a city of coaching centers known to train I...,5


# Running Model

In [None]:
# Load the pickled encodings from disk.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
# The `with` block closes the file on exit, so no explicit fp.close() is needed.
with open('sentBert_embeddings_3.pkl', 'rb') as fp:
    sentBert_encodings = pickle.load(fp)

In [None]:
# Min-max rescale every encoding dimension, using the full encoding matrix
# both to learn the per-column range and as the data to transform.
MM_X = MinMaxScaler()
MM_X.fit(sentBert_encodings)
bert_X = MM_X.transform(sentBert_encodings)

In [None]:
# Split the raw encodings first and fit the scaler on the training rows only,
# so that no test-set statistics leak into the training features.
y = data.iloc[:,1]
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    np.array(sentBert_encodings), y, test_size=0.25, random_state=100)

train_scaler = MinMaxScaler().fit(x_train_raw)
x_train = train_scaler.transform(x_train_raw)
x_test = train_scaler.transform(x_test_raw)

print('Shape of x_train',x_train.shape)
print('Shape of x_test',x_test.shape)
print('Shape of y_train',y_train.shape)
print('Shape of y_test',y_test.shape)

# Gaussian Naive Bayes baseline on the scaled encodings.
nb = GaussianNB()
nb.fit(x_train, y_train)

print("## PREDICTING ##")
preds = nb.predict(x_test)

print('### CLASSIFICATION REPORT ###')
print(classification_report(y_test,preds))

Shape of x_train (6605, 768)
Shape of x_test (2202, 768)
Shape of y_train (6605,)
Shape of x_test (2202,)
## PREDICTING ##
### CLASSIFICATION REPORT ###
              precision    recall  f1-score   support

           0       0.68      0.64      0.66       213
           1       0.63      0.50      0.56       314
           2       0.61      0.61      0.61       203
           3       0.66      0.69      0.67       143
           4       0.64      0.68      0.66       103
           5       0.39      0.50      0.44       103
           6       0.59      0.51      0.55       134
           7       0.73      0.62      0.67       263
           8       0.59      0.62      0.60       115
           9       0.48      0.73      0.58        97
          10       0.46      0.58      0.51       133
          11       0.79      0.81      0.80       108
          12       0.68      0.85      0.75        27
          13       0.78      0.76      0.77       246

    accuracy                       

In [None]:

# XGBoost baseline on the same train/test split.
# The banner previously announced Gaussian Naive Bayes although XGBoost is trained here.
print("\n## TRAINING XGboost ##")
xgb = XGBClassifier(random_state = 100)
xgb.fit(x_train, np.array(y_train))

print("## PREDICTING ##")
preds = xgb.predict(x_test)

print('### CLASSIFICATION REPORT ###')
print(classification_report(y_test,preds))



## TRAINING GAUSSIAN NAIVE BAYES ##


### Deep Learning Model

In [None]:
# One-hot encode the targets; the category set is learned from the training labels only.
encoder = preprocessing.OneHotEncoder()

# Fit and encode the training labels in one step, then reuse the fitted encoder for the test labels.
y_train_OH = encoder.fit_transform(y_train.values.reshape(-1,1)).toarray()
y_test_OH = encoder.transform(y_test.values.reshape(-1,1)).toarray()

# Category values in the same order as the one-hot columns.
columns = encoder.categories_[0].tolist()

In [None]:
# Display (n_samples, n_features); the second value is what the network's input layer uses as input_dim.
x_train.shape

(6605, 768)

In [None]:

# Fully connected classifier on top of the scaled encodings.
# The softmax width is taken from the one-hot targets so it always equals the
# number of encoded classes; a hard-coded width can disagree with the encoder
# and make model.fit fail on a target/output shape mismatch.
n_classes = y_train_OH.shape[1]

model = Sequential()
model.add(Dense(units=512, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=n_classes, activation='softmax'))

In [None]:
# Keras documents `metrics` as a list of metrics; newer Keras versions reject a bare string.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the compiled model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 512)               393728    
                                                                 
 dense_16 (Dense)            (None, 256)               131328    
                                                                 
 dense_17 (Dense)            (None, 128)               32896     
                                                                 
 dense_18 (Dense)            (None, 64)                8256      
                                                                 
 dense_19 (Dense)            (None, 15)                975       
                                                                 
Total params: 567,183
Trainable params: 567,183
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training schedule.
epochs = 500
batch_size = 128

# Train the dense network; the test split is supplied as validation data.
# The value returned by fit() is kept under the name LSTM_model.
LSTM_model = model.fit(
    x=x_train,
    y=y_train_OH,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test_OH),
)

Epoch 1/500


ValueError: ignored

In [None]:
### Predicting on test data for calculating performance metrics

# Class probabilities -> index of the most likely class per row.
pred_probs = model.predict(x_test)
preds = np.argmax(pred_probs,axis = 1)

# Map the true labels onto class indices so they are comparable with the argmax output.
lEncoder = preprocessing.LabelEncoder()
lEncoder.fit(y_train)

# Keep y_test untouched: overwriting it would make this cell (and the earlier
# one-hot cell, which calls y_test.values) fail or change when re-run.
y_test_enc = lEncoder.transform(y_test)
print(classification_report(y_test_enc,preds))