## **Importing necessary libraries for building the model**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

## **Mounting Google Drive to access files**

In [None]:
# Mount Google Drive under /content/drive; later cells read the Kaggle
# credentials from it and write the processed data and the model to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Dataset**
Set up Kaggle API key and download datasets.

In [None]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/Colab\ Notebooks/Kaggle/kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download kazanova/sentiment140
!unzip sentiment140.zip
!kaggle datasets download yasserh/imdb-movie-ratings-sentiment-analysis
!unzip imdb-movie-ratings-sentiment-analysis.zip


Downloading sentiment140.zip to /content
 77% 62.0M/80.9M [00:00<00:00, 175MB/s]
100% 80.9M/80.9M [00:00<00:00, 177MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  
Downloading imdb-movie-ratings-sentiment-analysis.zip to /content
 49% 10.0M/20.6M [00:00<00:00, 26.5MB/s]
100% 20.6M/20.6M [00:00<00:00, 47.6MB/s]
Archive:  imdb-movie-ratings-sentiment-analysis.zip
  inflating: movie.csv               


Importing pandas library and loading datasets.

In [None]:
import pandas as pd

# The tweet file has no header row, so the column names are supplied here;
# only the label and the tweet text are kept.
df = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
).loc[:, ['target', 'text']]

# The review file is expected to provide `text` and `label` columns. The column
# labels are swapped by the rename and the contents are then swapped back, so
# the frame ends up with the labels under `target` and the reviews under `text`
# (in that column order), matching the tweet frame.
dfo_renamed = pd.read_csv('/content/movie.csv').rename(columns={'text': 'target', 'label': 'text'})
dfo = dfo_renamed.assign(text=dfo_renamed['target'], target=dfo_renamed['text'])

# Stack tweets and reviews into one frame with a fresh 0..n-1 index.
df = pd.concat([df, dfo], ignore_index=True)

In [None]:
# dfo

## Setting up the parameters for the neural network:
* vocabulary size
* embedding dimension
* maximum length of a sequence
* truncation and padding types
* out-of-vocabulary token
* size of the training set.

In [None]:
# Vocabulary cap passed to the Tokenizer (num_words) and used as the Embedding input size.
vocab_size = 20000
# Size of each embedding vector in the Embedding layer.
embedding_dim = 32
# Every sequence is padded/truncated to this many tokens before entering the model.
max_length = 100
# Where pad_sequences cuts sequences that are longer than max_length.
trunc_type='post'
# Where pad_sequences adds zeros to sequences that are shorter than max_length.
padding_type='post'
# Placeholder token the Tokenizer uses for words outside its vocabulary.
oov_tok = "<OOV>"
# NOTE(review): not referenced by any cell visible in this notebook - confirm before relying on it.
training_size = 20000

* Importing the Natural Language Toolkit (NLTK) library for natural language processing
* Downloading the stopwords and wordnet corpora using NLTK
* Unzipping the wordnet corpus to the appropriate directory.

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
!unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Archive:  /root/nltk_data/corpora/wordnet.zip
   creating: /root/nltk_data/corpora/wordnet/
  inflating: /root/nltk_data/corpora/wordnet/lexnames  
  inflating: /root/nltk_data/corpora/wordnet/data.verb  
  inflating: /root/nltk_data/corpora/wordnet/index.adv  
  inflating: /root/nltk_data/corpora/wordnet/adv.exc  
  inflating: /root/nltk_data/corpora/wordnet/index.verb  
  inflating: /root/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /root/nltk_data/corpora/wordnet/data.adj  
  inflating: /root/nltk_data/corpora/wordnet/index.adj  
  inflating: /root/nltk_data/corpora/wordnet/LICENSE  
  inflating: /root/nltk_data/corpora/wordnet/citation.bib  
  inflating: /root/nltk_data/corpora/wordnet/noun.exc  
  inflating: /root/nltk_data/corpora/wordnet/verb.exc  
  inflating: /root/nltk_data/corpora/wordnet/README  
  inflating: /root/nltk_data/corpora/wordnet/index.sense  
  inflating: /root/nltk_data/corpora/wordnet/data.noun  
  inflating: /root/nltk_data/corpora/wordnet/data.adv  


In [None]:
# df

* The function preprocess() takes a list of tokens as input and joins them into a single string.
* It preprocesses the text by:
 * Replacing repeated punctuation signs with labels and adding spaces.
 * Replacing single punctuation signs (`.`, `!`, `?`, `,`) with a space.
 * Lowercasing the text.
 * Removing stopwords.
 * Lemmatizing the text.
* The function returns the preprocessed text.

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Filled on first use so that the stopword corpus is read once per kernel
# instead of once per processed row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Clean a tokenised text and return it as one space-separated string.

    Steps, in order: join the tokens with spaces; replace runs of two or more
    '.', '!' or '?' with the labels ``multistop``, ``multiexclamation`` and
    ``multiquestion``; replace every remaining '.', '!', '?' and ',' with a
    space; lower-case; drop English stop words; lemmatise each remaining word.

    Parameters:
        text: iterable of string tokens (they are joined with ``' '``).

    Returns:
        The cleaned text as a single string.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    text = ' '.join(text)
    # Replace repeated punctuation signs with labels surrounded by spaces
    text = re.sub(r'\.{2,}', ' multistop ', text)
    text = re.sub(r'!{2,}', ' multiexclamation ', text)
    text = re.sub(r'\?{2,}', ' multiquestion ', text)
    # Single punctuation signs are removed: each one becomes a space
    text = re.sub(r'[.!?,]', ' ', text)

    text = text.lower()

    # Stop-word removal and lemmatisation in a single pass over the words
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )



In [None]:
# df["text"] = df["text"].apply(preprocess)

# df.head()

## **Function: load_emoticons(emo_filename)**

* Load emoticons and their polarity from a file
* Parameters:
 * emo_filename: path to the emoticon file
* Returns:
 * A dictionary containing the emoticons as keys and their polarity as values

## **Function: replace_emoticons(text, emoticon_dict=emoticon_dict)**

* Replace emoticons with their polarity and delete neutral ones
* Parameters:
 * text: input text containing emoticons
 * emoticon_dict: dictionary containing emoticons as keys and their polarity as values
* Returns:
 * A list of words obtained after replacing emoticons with their polarity and deleting neutral ones

In [None]:
def load_emoticons(emo_filename):
    """Load emoticons and their polarity from a tab-separated file.

    Every non-blank line must contain exactly two tab-separated fields: the
    emoticon and its polarity. Blank lines are skipped.

    Parameters:
        emo_filename: path to the emoticon file (read as latin-1).

    Returns:
        A dictionary mapping each emoticon to its polarity, both as strings.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    emoticon_dict = {}
    with open(emo_filename, 'r', encoding='latin-1') as file:
        for line_no, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # A stray empty line would otherwise abort the whole load
                # with an unhelpful unpacking error.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{emo_filename}, line {line_no}: expected 'emoticon<TAB>polarity', got {line!r}"
                )
            emoticon, polarity = fields
            emoticon_dict[emoticon] = polarity
    return emoticon_dict

# Emoticon -> polarity table used as the default lookup by replace_emoticons.
# The file is expected at this path in the Colab runtime.
emoticon_dict = load_emoticons('/content/EmoticonLookupTable.txt')

def replace_emoticons(text, emoticon_dict=emoticon_dict):
    """Replace emoticons by polarity words and return the remaining words.

    Each emoticon in ``emoticon_dict`` is matched literally and
    case-insensitively. Polarity ``'1'`` becomes ``positive``, ``'-1'``
    becomes ``negative`` and any other polarity removes the emoticon. After
    that every character that is neither an ASCII letter nor whitespace is
    dropped.

    Parameters:
        text: input string.
        emoticon_dict: mapping emoticon -> polarity string.

    Returns:
        The list of whitespace-separated words of the cleaned text.
    """
    polarity_words = {'1': "positive", '-1': "negative"}

    for emoticon in emoticon_dict:
        replacement = polarity_words.get(emoticon_dict[emoticon], '')
        text = re.sub(re.escape(emoticon), replacement, text, flags=re.IGNORECASE)

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.split()



In [None]:
# df["text"] = df["text"].apply(replace_emoticons)
# df.head()

* load_slang(slang_filename): loads a slang dictionary from a file and returns it as a Python dictionary.
* replace_slang(tokens, slang_dict=slang_dict): replaces slang words in a list of tokens with their corresponding meanings using the slang dictionary.


In [None]:
def load_slang(slang_filename):
    """Load slang expressions and their meanings from a tab-separated file.

    Every non-blank line must contain exactly two tab-separated fields: the
    slang expression and its meaning. Blank lines are skipped.

    Parameters:
        slang_filename: path to the slang file (read as latin-1).

    Returns:
        A dictionary mapping each slang expression to its meaning.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    slang_dict = {}
    with open(slang_filename, 'r', encoding='latin-1') as file:
        for line_no, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # A stray empty line would otherwise abort the whole load
                # with an unhelpful unpacking error.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{slang_filename}, line {line_no}: expected 'slang<TAB>meaning', got {line!r}"
                )
            slang, meaning = fields
            slang_dict[slang] = meaning
    return slang_dict

# Slang -> meaning table used as the default lookup by replace_slang.
# The file is expected at this path in the Colab runtime.
slang_dict = load_slang('/content/SlangLookupTable.txt')

def replace_slang(tokens, slang_dict=slang_dict):
    """Replace slang tokens by their meaning, in place.

    Parameters:
        tokens: mutable sequence of tokens; it is modified and returned.
        slang_dict: mapping slang token -> meaning.

    Returns:
        The same ``tokens`` object, with every token found in ``slang_dict``
        replaced by its meaning (a multi-word meaning stays a single element).
    """
    replacements = [
        (position, slang_dict[token])
        for position, token in enumerate(tokens)
        if token in slang_dict
    ]
    for position, meaning in replacements:
        tokens[position] = meaning

    return tokens



In [None]:
# df["text"] = df["text"].apply(replace_slang)
# df.head()

Code preprocesses the input text by labeling user mentions, hashtags, and URLs as PERSON, TOPIC, and URL respectively.

In [None]:
def label_user_topic(tokens):
    """Replace mentions, hashtags and links by generic labels.

    A token starting with ``@`` becomes ``PERSON``, one starting with ``#``
    becomes ``TOPIC`` and one starting with ``http`` becomes ``URL``; every
    other token is kept as is.

    Parameters:
        tokens: iterable of string tokens.

    Returns:
        A new list with one entry per input token.
    """
    prefix_labels = (("@", "PERSON"), ("#", "TOPIC"), ("http", "URL"))

    def _label(token):
        for prefix, label in prefix_labels:
            if token.startswith(prefix):
                return label
        return token

    return [_label(token) for token in tokens]



In [None]:
# df['text'] = df['text'].apply(label_user_topic)
# df.head()

This code defines functions to reduce words to their base form and normalize words in a list of tokens. The reduce_word function checks if a word is in WordNet (via NLTK), and if not, iteratively removes repeated letters until it finds a match, in which case the shortened word is returned prefixed with "STRESSED". The normalize_words function applies reduce_word to each word in a list of tokens and returns the normalized list.

In [None]:
from nltk.corpus import wordnet as wn

def reduce_word(word):
    """Shorten an elongated word until WordNet knows it.

    Parameters:
        word: the word to look up.

    Returns:
        * ``word`` unchanged if WordNet has at least one synset for it;
        * ``"STRESSED " + shortened`` if removing letters (see below) yields a
          form WordNet knows;
        * the original ``word`` otherwise.
    """
    if wn.synsets(word):
        return word

    # Work on a copy: the input must be returned untouched when no shortened
    # form is found (previously the half-shortened word leaked out, e.g.
    # "facebook" came back as "facebok").
    candidate = word

    # Walk from the last letter towards the front. Removing a letter at or
    # just before position i never invalidates the smaller indices still to
    # be visited.
    for i in range(len(candidate) - 1, 1, -1):
        if candidate[i] == candidate[i-1]:
            # Doubled letter: drop the second occurrence.
            candidate = candidate[:i] + candidate[i+1:]
        elif i > 2 and candidate[i] == candidate[i-2]:
            # Same letter two positions back: drop the letter in between.
            # NOTE(review): this also fires on patterns like "aba"; confirm
            # that is the intended rule.
            candidate = candidate[:i-1] + candidate[i:]
        else:
            continue

        if wn.synsets(candidate):
            return "STRESSED " + candidate

    # If no match is found, return the original word
    return word


def normalize_words(tokens):
    """Apply reduce_word to every word-like token.

    A token whose beginning matches ``\\b\\w+\\b`` is lower-cased and passed to
    reduce_word. If the result differs from the lower-cased token it replaces
    the token; otherwise the original token (original casing) is kept. Tokens
    that do not match are kept as is.

    Parameters:
        tokens: iterable of string tokens.

    Returns:
        A new list with one entry per input token.
    """
    word_pattern = re.compile(r'\b\w+\b')

    def _normalize(token):
        if not word_pattern.match(token):
            return token
        lowered = token.lower()
        reduced = reduce_word(lowered)
        return reduced if reduced != lowered else token

    return [_normalize(token) for token in tokens]


In [None]:

# df["text"] = df["text"].apply(normalize_words)
# df.tail()

Install the twython package and download the vader_lexicon, opinion_lexicon, and sentiwordnet resources from NLTK.

In [None]:
!pip install twython
nltk.download('vader_lexicon')
nltk.download('opinion_lexicon')
nltk.download('sentiwordnet')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting twython
  Downloading twython-3.9.1-py3-none-any.whl (33 kB)
Installing collected packages: twython
Successfully installed twython-3.9.1


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

This function matches affect words to tokens and returns the tokens with affect labels.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import opinion_lexicon, sentiwordnet


# word -> affect label mapping; built on the first call and reused afterwards
# (building it scores every lexicon word with VADER, far too slow to repeat per row).
_AFFECT_WORD_LABELS = None


def _build_affect_word_labels():
    """Build the word -> affect label mapping from the opinion lexicon and VADER."""
    sia = SentimentIntensityAnalyzer()
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())
    # NOTE(review): senti_synsets is queried with an empty word; presumably
    # that yields no synsets, leaving both "h*" sets empty - verify, and check
    # the (word, pos) unpacking of synset.lemmas() before relying on them.
    hpositive_words = set(word for synset in sentiwordnet.senti_synsets('', 'a') for word, pos in synset.lemmas()
                          if synset.pos_score() > 0.75 and word not in positive_words)
    hnegative_words = set(word for synset in sentiwordnet.senti_synsets('', 'a') for word, pos in synset.lemmas()
                          if synset.neg_score() > 0.75 and word not in negative_words)

    affect_words = positive_words | negative_words | hpositive_words | hnegative_words

    # The label depends on VADER's compound score of the word itself; words
    # with a weak score fall back to the lexicon they came from.
    word_to_label = {}
    for word in affect_words:
        compound = sia.polarity_scores(word)['compound']
        if compound >= 0.5:
            word_to_label[word] = 'hpositive'
        elif compound > -0.5:
            word_to_label[word] = 'positive' if word in positive_words else 'negative'
        else:
            word_to_label[word] = 'hnegative'
    return word_to_label


def match_affect_words(tokens):
    """Replace affect words by their affect label.

    Parameters:
        tokens: iterable of string tokens.

    Returns:
        A new list in which every token found in the affect lexicon is
        replaced by ``hpositive``, ``positive``, ``negative`` or ``hnegative``;
        all other tokens are kept as is.
    """
    global _AFFECT_WORD_LABELS
    if _AFFECT_WORD_LABELS is None:
        _AFFECT_WORD_LABELS = _build_affect_word_labels()

    labels = _AFFECT_WORD_LABELS
    return [labels.get(token, token) for token in tokens]



In [None]:
# df["text"] = df["text"].apply(match_affect_words)
# df.tail()

This function matches modifier words in a list of tokens and returns a new list with the matched tokens replaced by their corresponding modifier labels: "negator", "intensifier", or "diminisher". The modifier word lists are defined at module level, just before the function.

In [None]:
# define the lists of negation, intensification and diminishment expressions
# NOTE: tokens are compared one at a time, so multi-word entries such as
# "disapprove of" can never match a single token.
negation_list = ["no", "not", "never", "none", "nobody", "nowhere", "nothing", "neither", "nor", "cannot", "can't", "don't", "doesn't", "didn't", "won't", "wouldn't", "shouldn't", "couldn't", "isn't", "aren't", "ain't", "hate", "dislike", "disapprove", "disapprove of", "disagree", "disagree with", "reject", "rejects", "rejected", "refuse", "refuses", "refused", "never", "rarely", "seldom", "hardly", "scarcely", "barely"]
intensification_list = ["very", "extremely", "super", "really", "quite", "most", "more", "quite", "too", "enough", "so", "such", "just", "almost", "absolutely", "completely", "totally", "utterly", "highly", "deeply", "greatly", "seriously", "intensely", "especially", "exceedingly", "exceptionally", "particularly", "unusually", "incredibly", "undeniably", "undeniable", "emphatically", "decidedly", "really", "truly", "hugely", "mega", "ultra", "majorly", "extraordinarily", "mightily", "fully", "mightily", "perfectly", "thoroughly", "utterly", "all", "way", "significantly", "terribly", "awfully", "fantastically"]
diminishment_list = ["little", "slightly", "somewhat", "kind", "sort", "bit", "little", "moderately", "marginally", "fairly", "reasonably", "comparatively", "relatively", "tad", "touch", "extent"]

# token -> label lookup built once from the lists above, replacing three
# linear list scans per token. The lists are applied in reverse precedence so
# that, should a word appear in several lists, negation wins over
# intensification, which wins over diminishment.
_MODIFIER_LABELS = {}
for _label, _words in (("diminisher", diminishment_list),
                       ("intensifier", intensification_list),
                       ("negator", negation_list)):
    _MODIFIER_LABELS.update(dict.fromkeys(_words, _label))


def match_modifier_words(tokens):
    """Replace modifier words by their modifier label.

    Parameters:
        tokens: iterable of string tokens (matching is case-sensitive).

    Returns:
        A new list in which tokens from the negation, intensification and
        diminishment lists are replaced by ``negator``, ``intensifier`` and
        ``diminisher`` respectively; all other tokens are kept as is.
    """
    return [_MODIFIER_LABELS.get(token, token) for token in tokens]


In [None]:

# df["text"] = df["text"].apply(match_modifier_words)
# df.tail()

* Converting all text to lowercase
* Splitting text into tokens
* Matching modifier words (negation, intensification, and diminishment expressions) and labeling them accordingly
* Applying additional preprocessing steps, including replacing emoticons and slang, labeling user and topic mentions, and normalizing words
* Returning the preprocessed DataFrame

In [None]:
def process2(df):
    """Run the text preprocessing pipeline on ``df['text']``.

    The text is lower-cased and split on whitespace, then passed through
    match_modifier_words, preprocess, replace_emoticons, replace_slang,
    label_user_topic and normalize_words, in that order.

    Parameters:
        df: DataFrame with a string column ``text``. Missing values are
            treated as empty strings.

    Returns:
        The same DataFrame object, with a ``preprocessed_text`` column holding
        the resulting token list of each row (added or overwritten in place).
    """
    # Split the *lower-cased* text. Splitting the raw column again, as was
    # done before, threw the lower-casing away, so capitalised modifiers such
    # as "Not" were never labelled.
    tokens = df['text'].fillna('').str.lower().str.split()

    # NOTE(review): label_user_topic runs after replace_emoticons has removed
    # every non-letter character, so the "@" and "#" prefixes it looks for are
    # already gone at that point - confirm the intended order.
    pipeline = (
        match_modifier_words,
        preprocess,
        replace_emoticons,
        replace_slang,
        label_user_topic,
        normalize_words,
    )
    for step in pipeline:
        tokens = tokens.apply(step)

    df['preprocessed_text'] = tokens

    return df

In [None]:
# Run the preprocessing pipeline over the combined data set; process2 adds a
# `preprocessed_text` column (one token list per row) and returns the frame.
df = process2(df)

In [None]:
# Turn each token list back into one space-separated string, which is the
# form the tokenizer is later fitted on.
df['preprocessed_text'] = df['preprocessed_text'].map(' '.join)
df.head()

Unnamed: 0,target,text,preprocessed_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfot URL comyzl aw thats bummer shoulda g...
1,0,is upset that he can't update his Facebook by ...,upset negator update facebok texting multistop...
2,0,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclas negator behaving im mad negator...


Save to the .csv file

In [None]:
# Keep a copy of the preprocessed data on Drive; index=False leaves the row
# index out of the file.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/processed_comment.csv', index=False)

The code splits the dataframe into train and test sets for a binary classification problem. The target labels are first converted from {0, 4} to {0, 1}. The train_test_split function from sklearn is used for the split (80% train, 20% test).

In [None]:
X = df["preprocessed_text"]
# Labels equal to 4 are mapped to 1 so that the target only takes the values
# 0 and 1, as required by the sigmoid output / binary cross-entropy loss.
y = df["target"].replace({4: 1})

# 80 % of the rows go to training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

Creates a tokenizer object with a vocabulary size of vocab_size and an out-of-vocabulary token of oov_tok.

In [None]:
# Word-level tokenizer limited to `vocab_size` words; words outside the
# vocabulary are mapped to the `oov_tok` placeholder.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)


In [None]:
# Build the vocabulary from the training split only, so the test texts do not
# influence the word indices.
tokenizer.fit_on_texts(X_train)
# word -> integer index mapping learned from the training texts.
word_index = tokenizer.word_index

These lines of code use the Tokenizer object to fit on the preprocessed training text and generate a word index. The fit_on_texts method updates the internal vocabulary based on the frequency of each word in the training set, while the word_index attribute returns a dictionary containing the unique words in the training set as keys and their corresponding index as values.

We use the Tokenizer class to tokenize the training and testing data. First, we fit the tokenizer on the training data using tokenizer.fit_on_texts(X_train), which builds the vocabulary from the training text. Then, we use the tokenizer to convert the text to sequences using tokenizer.texts_to_sequences(), and pad the sequences to a fixed length using pad_sequences(). This creates numerical inputs that can be fed into a neural network for classification.

In [None]:
# Shared padding options: every sequence ends up exactly `max_length` long.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Texts -> lists of word indices -> fixed-length integer arrays.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

This code defines a sequential model in Keras with an embedding layer, two bidirectional LSTM layers, two dense layers, and a dropout layer. The model is then compiled with binary crossentropy loss, Adam optimizer, and accuracy metric.

In [None]:
keras_layers = tf.keras.layers

# Embedding -> two stacked bidirectional LSTMs -> dense head with dropout ->
# single sigmoid unit giving the probability of the positive class.
model = tf.keras.Sequential()
model.add(keras_layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# The first LSTM returns the full sequence so the second one can consume it.
model.add(keras_layers.Bidirectional(keras_layers.LSTM(64, return_sequences=True)))
model.add(keras_layers.Bidirectional(keras_layers.LSTM(32)))
model.add(keras_layers.Dense(64, activation='relu'))
model.add(keras_layers.Dropout(0.5))
model.add(keras_layers.Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

This code trains the model for 3 epochs with a batch size of 16, using the preprocessed training data and validation data. The training progress is stored in the history variable.

In [None]:
# Number of passes over the training data.
num_epochs = 3
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics and the final test metrics are computed on the
# same rows - confirm whether a separate validation split is wanted.
history = model.fit(X_train_padded, y_train, epochs=num_epochs,batch_size=16, validation_data=(X_test_padded, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


The code below evaluates the trained model using the test set and prints the test accuracy.

In [None]:
# Loss and accuracy on the test split (the metrics configured in model.compile).
test_loss, test_acc = model.evaluate(X_test_padded, y_test, verbose=2)
print("Test Accuracy: ", test_acc)

10250/10250 - 84s - loss: 0.4057 - accuracy: 0.8164 - 84s/epoch - 8ms/step
Test Accuracy:  0.8164024353027344


In [None]:
# Test with input entry
entry = ["I'm not sure if I don't like it or if I just don't understand it."]

# The model was trained on `preprocessed_text`, so the raw sentence has to go
# through the same steps, in the same order as in process2, before it is
# tokenized. Feeding the raw sentence would turn words such as "don't" into
# out-of-vocabulary tokens.
processed = entry[0].lower().split()
for step in (match_modifier_words, preprocess, replace_emoticons,
             replace_slang, label_user_topic, normalize_words):
    processed = step(processed)

test_text = tokenizer.texts_to_sequences([' '.join(processed)])
test_text_padded = pad_sequences(test_text, maxlen=max_length, padding=padding_type, truncating=trunc_type)

prediction = model.predict(test_text_padded)

#0 : bad
#1 : good
print(prediction)

[[0.468318]]


Saving the model as a .keras file

In [None]:
# Store the trained model on Drive so it can be reloaded without retraining.
model.save('/content/drive/MyDrive/Colab Notebooks/model.keras')

Loading the saved model

In [None]:
# Reload the model that was saved to Drive; `load_model` is the variable the
# prediction cells below use.
load_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/model.keras')

# Show the model architecture
load_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           640000    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         49664     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 6

In [None]:
# Test with input entry
entry = ["I disagree with some of the points made in this video."]

# Same preprocessing as the training data (see process2) so that the sentence
# is expressed with the vocabulary the model was trained on.
processed = entry[0].lower().split()
for step in (match_modifier_words, preprocess, replace_emoticons,
             replace_slang, label_user_topic, normalize_words):
    processed = step(processed)

# The tokenizer must NOT be fitted again here: refitting updates the word
# counts and rebuilds the word -> index mapping, so indices would no longer
# match the embedding rows the model learned.
test_text = tokenizer.texts_to_sequences([' '.join(processed)])
test_text_padded = pad_sequences(test_text, maxlen=max_length, padding=padding_type, truncating=trunc_type)

prediction = load_model.predict(test_text_padded)

#0 : bad
#1 : good
print(prediction)

[[0.3351707]]


Define a preprocess function for dataframe input

In [None]:
def process(df):
    """Run the text preprocessing pipeline on ``df['text']`` in place.

    String entries are lower-cased and split on whitespace first; entries
    that are already token lists are used as they are. The tokens then pass
    through match_modifier_words, preprocess, replace_emoticons,
    replace_slang, label_user_topic and normalize_words, in that order.

    Parameters:
        df: DataFrame with a ``text`` column of strings or token lists.

    Returns:
        The same DataFrame object; ``text`` now holds one token list per row.
    """
    # Without this step a raw string was iterated character by character by
    # match_modifier_words, and preprocess then re-joined the single letters.
    tokens = df["text"].apply(
        lambda value: value.lower().split() if isinstance(value, str) else value
    )

    for step in (match_modifier_words, preprocess, replace_emoticons,
                 replace_slang, label_user_topic, normalize_words):
        tokens = tokens.apply(step)

    df["text"] = tokens

    return df

Define a preprocess function for text input

In [None]:
def p_text(text):
    """Preprocess one raw sentence and return the loaded model's prediction.

    The sentence is lower-cased, split on whitespace and passed through the
    same steps as the training data. The preprocessed sentence and its padded
    index sequence are printed.

    Parameters:
        text: the raw sentence as a string.

    Returns:
        The output of ``load_model.predict`` for the sentence; values above
        0.5 are treated as positive sentiment elsewhere in this notebook.
    """
    processed = text.lower().split()
    for step in (match_modifier_words, preprocess, replace_emoticons,
                 replace_slang, label_user_topic, normalize_words):
        processed = step(processed)

    entry = [' '.join(processed)]
    print(entry)

    # The tokenizer is deliberately not refitted: fitting on inference text
    # changes the word counts and rebuilds the word -> index mapping, which
    # would feed the model indices it was not trained with.
    test_text = tokenizer.texts_to_sequences(entry)
    test_text_padded = pad_sequences(test_text, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    print(test_text_padded)

    prediction = load_model.predict(test_text_padded)
    return prediction

print(p_text("good work, keep it up!"))

[[2 4 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0.19152078]]


Connect to the YouTube Data API and save the comments of a video to a .csv file

In [None]:
from googleapiclient.discovery import build
import pandas as pd
import csv
!pip install pytube
from pytube import extract

api_key = 'AIzaSyAWvcYyhifqwreIpseWyyFGMljWOEbO0lI'



number = 0
comm=[]

def video_comments(url):
    """Download the top-level comments of a YouTube video into a CSV file.

    All result pages of the ``commentThreads`` endpoint are fetched. The text
    of each top-level comment (``textDisplay``) is written to
    ``/content/comment.csv`` under a ``Comments`` header, one comment per row.
    The rows are also appended to the module-level ``comm`` list.

    Parameters:
        url: URL of the video; the video id is extracted from it.

    Returns:
        None.
    """
    video_id = extract.video_id(url)
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Rows of this call only: writing the shared `comm` list to the file would
    # mix in comments collected by earlier calls.
    rows = []
    page_token = None
    while True:
        request_args = {'part': 'snippet,replies', 'videoId': video_id}
        if page_token:
            request_args['pageToken'] = page_token
        video_response = youtube.commentThreads().list(**request_args).execute()

        for item in video_response.get('items', []):
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            rows.append([comment])

        # The API returns a token for as long as more pages are available.
        page_token = video_response.get('nextPageToken')
        if not page_token:
            break

    comm.extend(rows)

    # Explicit UTF-8: comments routinely contain emoji and other non-ASCII
    # text. The `with` block closes the file on exit.
    with open('/content/comment.csv', 'w', newline='', encoding='utf-8') as filee:
        writer = csv.writer(filee)
        writer.writerow(["Comments"])
        writer.writerows(rows)
# Video whose comments are analysed in the rest of the notebook.
VIDEO_URL = "https://www.youtube.com/watch?v=qHdidWEuyVI"
video_id = extract.video_id(VIDEO_URL)

# Fetch the comments and write them to /content/comment.csv.
video_comments(VIDEO_URL)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytube
  Downloading pytube-12.1.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-12.1.3


In [None]:
# import csv

# # Open the CSV file
# with open('comment.csv', mode='r') as infile:
#     reader = csv.reader(infile)
#     rows = list(reader)

# # Change the header
# rows[0] = ['text']

# # Write the new header to the CSV file
# with open('comment.csv', mode='w', newline='') as outfile:
#     writer = csv.writer(outfile)
#     writer.writerows(rows)

In [None]:
# Only the first column (the comment text) is read, so the cell can be re-run
# after comment.csv has been rewritten with additional columns. dtype=str and
# keep_default_na=False keep comments such as "NA", "null" or "123" as text
# instead of turning them into NaN or numbers.
df2 = pd.read_csv('comment.csv', usecols=[0], dtype=str, keep_default_na=False)
df2.columns = ['text']
df2 = df2.reset_index(drop=True)
# process2 adds the `preprocessed_text` column (one token list per row) to the
# frame and returns the frame itself, so its result replaces df2 as a whole;
# assigning the returned frame to a single column was incorrect.
df2 = process2(df2)
df2['preprocessed_text'] = df2['preprocessed_text'].apply(lambda x: ' '.join(x))
# save the updated dataframe to csv
df2.to_csv('comment.csv', index=False)

In [None]:
# Reload the saved model from Drive (same file as loaded earlier in the
# notebook) before scoring the downloaded comments.
load_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/model.keras')

# Show the model architecture
load_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           640000    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         49664     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 6

In [None]:
comments = df2['preprocessed_text']
# The tokenizer fitted on the training data is used as is. Refitting it on the
# comments would change the word counts and rebuild the word -> index mapping,
# so the indices would no longer match what the model was trained with.
test_text = tokenizer.texts_to_sequences(comments)
test_text_padded = pad_sequences(test_text, maxlen=max_length, padding=padding_type, truncating=trunc_type)

prediction = load_model.predict(test_text_padded)

# The model ends in a single sigmoid unit, so predict returns one column;
# flatten it to one score per comment.
df2['prediction'] = prediction.ravel()
# function to replace values greater than 0.5 with 1 and others with 0
replace_func = lambda x: 1 if x > 0.5 else 0

# apply the function to the column and store the result in a new column
df2['sentiment'] = df2['prediction'].apply(replace_func)
df2.to_csv('comment.csv', index=False)
print(df2['sentiment'])

0      0
1      0
2      1
3      0
4      0
      ..
248    0
249    0
250    0
251    1
252    1
Name: sentiment, Length: 253, dtype: int64
