In [1]:
import pandas as pd
import numpy as np
import requests
import os
import tensorflow as tf

In [2]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [3]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [4]:
API_KEY = os.environ.get('API_KEY')

In [5]:
BIGdf = pd.read_csv('comments/final_df_CLEAN.csv')

In [6]:
BIGdf = BIGdf.head(400)

In [7]:
BIGdf

Unnamed: 0.1,Unnamed: 0,comment,author,likecount,date,replies,video_id
0,0,Respect to Dortmund fans must be sad losing him.,Uh Idk,1932,2021-07-01T10:33:16Z,59,--0bCF-iK2E
1,1,A real talent hope he doesn't turn into anoth...,7h35h96u9,617,2021-07-01T10:28:25Z,34,--0bCF-iK2E
2,2,wish him well in manchester united,Guncius,2583,2021-07-01T10:01:00Z,94,--0bCF-iK2E
3,3,One of the most talented players Bundesliga ha...,Rishi Joe Sanu,57,2021-07-01T12:46:06Z,1,--0bCF-iK2E
4,4,What a spell in the Bundesliga Goodluck in th...,H2KAL5859,66,2021-07-01T10:01:31Z,0,--0bCF-iK2E
...,...,...,...,...,...,...,...
395,395,Nice job seth love the vids,Bossnoopdawg,1,2021-03-03T15:00:49Z,0,-024Swollbc
396,396,When I clicked on the video I realised how tir...,Callum Musgrave,0,2021-03-04T00:14:31Z,0,-024Swollbc
397,397,i remember when the bike went upto $AU70 000 i...,tookken,0,2021-03-03T21:24:40Z,0,-024Swollbc
398,398,Keep up the good work,Lisaferrypilatesfit,1,2021-03-03T15:10:57Z,0,-024Swollbc


In [8]:
BIGdf['video_id']

0      --0bCF-iK2E
1      --0bCF-iK2E
2      --0bCF-iK2E
3      --0bCF-iK2E
4      --0bCF-iK2E
          ...     
395    -024Swollbc
396    -024Swollbc
397    -024Swollbc
398    -024Swollbc
399    -024Swollbc
Name: video_id, Length: 400, dtype: object

In [9]:
BIGdf.value_counts('video_id').keys()

Index(['--0bCF-iK2E', '--DKkzWVh-E', '-024Swollbc'], dtype='object', name='video_id')

In [10]:
likes = np.nan

In [11]:
def making_weights(num):
    '''This function makes weights for each comment based on its like count (num)'''
    if num == 0:
        return 1
    elif num > 0 and num <= np.median(likes[:len(likes)//2]):
        return 2
    elif num > np.median(likes[:len(likes)//2]) and num <= np.median(likes):
        return 3
    elif num > np.median(likes) and num < np.median(likes[len(likes)//2:]):
        return 4
    else:
        return 5

In [12]:
def df_cutter(df):
    IDs_list = df.value_counts('video_id').keys()
    cut_dfs = []
    for i, video in enumerate(IDs_list):
        cut_df = df[df['video_id'] == IDs_list[i]]
        global likes
        likes = sorted(list(cut_df[cut_df['likecount'] > 0]['likecount']))
        cut_df['weight'] = cut_df['likecount'].apply(making_weights)
        cut_dfs.append(cut_df)
    return cut_dfs

In [13]:
cut_dfs = df_cutter(BIGdf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cut_df['weight'] = cut_df['likecount'].apply(making_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cut_df['weight'] = cut_df['likecount'].apply(making_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cut_df['weight'] = cut_df['likecount'].apply(making_weights)


In [14]:
def sentiment_score_comment(df):

    '''This function predicts the sentiment score of each youtube video!'''
    
    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    # Lists to store the sentiment analysis results
    sentiment_list = []
    negative_list = []
    neutral_list = []
    positive_list = []
    scalar_value_list = []
    weighted_SV = []
    weight = list(df['likecount'].apply(making_weights))

    # Iterate over the comments in the DataFrame
    for i, text in enumerate(df['comment']):
        
        # Tokenization, Sentiment Prediction, and Interpretation
        tokens = tokenizer.encode_plus(text, add_special_tokens=True, padding='longest', truncation=True, max_length=512, return_tensors='tf')
        outputs = model(tokens.input_ids)
        logits = outputs.logits
        prediction = np.array(tf.nn.softmax(logits)[0])
        predicted_class = tf.argmax(logits, axis=1).numpy()[0]
        sentiment_labels = ["Negative", "Neutral", "Positive"]
        predicted_sentiment = sentiment_labels[predicted_class]

        # Append the sentiment analysis results to the respective lists
        sentiment_list.append(predicted_sentiment)
        negative_list.append(round(prediction[0]*100, 2))
        neutral_list.append(round(prediction[1]*100, 2))
        positive_list.append(round(prediction[2]*100, 2))
        scalar_value_val = round((prediction[0])*-1+(prediction[2]*1),2)
        scalar_value_list.append(scalar_value_val)
        weighted_SV.append(df['weight'].iloc[i] * scalar_value_val)


    # Create a new DataFrame with the sentiment analysis results
    results_df = pd.DataFrame({
        'Comment': df['comment'],
        'Sentiment': sentiment_list,
        'Negative (%)': negative_list,
        'Neutral (%)': neutral_list,
        'Positive (%)': positive_list,
        'Scaler_value': scalar_value_list,
        'weighted_SV': weighted_SV,
        'weight': weight
    })

    # Return the new DataFrame
    return results_df


In [15]:
results_list = []
for df in cut_dfs:
    results = sentiment_score_comment(df)
    results['video_id'] = df['video_id']
    results_list.append(results)
results_list

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the mode

[                                               Comment Sentiment  \
 0     Respect to Dortmund fans must be sad losing him.   Neutral   
 1    A real talent  hope he doesn't turn into anoth...   Neutral   
 2                   wish him well in manchester united  Positive   
 3    One of the most talented players Bundesliga ha...  Positive   
 4    What a spell in the Bundesliga  Goodluck in th...  Positive   
 ..                                                 ...       ...   
 170         Has potential to be the next Ronaldo/messi  Positive   
 171                              Sancho <3  Thank you.  Positive   
 172  Welcome to Manchester United the theatre of dr...  Positive   
 173  I hope he became much stronger and healthier i...  Positive   
 174  All i can say is  good luck on your next desti...  Positive   
 
      Negative (%)  Neutral (%)  Positive (%)  Scaler_value  weighted_SV  \
 0           33.80        50.22         15.98         -0.18        -0.90   
 1           10.69

In [16]:
results_list[0]

Unnamed: 0,Comment,Sentiment,Negative (%),Neutral (%),Positive (%),Scaler_value,weighted_SV,weight,video_id
0,Respect to Dortmund fans must be sad losing him.,Neutral,33.80,50.22,15.98,-0.18,-0.90,5,--0bCF-iK2E
1,A real talent hope he doesn't turn into anoth...,Neutral,10.69,63.89,25.42,0.15,0.75,5,--0bCF-iK2E
2,wish him well in manchester united,Positive,0.31,11.78,87.91,0.88,4.40,5,--0bCF-iK2E
3,One of the most talented players Bundesliga ha...,Positive,0.22,1.68,98.10,0.98,4.90,5,--0bCF-iK2E
4,What a spell in the Bundesliga Goodluck in th...,Positive,0.18,3.56,96.26,0.96,4.80,5,--0bCF-iK2E
...,...,...,...,...,...,...,...,...,...
170,Has potential to be the next Ronaldo/messi,Positive,0.86,35.08,64.07,0.63,0.63,1,--0bCF-iK2E
171,Sancho <3 Thank you.,Positive,0.21,7.30,92.49,0.92,3.68,4,--0bCF-iK2E
172,Welcome to Manchester United the theatre of dr...,Positive,0.17,6.92,92.91,0.93,1.86,2,--0bCF-iK2E
173,I hope he became much stronger and healthier i...,Positive,0.14,2.31,97.56,0.97,1.94,2,--0bCF-iK2E


In [29]:
comment_score_df = pd.concat(results_list)

In [18]:
IDs_df = pd.DataFrame(BIGdf.value_counts('video_id').keys())

In [19]:
IDs_df['positivity_score'] = np.nan

In [20]:
IDs_df

Unnamed: 0,video_id,positivity_score
0,--0bCF-iK2E,
1,--DKkzWVh-E,
2,-024Swollbc,


In [21]:
for i, df in enumerate(cut_dfs):
    
    df['weight'] = df['likecount'].apply(making_weights)
    
    score_df = sentiment_score_comment(df)
    
    positivity_score = score_df['weighted_SV'].mean()
    
    IDs_df['positivity_score'][i] = positivity_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weight'] = df['likecount'].apply(making_weights)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IDs_df['positivity_score'][i] = positivity_score
A value is tr

In [22]:
results = sentiment_score_comment(df)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [30]:
def df_to_csv(results_df):
    IDs_df.to_csv('video_score.csv')
    comment_score_df.to_csv('comment_score.csv')
    
    

In [31]:
df_to_csv(IDs_df)

In [27]:
# video_id = IDs_df.iloc[0].video_id
# IDs_df.to_csv(f'video_score{video_id}.csv')

In [28]:
# IDs_df