In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df1= pd.read_csv("Desktop/emoint tweet/joy-ratings-0to1.train.txt", delimiter='\t', header=None)
df1.columns = ['Id', 'tweet', 'emotion', 'score']

In [3]:
print(df1)


        Id                                              tweet emotion  score
0    30000  Just got back from seeing @GaryDelaney in Burs...     joy  0.980
1    30001  Oh dear an evening of absolute hilarity I don'...     joy  0.958
2    30002  Been waiting all week for this game ❤️❤️❤️ #ch...     joy  0.940
3    30003  @gardiner_love : Thank you so much, Gloria! Yo...     joy  0.938
4    30004  I feel so blessed to work with the family that...     joy  0.938
..     ...                                                ...     ...    ...
818  30818  It's just the lack of company and liveliness o...     joy  0.058
819  30819             Quinn's short hair makes me sad. #glee     joy  0.040
820  30820  hate overthinking e v e r y t h i n g like i j...     joy  0.040
821  30821  People who cheer for sports teams completely o...     joy  0.020
822  30822  @DamnPatriot You're a POS for rejoicing in som...     joy  0.019

[823 rows x 4 columns]


In [4]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import emoji
import re

In [5]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df1['tweet'] = df1['tweet'].apply(preprocess_tweet)

In [6]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df1['tweet'] = df1['tweet'].apply(remove_mentions)


In [7]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df1['tweet'] = df1['tweet'].apply(clean_text)



In [8]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,30000,just got back from seeing in burslem amazing f...,joy,0.980
1,30001,oh dear an evening of absolute hilarity i dont...,joy,0.958
2,30002,been waiting all week for this game #cheer #fr...,joy,0.940
3,30003,thank you so much gloria youre so sweet and th...,joy,0.938
4,30004,i feel so blessed to work with the family that...,joy,0.938
...,...,...,...,...
818,30818,its just the lack of company and liveliness ou...,joy,0.058
819,30819,quinns short hair makes me sad #glee,joy,0.040
820,30820,hate overthinking e v e r y t h i n g like i j...,joy,0.040
821,30821,people who cheer for sports teams completely o...,joy,0.020


In [9]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text

In [10]:
df1['tweet'] = df1['tweet'].apply(convert_emoji)

In [11]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df1['tweet'] = df1['tweet'].apply(tokenize_tweets)

In [12]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,30000,"[just, got, back, from, seeing, in, burslem, a...",joy,0.980
1,30001,"[oh, dear, an, evening, of, absolute, hilarity...",joy,0.958
2,30002,"[been, waiting, all, week, for, this, game, #c...",joy,0.940
3,30003,"[thank, you, so, much, gloria, youre, so, swee...",joy,0.938
4,30004,"[i, feel, so, blessed, to, work, with, the, fa...",joy,0.938
...,...,...,...,...
818,30818,"[its, just, the, lack, of, company, and, livel...",joy,0.058
819,30819,"[quinns, short, hair, makes, me, sad, #glee]",joy,0.040
820,30820,"[hate, overthinking, e, v, e, r, y, t, h, i, n...",joy,0.040
821,30821,"[people, who, cheer, for, sports, teams, compl...",joy,0.020


In [13]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'this', 'all',
    'am', 'or', 'but', 'if', 'my', 'me', 'we', 'us', 'our', 'we', 'up', 'down', 'out', 'just', 'how', 'why',
    'when', 'where', 'here', 'there', 'about', 'more', 'most', 'some', 'any', 'few', 'many', 'much', 'not',
    'only', 'other', 'same', 'such', 'no', 'nor', 'too', 'very', 'can', 'cannot', 'could', 'should', 'would',
    'might', 'must', 'shall', 'will', 'isn', 'hasn', 'doesn', 'haven', 'didn', 'hadn', 'wasn', 'weren',
    'wouldn', 'shouldn', 'ain', 'aren', 'ma'
]

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

df1['tweet'] = df1['tweet'].apply(remove_stopwords)



In [14]:
df1


Unnamed: 0,Id,tweet,emotion,score
0,30000,"[got, back, seeing, burslem, amazing, face, st...",joy,0.980
1,30001,"[oh, dear, evening, absolute, hilarity, dont, ...",joy,0.958
2,30002,"[been, waiting, week, game, #cheer, #friday]",joy,0.940
3,30003,"[thank, gloria, youre, sweet, thoughtful, made...",joy,0.938
4,30004,"[feel, blessed, work, family, nanny, nothing, ...",joy,0.938
...,...,...,...,...
818,30818,"[lack, company, liveliness, makes, bored]",joy,0.058
819,30819,"[quinns, short, hair, makes, sad, #glee]",joy,0.040
820,30820,"[hate, overthinking, like, jus, wanna, happy, ...",joy,0.040
821,30821,"[people, who, cheer, sports, teams, completely...",joy,0.020


In [15]:
import torch
from transformers import BertModel, BertTokenizer

model_name='bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features

df1['features'] = df1['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
df1

Unnamed: 0,Id,tweet,emotion,score,features
0,30000,"[got, back, seeing, burslem, amazing, face, st...",joy,0.980,"[[-0.30311838, 0.07581233, 0.5529415, -0.48722..."
1,30001,"[oh, dear, evening, absolute, hilarity, dont, ...",joy,0.958,"[[0.07647736, 0.09434516, -0.023025593, -0.175..."
2,30002,"[been, waiting, week, game, #cheer, #friday]",joy,0.940,"[[-0.14617108, -0.14809571, -0.03178546, 0.225..."
3,30003,"[thank, gloria, youre, sweet, thoughtful, made...",joy,0.938,"[[-0.19459502, 0.07583713, -0.14486797, 0.0292..."
4,30004,"[feel, blessed, work, family, nanny, nothing, ...",joy,0.938,"[[-0.40701014, 0.22444889, 0.32654092, -0.4076..."
...,...,...,...,...,...
818,30818,"[lack, company, liveliness, makes, bored]",joy,0.058,"[[-0.31410608, 0.5868328, -0.071265765, -0.115..."
819,30819,"[quinns, short, hair, makes, sad, #glee]",joy,0.040,"[[-0.32033375, -0.24209923, 0.10508025, -0.109..."
820,30820,"[hate, overthinking, like, jus, wanna, happy, ...",joy,0.040,"[[-0.2011238, 0.43611714, -0.2035487, 0.024425..."
821,30821,"[people, who, cheer, sports, teams, completely...",joy,0.020,"[[-0.10141636, 0.20870517, -0.026847256, 0.103..."


In [17]:
df1.features[0].shape

(15, 768)

In [18]:

df1.head(10)

Unnamed: 0,Id,tweet,emotion,score,features
0,30000,"[got, back, seeing, burslem, amazing, face, st...",joy,0.98,"[[-0.30311838, 0.07581233, 0.5529415, -0.48722..."
1,30001,"[oh, dear, evening, absolute, hilarity, dont, ...",joy,0.958,"[[0.07647736, 0.09434516, -0.023025593, -0.175..."
2,30002,"[been, waiting, week, game, #cheer, #friday]",joy,0.94,"[[-0.14617108, -0.14809571, -0.03178546, 0.225..."
3,30003,"[thank, gloria, youre, sweet, thoughtful, made...",joy,0.938,"[[-0.19459502, 0.07583713, -0.14486797, 0.0292..."
4,30004,"[feel, blessed, work, family, nanny, nothing, ...",joy,0.938,"[[-0.40701014, 0.22444889, 0.32654092, -0.4076..."
5,30005,"[today, reached, subscribers, yt, #goodday, #t...",joy,0.926,"[[-0.05542418, -0.051494613, 0.16199583, -0.01..."
6,30006,"[good, morning, love, happy, first, day, fall,...",joy,0.924,"[[-0.14538582, -0.053934813, 0.12582867, -0.07..."
7,30007,"[#bridgetjonesbaby, best, thing, ive, seen, ag...",joy,0.922,"[[-0.34465936, 0.017047953, -0.1802155, -0.035..."
8,30008,"[got, back, seeing, burslem, amazing, face, st...",joy,0.92,"[[-0.085962094, 0.22028987, 0.35717702, -0.506..."
9,30009,"[thought, holidays, get, cheerful, then, met, ...",joy,0.917,"[[-0.06376603, 0.19012895, -0.005626023, 0.023..."


In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features = df1['features'].tolist()
padded_features = pad_sequences(features, padding='post')
padded_df = df1.copy()
padded_df['features'] = padded_features.tolist()

In [20]:
X = np.stack(padded_df['features'])
print('Input feature shape:', X.shape)

Input feature shape: (823, 41, 768)


In [21]:
y = np.array(padded_df['score']) 
print("Input shape:", X.shape)
print("Output shape:", y.shape)

Input shape: (823, 41, 768)
Output shape: (823,)


In [22]:
X = np.reshape(X, (823, 41 * 768)) 
y = np.reshape(y, (823,))

In [23]:
X.shape

(823, 31488)

In [24]:
Y=df1['score'].copy()

     

In [25]:
X.shape

(823, 31488)

In [26]:
max_sequence_length = len(max(X, key=len))


In [27]:
import tensorflow as tf

In [28]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(Dense(8096, input_shape=(31488,), activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(4048, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(2024, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(1012, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(506, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='linear'))

early_stopping = EarlyStopping(monitor='mean_squared_error', patience=1, mode='min')
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8096)              254934944 
                                                                 
 dropout (Dropout)           (None, 8096)              0         
                                                                 
 dense_1 (Dense)             (None, 4048)              32776656  
                                                                 
 dropout_1 (Dropout)         (None, 4048)              0         
                                                                 
 dense_2 (Dense)             (None, 2024)              8195176   
                                                                 
 dropout_2 (Dropout)         (None, 2024)              0         
                                                                 
 dense_3 (Dense)             (None, 1012)              2

In [29]:
X.shape

(823, 31488)

In [30]:
df1['tweet'] = df1['tweet'].apply(lambda tokens: ' '.join(tokens))
array = df1['tweet'].values
tensor = tf.convert_to_tensor(array)

In [31]:
X.shape

(823, 31488)

In [32]:
y.shape

(823,)

In [33]:
history = model.fit(X, Y, batch_size=27, epochs=10, shuffle=True, verbose=1, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
model.save('emointjoy.h5','/Home')

In [35]:
from keras.models import load_model

loaded_model = load_model('emointjoy.h5')

In [36]:
gold= pd.read_csv("Desktop/emoint tweet/joy-ratings-0to1.dev.gold.txt", delimiter='\t', header=None)
gold.columns = ['Id', 'tweet', 'emotion', 'score']

In [37]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,30823,"@theclobra lol I thought maybe, couldn't decid...",joy,0.312
1,30824,Nawaz Sharif is getting more funnier than @kap...,joy,0.700
2,30825,Nawaz Sharif is getting more funnier than @kap...,joy,0.580
3,30826,@tomderivan73 😁...I'll just people watch and e...,joy,0.438
4,30827,I love my family so much #lucky #grateful #sma...,joy,0.936
...,...,...,...,...
74,30897,It feels good to get outside for a minute and ...,joy,0.580
75,30898,@r0Ils ppl get triggered over u smiling they'r...,joy,0.170
76,30899,@GigaFag @pipertownsend_ snapchat new would be...,joy,0.396
77,30900,@GigaFag @pipertownsend_ snapchat new would be...,joy,0.156


In [38]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
gold['tweet'] = gold['tweet'].apply(preprocess_tweet)

In [39]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,30823,"@theclobra lol i thought maybe, couldn't decid...",joy,0.312
1,30824,nawaz sharif is getting more funnier than @kap...,joy,0.700
2,30825,nawaz sharif is getting more funnier than @kap...,joy,0.580
3,30826,@tomderivan73 😁...i'll just people watch and e...,joy,0.438
4,30827,i love my family so much #lucky #grateful #sma...,joy,0.936
...,...,...,...,...
74,30897,it feels good to get outside for a minute and ...,joy,0.580
75,30898,@r0ils ppl get triggered over u smiling they'r...,joy,0.170
76,30899,@gigafag @pipertownsend_ snapchat new would be...,joy,0.396
77,30900,@gigafag @pipertownsend_ snapchat new would be...,joy,0.156


In [40]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

gold['tweet'] = gold['tweet'].apply(remove_mentions)
gold

Unnamed: 0,Id,tweet,emotion,score
0,30823,"lol i thought maybe, couldn't decide if there...",joy,0.312
1,30824,nawaz sharif is getting more funnier than day...,joy,0.700
2,30825,nawaz sharif is getting more funnier than day...,joy,0.580
3,30826,😁...i'll just people watch and enjoy a rare s...,joy,0.438
4,30827,i love my family so much #lucky #grateful #sma...,joy,0.936
...,...,...,...,...
74,30897,it feels good to get outside for a minute and ...,joy,0.580
75,30898,ppl get triggered over u smiling they're irre...,joy,0.170
76,30899,snapchat new would beg to differ #optimism,joy,0.396
77,30900,snapchat new would beg to differ,joy,0.156


In [41]:
def clean_gold(gold):
    gold  = re.sub(r'[^\w\s#@]', '', gold ) 
    gold  = re.sub(r'\d+', '', gold)  
    gold  = re.sub(r'\s+', ' ', gold).strip()
    return gold

gold['tweet'] = gold['tweet'].apply(clean_gold)


In [42]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,30823,lol i thought maybe couldnt decide if there wa...,joy,0.312
1,30824,nawaz sharif is getting more funnier than day ...,joy,0.700
2,30825,nawaz sharif is getting more funnier than day ...,joy,0.580
3,30826,ill just people watch and enjoy a rare show of...,joy,0.438
4,30827,i love my family so much #lucky #grateful #sma...,joy,0.936
...,...,...,...,...
74,30897,it feels good to get outside for a minute and ...,joy,0.580
75,30898,ppl get triggered over u smiling theyre irrele...,joy,0.170
76,30899,snapchat new would beg to differ #optimism,joy,0.396
77,30900,snapchat new would beg to differ,joy,0.156


In [43]:
def convert_emoji(tweet):
    gold = emoji.demojize(tweet)
    return gold

gold['tweet'] = gold['tweet'].apply(convert_emoji)

In [44]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

gold['tweet'] = gold['tweet'].apply(tokenize_tweets)

In [45]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,30823,"[lol, i, thought, maybe, couldnt, decide, if, ...",joy,0.312
1,30824,"[nawaz, sharif, is, getting, more, funnier, th...",joy,0.700
2,30825,"[nawaz, sharif, is, getting, more, funnier, th...",joy,0.580
3,30826,"[ill, just, people, watch, and, enjoy, a, rare...",joy,0.438
4,30827,"[i, love, my, family, so, much, #lucky, #grate...",joy,0.936
...,...,...,...,...
74,30897,"[it, feels, good, to, get, outside, for, a, mi...",joy,0.580
75,30898,"[ppl, get, triggered, over, u, smiling, theyre...",joy,0.170
76,30899,"[snapchat, new, would, beg, to, differ, #optim...",joy,0.396
77,30900,"[snapchat, new, would, beg, to, differ]",joy,0.156


In [46]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'this', 'all',
    'am', 'or', 'but', 'if', 'my', 'me', 'we', 'us', 'our', 'we', 'up', 'down', 'out', 'just', 'how', 'why',
    'when', 'where', 'here', 'there', 'about', 'more', 'most', 'some', 'any', 'few', 'many', 'much', 'not',
    'only', 'other', 'same', 'such', 'no', 'nor', 'too', 'very', 'can', 'cannot', 'could', 'should', 'would',
    'might', 'must', 'shall', 'will', 'isn', 'hasn', 'doesn', 'haven', 'didn', 'hadn', 'wasn', 'weren',
    'wouldn', 'shouldn', 'ain', 'aren', 'ma'
]


def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

gold['tweet'] = gold['tweet'].apply(remove_stopwords)



In [47]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,30823,"[lol, thought, maybe, couldnt, decide, levity]",joy,0.312
1,30824,"[nawaz, sharif, getting, funnier, than, day, d...",joy,0.700
2,30825,"[nawaz, sharif, getting, funnier, than, day, d...",joy,0.580
3,30826,"[ill, people, watch, enjoy, rare, show, optimism]",joy,0.438
4,30827,"[love, family, #lucky, #grateful, #smartassfam...",joy,0.936
...,...,...,...,...
74,30897,"[feels, good, get, outside, minute, get, fresh...",joy,0.580
75,30898,"[ppl, get, triggered, over, smiling, theyre, i...",joy,0.170
76,30899,"[snapchat, new, beg, differ, #optimism]",joy,0.396
77,30900,"[snapchat, new, beg, differ]",joy,0.156


In [48]:
import torch
from transformers import BertModel, BertTokenizer


model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    return features


gold['features'] = gold['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:
gold["features"][0].shape

(11, 768)

In [50]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
features =gold['features'].tolist()

padded_features = pad_sequences(features, padding='post')

padded_df = gold.copy()
padded_df['features'] = padded_features.tolist()

In [51]:
X_gold= np.stack(padded_df['features'])
print('Input feature shape:', X_gold.shape)

Input feature shape: (79, 31, 768)


In [52]:

from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = 41

padded_gold_data = pad_sequences(X_gold, maxlen=max_sequence_length, padding='post', truncating='post')


In [53]:
X = np.reshape(padded_gold_data, (79, 41 * 768))  


In [54]:
gold_features = np.array(X)

In [55]:
print(gold_features.shape)

(79, 31488)


In [56]:

predictions = loaded_model.predict(gold_features)



In [57]:
print(predictions)

[[0.4685372 ]
 [0.61145604]
 [0.5191612 ]
 [0.4675858 ]
 [0.6709027 ]
 [0.7274162 ]
 [0.43736863]
 [0.37775514]
 [0.39909938]
 [0.5066243 ]
 [0.37499645]
 [0.43481347]
 [0.48885235]
 [0.62734646]
 [0.59113586]
 [0.4598058 ]
 [0.67339915]
 [0.498628  ]
 [0.5715491 ]
 [0.6636152 ]
 [0.4299132 ]
 [0.538083  ]
 [0.56842226]
 [0.40880743]
 [0.38107437]
 [0.514882  ]
 [0.52154917]
 [0.426864  ]
 [0.52707624]
 [0.5514012 ]
 [0.6609685 ]
 [0.5425085 ]
 [0.79018694]
 [0.7825342 ]
 [0.5061928 ]
 [0.6223644 ]
 [0.39851847]
 [0.45391616]
 [0.38856572]
 [0.61677384]
 [0.5365201 ]
 [0.53053457]
 [0.60949296]
 [0.8791053 ]
 [0.60204005]
 [0.6763834 ]
 [0.50036764]
 [0.53793126]
 [0.51924235]
 [0.4454118 ]
 [0.574791  ]
 [0.45432708]
 [0.35014048]
 [0.4854249 ]
 [0.3843422 ]
 [0.44997802]
 [0.67829484]
 [0.4518516 ]
 [0.47378403]
 [0.54662806]
 [0.52348626]
 [0.38146752]
 [0.42971396]
 [0.47046363]
 [0.53409773]
 [0.48533362]
 [0.47526255]
 [0.45969468]
 [0.47071177]
 [0.3800769 ]
 [0.52397335]
 [0.66

In [58]:
gold['prediction']=pd.DataFrame(predictions)

In [59]:
gold

Unnamed: 0,Id,tweet,emotion,score,features,prediction
0,30823,"[lol, thought, maybe, couldnt, decide, levity]",joy,0.312,"[[-0.26824242, 0.33785543, -0.06891726, 0.1938...",0.468537
1,30824,"[nawaz, sharif, getting, funnier, than, day, d...",joy,0.700,"[[-0.78701603, -0.18422477, 0.050351318, -0.26...",0.611456
2,30825,"[nawaz, sharif, getting, funnier, than, day, d...",joy,0.580,"[[-0.8860845, -0.23212545, 0.053397615, -0.089...",0.519161
3,30826,"[ill, people, watch, enjoy, rare, show, optimism]",joy,0.438,"[[0.06172238, 0.24399737, 0.23058358, 0.041538...",0.467586
4,30827,"[love, family, #lucky, #grateful, #smartassfam...",joy,0.936,"[[-0.050798904, 0.1252638, 0.031203147, 0.0878...",0.670903
...,...,...,...,...,...,...
74,30897,"[feels, good, get, outside, minute, get, fresh...",joy,0.580,"[[0.23290265, 0.111647464, 0.13098754, -0.2015...",0.487767
75,30898,"[ppl, get, triggered, over, smiling, theyre, i...",joy,0.170,"[[-0.38961452, 0.20775628, -0.038733605, 0.066...",0.422194
76,30899,"[snapchat, new, beg, differ, #optimism]",joy,0.396,"[[-0.33827773, -0.048740894, 0.14746279, 0.104...",0.445762
77,30900,"[snapchat, new, beg, differ]",joy,0.156,"[[-0.51852673, -0.14575958, -0.2489785, -0.145...",0.370084


In [60]:
from sklearn.metrics import mean_squared_error

In [61]:
mse = mean_squared_error(gold['score'], gold['prediction'])
print("Mean square Error:", mse)


Mean square Error: 0.03538888310600196


In [62]:
test= pd.read_csv("Desktop/emoint tweet/joy-ratings-0to1.test.target.txt", delimiter='\t', header=None)
test.columns = ['Id', 'tweet', 'emotion', 'score']

In [63]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
test['tweet'] = test['tweet'].apply(preprocess_tweet)

In [64]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

test['tweet'] = test['tweet'].apply(remove_mentions)
test

Unnamed: 0,Id,tweet,emotion,score
0,30902,you must be knowing #blithe means (adj.) happ...,joy,NONE
1,30903,old saying 'a #smile shared is one gained for ...,joy,NONE
2,30904,bridget jones' baby was bloody hilarious 😅 #br...,joy,NONE
3,30905,sparkling water makes your life sparkly,joy,NONE
4,30906,i'm tired of everybody telling me to chill out...,joy,NONE
...,...,...,...,...
709,31611,with a very tired body and mind and sparkling ...,joy,NONE
710,31612,i refuse to be a chirp chirp girl,joy,NONE
711,31613,it was very hard to stifle my laughter after i...,joy,NONE
712,31614,"while i was walking, a little boy in a red shi...",joy,NONE


In [65]:
def clean_test(test):
    test  = re.sub(r'[^\w\s#@]', '', test ) 
    test  = re.sub(r'\d+', '', test)  
    test  = re.sub(r'\s+', ' ', test).strip()
    return test

test['tweet'] = test['tweet'].apply(clean_test)


In [66]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

test['tweet'] = test['tweet'].apply(tokenize_tweets)

In [67]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'this', 'all',
    'am', 'or', 'but', 'if', 'my', 'me', 'we', 'us', 'our', 'we', 'up', 'down', 'out', 'just', 'how', 'why',
    'when', 'where', 'here', 'there', 'about', 'more', 'most', 'some', 'any', 'few', 'many', 'much', 'not',
    'only', 'other', 'same', 'such', 'no', 'nor', 'too', 'very', 'can', 'cannot', 'could', 'should', 'would',
    'might', 'must', 'shall', 'will', 'isn', 'hasn', 'doesn', 'haven', 'didn', 'hadn', 'wasn', 'weren',
    'wouldn', 'shouldn', 'ain', 'aren', 'ma'
]

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens


test['tweet'] = test['tweet'].apply(remove_stopwords)



In [68]:
import torch
from transformers import BertModel, BertTokenizer


model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    return features


test['features'] = test['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
test["features"][0].shape

(12, 768)

In [70]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
features =test['features'].tolist()

padded_features = pad_sequences(features, padding='post')

padded_df = test.copy()
padded_df['features'] = padded_features.tolist()

In [71]:
X_test= np.stack(padded_df['features'])
print('Input feature shape:', X_test.shape)

Input feature shape: (714, 50, 768)


In [72]:

from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = 41
truncated_test_data = pad_sequences(X_test, maxlen=max_sequence_length, padding='post', truncating='post')

print(truncated_test_data.shape)

(714, 41, 768)


In [73]:
X = np.reshape(truncated_test_data, (714, 41 * 768)) 

In [74]:
test_features = np.array(X)

In [75]:
print(test_features.shape)

(714, 31488)


In [76]:

predictions = loaded_model.predict(test_features)



In [77]:
print(predictions)

[[0.5982035 ]
 [0.49336487]
 [0.57207036]
 [0.43248385]
 [0.5398857 ]
 [0.42533195]
 [0.44496948]
 [0.6318033 ]
 [0.7397249 ]
 [0.51311886]
 [0.42802098]
 [0.54241884]
 [0.61189425]
 [0.45689744]
 [0.5412038 ]
 [0.53461796]
 [0.52317566]
 [0.6612517 ]
 [0.60472864]
 [0.5335077 ]
 [0.4317347 ]
 [0.53358185]
 [0.49101242]
 [0.67790484]
 [0.53069335]
 [0.4308138 ]
 [0.4315634 ]
 [0.41566002]
 [0.4543231 ]
 [0.39435515]
 [0.489212  ]
 [0.4279508 ]
 [0.57464474]
 [0.4917474 ]
 [0.47110412]
 [0.4527013 ]
 [0.4546577 ]
 [0.5227037 ]
 [0.5297692 ]
 [0.5480236 ]
 [0.61406857]
 [0.51795095]
 [0.44723684]
 [0.50561583]
 [0.50421673]
 [0.5559261 ]
 [0.54466367]
 [0.37568852]
 [0.4959059 ]
 [0.5337396 ]
 [0.5230463 ]
 [0.68582296]
 [0.68221635]
 [0.486689  ]
 [0.59920186]
 [0.45133424]
 [0.652332  ]
 [0.4078581 ]
 [0.5894756 ]
 [0.510418  ]
 [0.42955983]
 [0.83454674]
 [0.66864276]
 [0.45680818]
 [0.43869364]
 [0.48748443]
 [0.36442584]
 [0.55373204]
 [0.54096365]
 [0.473415  ]
 [0.48957694]
 [0.38

In [78]:
test['score']=pd.DataFrame(predictions)

In [79]:
test

Unnamed: 0,Id,tweet,emotion,score,features
0,30902,"[knowing, #blithe, means, adj, happy, cheerful]",joy,0.598203,"[[0.06667158, 0.123148575, 0.20162007, -0.0633..."
1,30903,"[old, saying, #smile, shared, one, gained, ano...",joy,0.493365,"[[0.05411899, 0.09224143, 0.1278078, 0.0228617..."
2,30904,"[bridget, jones, baby, bloody, hilarious, #bri...",joy,0.572070,"[[-0.2968768, 0.16157933, 0.04914654, -0.04029..."
3,30905,"[sparkling, water, makes, life, sparkly]",joy,0.432484,"[[-0.025124839, 0.22698666, 0.04356417, 0.1450..."
4,30906,"[im, tired, everybody, telling, chill, everyth...",joy,0.539886,"[[-0.05045792, 0.482266, 0.18940437, -0.572387..."
...,...,...,...,...,...
709,31611,"[tired, body, mind, sparkling, teeth, say, fol...",joy,0.534691,"[[-0.0030458397, 0.1917828, 0.29441363, -0.136..."
710,31612,"[refuse, chirp, chirp, girl]",joy,0.479709,"[[-0.3924269, -0.31344494, -0.31823176, 0.0956..."
711,31613,"[hard, stifle, laughter, after, overheard, com...",joy,0.366179,"[[-0.042962182, 0.29848492, 0.035863742, 0.020..."
712,31614,"[while, walking, little, boy, red, shirt, year...",joy,0.488568,"[[-0.4283314, 0.22758897, -0.2881539, -0.34742..."


In [80]:
test=test.drop('features',axis=1)

In [81]:
test




Unnamed: 0,Id,tweet,emotion,score
0,30902,"[knowing, #blithe, means, adj, happy, cheerful]",joy,0.598203
1,30903,"[old, saying, #smile, shared, one, gained, ano...",joy,0.493365
2,30904,"[bridget, jones, baby, bloody, hilarious, #bri...",joy,0.572070
3,30905,"[sparkling, water, makes, life, sparkly]",joy,0.432484
4,30906,"[im, tired, everybody, telling, chill, everyth...",joy,0.539886
...,...,...,...,...
709,31611,"[tired, body, mind, sparkling, teeth, say, fol...",joy,0.534691
710,31612,"[refuse, chirp, chirp, girl]",joy,0.479709
711,31613,"[hard, stifle, laughter, after, overheard, com...",joy,0.366179
712,31614,"[while, walking, little, boy, red, shirt, year...",joy,0.488568
