In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df1= pd.read_csv("Desktop/emoint tweet/fear-ratings-0to1.train.txt", delimiter='\t', header=None)
df1.columns = ['Id', 'tweet', 'emotion', 'score']

In [3]:
print(df1)


         Id                                              tweet emotion  score
0     20000  I feel like I am drowning. #depression #anxiet...    fear  0.979
1     20001  I get so nervous even thinking about talking t...    fear  0.979
2     20002                     I lost my blinders .... #panic    fear  0.975
3     20003  I feel like I am drowning. #depression  #falur...    fear  0.938
4     20004  This is the scariest American Horror Story out...    fear  0.938
...     ...                                                ...     ...    ...
1142  21142     Pull over #tonight and make your car #shake 😋💦    fear  0.104
1143  21143  @Melanie_Pierce @HunterHayes awe ain't he a sw...    fear  0.083
1144  21144         @FraserKeegan just had a steak pie supper     fear  0.083
1145  21145      @annalisewrobel_ awe thank you so much love 💕    fear  0.062
1146  21146                             Omg he kissed her🙈  #w    fear  0.062

[1147 rows x 4 columns]


In [4]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import emoji
import re

In [5]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df1['tweet'] = df1['tweet'].apply(preprocess_tweet)

In [6]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df1['tweet'] = df1['tweet'].apply(remove_mentions)


In [7]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df1['tweet'] = df1['tweet'].apply(clean_text)



In [8]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,20000,i feel like i am drowning #depression #anxiety...,fear,0.979
1,20001,i get so nervous even thinking about talking t...,fear,0.979
2,20002,i lost my blinders #panic,fear,0.975
3,20003,i feel like i am drowning #depression #falure ...,fear,0.938
4,20004,this is the scariest american horror story out...,fear,0.938
...,...,...,...,...
1142,21142,pull over #tonight and make your car #shake,fear,0.104
1143,21143,awe aint he a sweetheart hes adorable,fear,0.083
1144,21144,just had a steak pie supper,fear,0.083
1145,21145,awe thank you so much love,fear,0.062


In [9]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text

In [10]:
df1['tweet'] = df1['tweet'].apply(convert_emoji)

In [11]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df1['tweet'] = df1['tweet'].apply(tokenize_tweets)

In [12]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,20000,"[i, feel, like, i, am, drowning, #depression, ...",fear,0.979
1,20001,"[i, get, so, nervous, even, thinking, about, t...",fear,0.979
2,20002,"[i, lost, my, blinders, #panic]",fear,0.975
3,20003,"[i, feel, like, i, am, drowning, #depression, ...",fear,0.938
4,20004,"[this, is, the, scariest, american, horror, st...",fear,0.938
...,...,...,...,...
1142,21142,"[pull, over, #tonight, and, make, your, car, #...",fear,0.104
1143,21143,"[awe, aint, he, a, sweetheart, hes, adorable]",fear,0.083
1144,21144,"[just, had, a, steak, pie, supper]",fear,0.083
1145,21145,"[awe, thank, you, so, much, love]",fear,0.062


In [13]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'this', 'all',
    'am', 'or', 'but', 'if', 'my', 'me', 'we', 'us', 'our', 'we', 'up', 'down', 'out', 'just', 'how', 'why',
    'when', 'where', 'here', 'there', 'about', 'more', 'most', 'some', 'any', 'few', 'many', 'much', 'not',
    'only', 'other', 'same', 'such', 'no', 'nor', 'too', 'very', 'can', 'cannot', 'could', 'should', 'would',
    'might', 'must', 'shall', 'will', 'isn', 'hasn', 'doesn', 'haven', 'didn', 'hadn', 'wasn', 'weren',
    'wouldn', 'shouldn', 'ain', 'aren', 'ma'
]

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

df1['tweet'] = df1['tweet'].apply(remove_stopwords)



In [14]:
df1


Unnamed: 0,Id,tweet,emotion,score
0,20000,"[feel, like, drowning, #depression, #anxiety, ...",fear,0.979
1,20001,"[get, nervous, even, thinking, talking, wanna,...",fear,0.979
2,20002,"[lost, blinders, #panic]",fear,0.975
3,20003,"[feel, like, drowning, #depression, #falure, #...",fear,0.938
4,20004,"[scariest, american, horror, story, them, im, ...",fear,0.938
...,...,...,...,...
1142,21142,"[pull, over, #tonight, make, car, #shake]",fear,0.104
1143,21143,"[awe, aint, sweetheart, hes, adorable]",fear,0.083
1144,21144,"[had, steak, pie, supper]",fear,0.083
1145,21145,"[awe, thank, love]",fear,0.062


In [15]:
import torch
from transformers import BertModel, BertTokenizer

model_name='bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features

df1['features'] = df1['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
df1

Unnamed: 0,Id,tweet,emotion,score,features
0,20000,"[feel, like, drowning, #depression, #anxiety, ...",fear,0.979,"[[0.037809342, 0.254312, -0.06298909, -0.16685..."
1,20001,"[get, nervous, even, thinking, talking, wanna,...",fear,0.979,"[[-0.11003713, 0.12191104, 0.17953236, -0.1630..."
2,20002,"[lost, blinders, #panic]",fear,0.975,"[[-0.168674, 0.12842081, -0.0014389177, -0.082..."
3,20003,"[feel, like, drowning, #depression, #falure, #...",fear,0.938,"[[0.06456666, 0.26413164, -0.106887944, -0.123..."
4,20004,"[scariest, american, horror, story, them, im, ...",fear,0.938,"[[-0.21259005, 0.08201837, 0.38263053, 0.04178..."
...,...,...,...,...,...
1142,21142,"[pull, over, #tonight, make, car, #shake]",fear,0.104,"[[-0.1298164, 0.16981113, 0.039434116, 0.13802..."
1143,21143,"[awe, aint, sweetheart, hes, adorable]",fear,0.083,"[[-0.15620765, 0.42569926, -0.16897485, 0.0595..."
1144,21144,"[had, steak, pie, supper]",fear,0.083,"[[-0.33264166, -0.13432035, -0.14950287, 0.282..."
1145,21145,"[awe, thank, love]",fear,0.062,"[[-0.10937966, 0.33537936, 0.05516939, -0.0624..."


In [17]:
df1.features[0].shape

(15, 768)

In [18]:

df1.head(10)

Unnamed: 0,Id,tweet,emotion,score,features
0,20000,"[feel, like, drowning, #depression, #anxiety, ...",fear,0.979,"[[0.037809342, 0.254312, -0.06298909, -0.16685..."
1,20001,"[get, nervous, even, thinking, talking, wanna,...",fear,0.979,"[[-0.11003713, 0.12191104, 0.17953236, -0.1630..."
2,20002,"[lost, blinders, #panic]",fear,0.975,"[[-0.168674, 0.12842081, -0.0014389177, -0.082..."
3,20003,"[feel, like, drowning, #depression, #falure, #...",fear,0.938,"[[0.06456666, 0.26413164, -0.106887944, -0.123..."
4,20004,"[scariest, american, horror, story, them, im, ...",fear,0.938,"[[-0.21259005, 0.08201837, 0.38263053, 0.04178..."
5,20005,"[nearly, started, crying, having, full, panic,...",fear,0.938,"[[-0.6936238, -0.16869353, 0.12774356, -0.1491..."
6,20006,"[have, finally, tell, therapist, sexuality, la...",fear,0.938,"[[-0.2576109, 0.28046265, -0.046328988, -0.192..."
7,20007,"[dont, think, ive, ever, moved, fast, panic, l...",fear,0.938,"[[-0.27630442, 0.398449, -0.18360381, -0.18951..."
8,20008,"[bus, car, crash, im, still, shaking, bit, wee...",fear,0.938,"[[-0.30670476, 0.43248904, -0.079600625, -0.24..."
9,20009,"[bus, car, crash, im, still, shaking, bit, wee...",fear,0.92,"[[-0.23693994, 0.47858533, -0.1310488, -0.0193..."


In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features = df1['features'].tolist()
padded_features = pad_sequences(features, padding='post')
padded_df = df1.copy()
padded_df['features'] = padded_features.tolist()

In [20]:
X = np.stack(padded_df['features'])
print('Input feature shape:', X.shape)

Input feature shape: (1147, 41, 768)


In [21]:
y = np.array(padded_df['score']) 
print("Input shape:", X.shape)
print("Output shape:", y.shape)

Input shape: (1147, 41, 768)
Output shape: (1147,)


In [23]:
X = np.reshape(X, (1147, 41 * 768)) 
y = np.reshape(y, (1147,))

In [24]:
X.shape

(1147, 31488)

In [25]:
Y=df1['score'].copy()

     

In [26]:
X.shape

(1147, 31488)

In [27]:
max_sequence_length = len(max(X, key=len))


In [28]:
import tensorflow as tf

In [29]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(Dense(8096, input_shape=(31488,), activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(4048, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(2024, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(1012, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(506, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='linear'))

early_stopping = EarlyStopping(monitor='mean_squared_error', patience=1, mode='min')
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8096)              254934944 
                                                                 
 dropout (Dropout)           (None, 8096)              0         
                                                                 
 dense_1 (Dense)             (None, 4048)              32776656  
                                                                 
 dropout_1 (Dropout)         (None, 4048)              0         
                                                                 
 dense_2 (Dense)             (None, 2024)              8195176   
                                                                 
 dropout_2 (Dropout)         (None, 2024)              0         
                                                                 
 dense_3 (Dense)             (None, 1012)              2

In [30]:
X.shape

(1147, 31488)

In [31]:
df1['tweet'] = df1['tweet'].apply(lambda tokens: ' '.join(tokens))
array = df1['tweet'].values
tensor = tf.convert_to_tensor(array)

In [32]:
X.shape

(1147, 31488)

In [33]:
y.shape

(1147,)

In [34]:
history = model.fit(X, Y, batch_size=27, epochs=10, shuffle=True, verbose=1, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [35]:
model.save('emointjoy.h5','/Home')

In [36]:
from keras.models import load_model

loaded_model = load_model('emointjoy.h5')

In [37]:
gold= pd.read_csv("Desktop/emoint tweet/fear-ratings-0to1.dev.gold.txt", delimiter='\t', header=None)
gold.columns = ['Id', 'tweet', 'emotion', 'score']

In [38]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,21147,I know this is going to be one of those nights...,fear,0.771
1,21148,This is #horrible: Lewis Dunk has begun networ...,fear,0.479
2,21149,"@JeffersonLake speaking of ex cobblers, saw Ri...",fear,0.417
3,21150,@1johndes ball watching &amp; Rojo'd header wa...,fear,0.475
4,21151,"Really.....#Jumanji 2....w/ The Rock, Jack Bla...",fear,0.542
...,...,...,...,...
105,21252,Staff on @ryainair FR1005. Asked for info and ...,fear,0.312
106,21253,Staff on @ryainair FR1005. Asked for info and ...,fear,0.271
107,21254,An adviser to the #European #Union’s top #cour...,fear,0.500
108,21255,So about 18mths ago i signed up to @Lumo_Energ...,fear,0.479


In [39]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
gold['tweet'] = gold['tweet'].apply(preprocess_tweet)

In [40]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,21147,i know this is going to be one of those nights...,fear,0.771
1,21148,this is #horrible: lewis dunk has begun networ...,fear,0.479
2,21149,"@jeffersonlake speaking of ex cobblers, saw ri...",fear,0.417
3,21150,@1johndes ball watching &amp; rojo'd header wa...,fear,0.475
4,21151,"really.....#jumanji 2....w/ the rock, jack bla...",fear,0.542
...,...,...,...,...
105,21252,staff on @ryainair fr1005. asked for info and ...,fear,0.312
106,21253,staff on @ryainair fr1005. asked for info and ...,fear,0.271
107,21254,an adviser to the #european #union’s top #cour...,fear,0.500
108,21255,so about 18mths ago i signed up to @lumo_energ...,fear,0.479


In [41]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

gold['tweet'] = gold['tweet'].apply(remove_mentions)
gold

Unnamed: 0,Id,tweet,emotion,score
0,21147,i know this is going to be one of those nights...,fear,0.771
1,21148,this is #horrible: lewis dunk has begun networ...,fear,0.479
2,21149,"speaking of ex cobblers, saw ricky holmes at ...",fear,0.417
3,21150,ball watching &amp; rojo'd header was equally...,fear,0.475
4,21151,"really.....#jumanji 2....w/ the rock, jack bla...",fear,0.542
...,...,...,...,...
105,21252,staff on fr1005. asked for info and told to l...,fear,0.312
106,21253,staff on fr1005. asked for info and told to l...,fear,0.271
107,21254,an adviser to the #european #union’s top #cour...,fear,0.500
108,21255,so about 18mths ago i signed up to for their ...,fear,0.479


In [42]:
def clean_gold(gold):
    gold  = re.sub(r'[^\w\s#@]', '', gold ) 
    gold  = re.sub(r'\d+', '', gold)  
    gold  = re.sub(r'\s+', ' ', gold).strip()
    return gold

gold['tweet'] = gold['tweet'].apply(clean_gold)


In [43]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,21147,i know this is going to be one of those nights...,fear,0.771
1,21148,this is #horrible lewis dunk has begun network...,fear,0.479
2,21149,speaking of ex cobblers saw ricky holmes at ch...,fear,0.417
3,21150,ball watching amp rojod header was equally dre...,fear,0.475
4,21151,really#jumanji w the rock jack black and kevin...,fear,0.542
...,...,...,...,...
105,21252,staff on fr asked for info and told to look on...,fear,0.312
106,21253,staff on fr asked for info and told to look on...,fear,0.271
107,21254,an adviser to the #european #unions top #court...,fear,0.500
108,21255,so about mths ago i signed up to for their vel...,fear,0.479


In [44]:
def convert_emoji(tweet):
    gold = emoji.demojize(tweet)
    return gold

gold['tweet'] = gold['tweet'].apply(convert_emoji)

In [45]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

gold['tweet'] = gold['tweet'].apply(tokenize_tweets)

In [46]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,21147,"[i, know, this, is, going, to, be, one, of, th...",fear,0.771
1,21148,"[this, is, #horrible, lewis, dunk, has, begun,...",fear,0.479
2,21149,"[speaking, of, ex, cobblers, saw, ricky, holme...",fear,0.417
3,21150,"[ball, watching, amp, rojod, header, was, equa...",fear,0.475
4,21151,"[really, #jumanji, w, the, rock, jack, black, ...",fear,0.542
...,...,...,...,...
105,21252,"[staff, on, fr, asked, for, info, and, told, t...",fear,0.312
106,21253,"[staff, on, fr, asked, for, info, and, told, t...",fear,0.271
107,21254,"[an, adviser, to, the, #european, #unions, top...",fear,0.500
108,21255,"[so, about, mths, ago, i, signed, up, to, for,...",fear,0.479


In [47]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'this', 'all',
    'am', 'or', 'but', 'if', 'my', 'me', 'we', 'us', 'our', 'we', 'up', 'down', 'out', 'just', 'how', 'why',
    'when', 'where', 'here', 'there', 'about', 'more', 'most', 'some', 'any', 'few', 'many', 'much', 'not',
    'only', 'other', 'same', 'such', 'no', 'nor', 'too', 'very', 'can', 'cannot', 'could', 'should', 'would',
    'might', 'must', 'shall', 'will', 'isn', 'hasn', 'doesn', 'haven', 'didn', 'hadn', 'wasn', 'weren',
    'wouldn', 'shouldn', 'ain', 'aren', 'ma'
]


def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

gold['tweet'] = gold['tweet'].apply(remove_stopwords)



In [48]:
gold

Unnamed: 0,Id,tweet,emotion,score
0,21147,"[know, going, one, those, nights, takes, act, ...",fear,0.771
1,21148,"[#horrible, lewis, dunk, begun, networking, ne...",fear,0.479
2,21149,"[speaking, ex, cobblers, saw, ricky, holmes, c...",fear,0.417
3,21150,"[ball, watching, amp, rojod, header, equally, ...",fear,0.475
4,21151,"[really, #jumanji, rock, jack, black, kevin, h...",fear,0.542
...,...,...,...,...
105,21252,"[staff, fr, asked, info, told, look, online, g...",fear,0.312
106,21253,"[staff, fr, asked, info, told, look, online, g...",fear,0.271
107,21254,"[adviser, #european, #unions, top, #court, sai...",fear,0.500
108,21255,"[mths, ago, signed, their, velocity, ff, deal,...",fear,0.479


In [49]:
import torch
from transformers import BertModel, BertTokenizer


model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    return features


gold['features'] = gold['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
gold["features"][0].shape

(12, 768)

In [51]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
features =gold['features'].tolist()

padded_features = pad_sequences(features, padding='post')

padded_df = gold.copy()
padded_df['features'] = padded_features.tolist()

In [52]:
X_gold= np.stack(padded_df['features'])
print('Input feature shape:', X_gold.shape)

Input feature shape: (110, 40, 768)


In [53]:

from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = 41

padded_gold_data = pad_sequences(X_gold, maxlen=max_sequence_length, padding='post', truncating='post')


In [55]:
X = np.reshape(padded_gold_data, (110, 41 * 768))  


In [56]:
gold_features = np.array(X)

In [57]:
print(gold_features.shape)

(110, 31488)


In [58]:

predictions = loaded_model.predict(gold_features)



In [59]:
print(predictions)

[[0.5156709 ]
 [0.46329266]
 [0.48386526]
 [0.48698324]
 [0.4597918 ]
 [0.5117012 ]
 [0.55958515]
 [0.49263036]
 [0.5181461 ]
 [0.5651391 ]
 [0.5719571 ]
 [0.5850078 ]
 [0.52914345]
 [0.46887633]
 [0.49627924]
 [0.51505166]
 [0.46910596]
 [0.61082125]
 [0.5784122 ]
 [0.562088  ]
 [0.42101935]
 [0.4267837 ]
 [0.49372813]
 [0.5099395 ]
 [0.4629725 ]
 [0.44091418]
 [0.61902326]
 [0.51560116]
 [0.4711641 ]
 [0.51210064]
 [0.47374916]
 [0.43829653]
 [0.45188868]
 [0.4966091 ]
 [0.47757944]
 [0.5735184 ]
 [0.5487175 ]
 [0.5747378 ]
 [0.5403084 ]
 [0.52733374]
 [0.50542456]
 [0.563625  ]
 [0.47232357]
 [0.5260281 ]
 [0.46255192]
 [0.5965444 ]
 [0.5484059 ]
 [0.6394912 ]
 [0.4945289 ]
 [0.51672506]
 [0.49168605]
 [0.49469286]
 [0.6925488 ]
 [0.4873956 ]
 [0.4502115 ]
 [0.44138297]
 [0.4800793 ]
 [0.5998421 ]
 [0.5758794 ]
 [0.47859108]
 [0.56929463]
 [0.40510198]
 [0.40537232]
 [0.4268184 ]
 [0.4473968 ]
 [0.47793162]
 [0.46722105]
 [0.5847932 ]
 [0.49396613]
 [0.50410324]
 [0.47221458]
 [0.51

In [60]:
gold['prediction']=pd.DataFrame(predictions)

In [61]:
gold

Unnamed: 0,Id,tweet,emotion,score,features,prediction
0,21147,"[know, going, one, those, nights, takes, act, ...",fear,0.771,"[[-0.20082852, 0.23499466, 0.26666716, -0.2509...",0.515671
1,21148,"[#horrible, lewis, dunk, begun, networking, ne...",fear,0.479,"[[0.0052761673, 0.09974696, 0.24677433, 0.1428...",0.463293
2,21149,"[speaking, ex, cobblers, saw, ricky, holmes, c...",fear,0.417,"[[-0.1114298, 0.11400322, 0.12179627, -0.36932...",0.483865
3,21150,"[ball, watching, amp, rojod, header, equally, ...",fear,0.475,"[[-0.43671992, 0.057505418, -0.19090301, -0.25...",0.486983
4,21151,"[really, #jumanji, rock, jack, black, kevin, h...",fear,0.542,"[[-0.33436796, 0.09009981, -0.15924482, -0.018...",0.459792
...,...,...,...,...,...,...
105,21252,"[staff, fr, asked, info, told, look, online, g...",fear,0.312,"[[-0.1426141, 0.0785741, 0.31430638, -0.066660...",0.443310
106,21253,"[staff, fr, asked, info, told, look, online, g...",fear,0.271,"[[-0.16974156, 0.0924546, 0.22362891, -0.04701...",0.481825
107,21254,"[adviser, #european, #unions, top, #court, sai...",fear,0.500,"[[-0.6108634, 0.1406612, -0.09995471, -0.29053...",0.579078
108,21255,"[mths, ago, signed, their, velocity, ff, deal,...",fear,0.479,"[[-0.73062104, 0.021704849, 0.4836302, 0.01967...",0.473991


In [62]:
from sklearn.metrics import mean_squared_error

In [63]:
mse = mean_squared_error(gold['score'], gold['prediction'])
print("Mean square Error:", mse)


Mean square Error: 0.033003402965125085


In [64]:
test= pd.read_csv("Desktop/emoint tweet/fear-ratings-0to1.test.target.txt", delimiter='\t', header=None)
test.columns = ['Id', 'tweet', 'emotion', 'score']

In [65]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
test['tweet'] = test['tweet'].apply(preprocess_tweet)

In [66]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

test['tweet'] = test['tweet'].apply(remove_mentions)
test

Unnamed: 0,Id,tweet,emotion,score
0,21257,#matthew 25; 1-13\ncould somebody shoot a #vid...,fear,NONE
1,21258,which really sucks because typing on a mobil...,fear,NONE
2,21259,be #afraid of the #quiet ones they are the one...,fear,NONE
3,21260,he's a horrible person and now i gag when i s...,fear,NONE
4,21261,what we fear doing most is usually what we mos...,fear,NONE
...,...,...,...,...
990,22247,"9 -9 vs atlanta this yr, 2 - 11 vs rockies an...",fear,NONE
991,22248,i'm shaking now.,fear,NONE
992,22249,me: are you guys dating yet #trans #nervous #b...,fear,NONE
993,22250,she: why are you listening to the eurythmics?\...,fear,NONE


In [67]:
def clean_test(test):
    test  = re.sub(r'[^\w\s#@]', '', test ) 
    test  = re.sub(r'\d+', '', test)  
    test  = re.sub(r'\s+', ' ', test).strip()
    return test

test['tweet'] = test['tweet'].apply(clean_test)


In [68]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

test['tweet'] = test['tweet'].apply(tokenize_tweets)

In [69]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'this', 'all',
    'am', 'or', 'but', 'if', 'my', 'me', 'we', 'us', 'our', 'we', 'up', 'down', 'out', 'just', 'how', 'why',
    'when', 'where', 'here', 'there', 'about', 'more', 'most', 'some', 'any', 'few', 'many', 'much', 'not',
    'only', 'other', 'same', 'such', 'no', 'nor', 'too', 'very', 'can', 'cannot', 'could', 'should', 'would',
    'might', 'must', 'shall', 'will', 'isn', 'hasn', 'doesn', 'haven', 'didn', 'hadn', 'wasn', 'weren',
    'wouldn', 'shouldn', 'ain', 'aren', 'ma'
]

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens


test['tweet'] = test['tweet'].apply(remove_stopwords)



In [70]:
import torch
from transformers import BertModel, BertTokenizer


model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    return features


test['features'] = test['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [71]:
test["features"][0].shape

(22, 768)

In [72]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
features =test['features'].tolist()

padded_features = pad_sequences(features, padding='post')

padded_df = test.copy()
padded_df['features'] = padded_features.tolist()

In [73]:
X_test= np.stack(padded_df['features'])
print('Input feature shape:', X_test.shape)

Input feature shape: (995, 38, 768)


In [74]:

from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = 41
truncated_test_data = pad_sequences(X_test, maxlen=max_sequence_length, padding='post', truncating='post')

print(truncated_test_data.shape)

(995, 41, 768)


In [76]:
X = np.reshape(truncated_test_data, (995, 41 * 768)) 

In [77]:
test_features = np.array(X)

In [78]:
print(test_features.shape)

(995, 31488)


In [79]:

predictions = loaded_model.predict(test_features)



In [80]:
print(predictions)

[[0.50026184]
 [0.5481893 ]
 [0.5215363 ]
 [0.4408408 ]
 [0.563223  ]
 [0.5123622 ]
 [0.5534655 ]
 [0.5848606 ]
 [0.5374498 ]
 [0.543158  ]
 [0.6040136 ]
 [0.55956805]
 [0.5181162 ]
 [0.5662841 ]
 [0.5511442 ]
 [0.53213876]
 [0.5410506 ]
 [0.4854576 ]
 [0.4529214 ]
 [0.4973553 ]
 [0.5497619 ]
 [0.4713636 ]
 [0.5279356 ]
 [0.54562026]
 [0.52188855]
 [0.537604  ]
 [0.4972305 ]
 [0.58875924]
 [0.5560321 ]
 [0.45461798]
 [0.45070103]
 [0.49043754]
 [0.4762447 ]
 [0.4232683 ]
 [0.47513476]
 [0.52878857]
 [0.5637623 ]
 [0.7293352 ]
 [0.5697651 ]
 [0.52456087]
 [0.4633235 ]
 [0.46756002]
 [0.51122046]
 [0.5574097 ]
 [0.42910868]
 [0.4861828 ]
 [0.419192  ]
 [0.50288814]
 [0.5670861 ]
 [0.4938065 ]
 [0.48123902]
 [0.5124925 ]
 [0.5535056 ]
 [0.51496863]
 [0.57860893]
 [0.5478402 ]
 [0.5001272 ]
 [0.5517594 ]
 [0.5204648 ]
 [0.45577502]
 [0.41979095]
 [0.489095  ]
 [0.5100465 ]
 [0.51027256]
 [0.6097312 ]
 [0.5569446 ]
 [0.46642476]
 [0.3641048 ]
 [0.42972398]
 [0.5477738 ]
 [0.53355587]
 [0.50

In [81]:
test['score']=pd.DataFrame(predictions)

In [82]:
test

Unnamed: 0,Id,tweet,emotion,score,features
0,21257,"[#matthew, ncould, somebody, shoot, #video, it...",fear,0.500262,"[[0.089441895, 0.4333227, 0.16138215, -0.14375..."
1,21258,"[which, really, sucks, because, typing, mobile...",fear,0.548189,"[[0.23424134, 0.25051132, 0.11579197, -0.18509..."
2,21259,"[#afraid, #quiet, ones, they, ones, who, actua...",fear,0.521536,"[[0.06307171, 0.042609587, 0.071758054, -0.120..."
3,21260,"[hes, horrible, person, now, gag, see, people,...",fear,0.440841,"[[0.12389825, 0.647158, 0.07964105, 0.02110767..."
4,21261,"[what, fear, doing, usually, what, need, do, t...",fear,0.563223,"[[-0.45358893, 0.08992868, 0.26345077, -0.0969..."
...,...,...,...,...,...
990,22247,"[vs, atlanta, yr, vs, rockies, dbacks, yr, tha...",fear,0.363890,"[[-1.0402302, -0.45375535, -0.13067436, -0.273..."
991,22248,"[im, shaking, now]",fear,0.515992,"[[-0.13404948, 0.59542483, 0.16308185, -0.3877..."
992,22249,"[guys, dating, yet, #trans, #nervous, #blowjob...",fear,0.618405,"[[-0.4921757, -0.14156745, 0.11549412, 0.06342..."
993,22250,"[she, listening, eurythmicsnme, polish, gothic...",fear,0.509278,"[[-0.24457596, 0.50185347, 0.3598852, -0.57467..."


In [83]:
test=test.drop('features',axis=1)

In [84]:
test

Unnamed: 0,Id,tweet,emotion,score
0,21257,"[#matthew, ncould, somebody, shoot, #video, it...",fear,0.500262
1,21258,"[which, really, sucks, because, typing, mobile...",fear,0.548189
2,21259,"[#afraid, #quiet, ones, they, ones, who, actua...",fear,0.521536
3,21260,"[hes, horrible, person, now, gag, see, people,...",fear,0.440841
4,21261,"[what, fear, doing, usually, what, need, do, t...",fear,0.563223
...,...,...,...,...
990,22247,"[vs, atlanta, yr, vs, rockies, dbacks, yr, tha...",fear,0.363890
991,22248,"[im, shaking, now]",fear,0.515992
992,22249,"[guys, dating, yet, #trans, #nervous, #blowjob...",fear,0.618405
993,22250,"[she, listening, eurythmicsnme, polish, gothic...",fear,0.509278
