In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df1= pd.read_csv(r"C:\Users\i\Downloads\fear-ratings-0to1.train.txt", delimiter='\t', header=None)
df1.columns = ['Id', 'tweet', 'emotion', 'score']

In [3]:
print(df1)


         Id                                              tweet emotion  score
0     20000  I feel like I am drowning. #depression #anxiet...    fear  0.979
1     20001  I get so nervous even thinking about talking t...    fear  0.979
2     20002                     I lost my blinders .... #panic    fear  0.975
3     20003  I feel like I am drowning. #depression  #falur...    fear  0.938
4     20004  This is the scariest American Horror Story out...    fear  0.938
...     ...                                                ...     ...    ...
1142  21142     Pull over #tonight and make your car #shake 😋💦    fear  0.104
1143  21143  @Melanie_Pierce @HunterHayes awe ain't he a sw...    fear  0.083
1144  21144         @FraserKeegan just had a steak pie supper     fear  0.083
1145  21145      @annalisewrobel_ awe thank you so much love 💕    fear  0.062
1146  21146                             Omg he kissed her🙈  #w    fear  0.062

[1147 rows x 4 columns]


In [4]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import emoji
import re

In [5]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df1['tweet'] = df1['tweet'].apply(preprocess_tweet)

In [6]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df1['tweet'] = df1['tweet'].apply(remove_mentions)


In [7]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text) 
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df1['tweet'] = df1['tweet'].apply(clean_text)



In [8]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,20000,i feel like i am drowning #depression #anxiety...,fear,0.979
1,20001,i get so nervous even thinking about talking t...,fear,0.979
2,20002,i lost my blinders #panic,fear,0.975
3,20003,i feel like i am drowning #depression #falure ...,fear,0.938
4,20004,this is the scariest american horror story out...,fear,0.938
...,...,...,...,...
1142,21142,pull over #tonight and make your car #shake,fear,0.104
1143,21143,awe aint he a sweetheart hes adorable,fear,0.083
1144,21144,just had a steak pie supper,fear,0.083
1145,21145,awe thank you so much love,fear,0.062


In [9]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text

In [10]:
df1['tweet'] = df1['tweet'].apply(convert_emoji)

In [11]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df1['tweet'] = df1['tweet'].apply(tokenize_tweets)

In [12]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,20000,"[i, feel, like, i, am, drowning, #depression, ...",fear,0.979
1,20001,"[i, get, so, nervous, even, thinking, about, t...",fear,0.979
2,20002,"[i, lost, my, blinders, #panic]",fear,0.975
3,20003,"[i, feel, like, i, am, drowning, #depression, ...",fear,0.938
4,20004,"[this, is, the, scariest, american, horror, st...",fear,0.938
...,...,...,...,...
1142,21142,"[pull, over, #tonight, and, make, your, car, #...",fear,0.104
1143,21143,"[awe, aint, he, a, sweetheart, hes, adorable]",fear,0.083
1144,21144,"[just, had, a, steak, pie, supper]",fear,0.083
1145,21145,"[awe, thank, you, so, much, love]",fear,0.062


In [13]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'all',
    'about', 'above', 'after', 'again', 'against', 'ain', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
    'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no',
    'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

df1['tweet'] = df1['tweet'].apply(remove_stopwords)

In [14]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,20000,"[feel, like, drowning, #depression, #anxiety, ...",fear,0.979
1,20001,"[get, nervous, even, thinking, talking, wanna,...",fear,0.979
2,20002,"[lost, blinders, #panic]",fear,0.975
3,20003,"[feel, like, drowning, #depression, #falure, #...",fear,0.938
4,20004,"[scariest, american, horror, story, im, gonna,...",fear,0.938
...,...,...,...,...
1142,21142,"[pull, #tonight, make, car, #shake]",fear,0.104
1143,21143,"[awe, aint, sweetheart, hes, adorable]",fear,0.083
1144,21144,"[steak, pie, supper]",fear,0.083
1145,21145,"[awe, thank, much, love]",fear,0.062


In [15]:
import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features

df1['features'] = df1['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
from sklearn.utils import shuffle
df1 = shuffle(df1)
df1

Unnamed: 0,Id,tweet,emotion,score,features
481,20481,"[peoples, deepest, passions, often, scare, muc...",fear,0.521,"[[-0.10522045, 0.3467327, -0.115954235, 0.0466..."
779,20779,"[worry, emphasis, keeping, family, together, g...",fear,0.396,"[[0.02802974, 0.16173235, -0.29791534, -0.2053..."
1111,21111,"[mattmilne, thank, letting, us, know, please, ...",fear,0.146,"[[-0.44774464, 0.15478161, 0.22762844, -0.2787..."
713,20713,"[says, teufel, reassigned, within, organizatio...",fear,0.417,"[[-0.993639, -0.15127987, -0.2219348, -0.25596..."
753,20753,"[follow, amazing, australian, author, #fiction...",fear,0.396,"[[-0.003132152, 0.0018000547, -0.03818727, 0.2..."
...,...,...,...,...,...
147,20147,"[go, back, weeks, start, seriously, dreadful]",fear,0.729,"[[0.18815905, 0.42615047, -0.043639872, 0.1192..."
327,20327,"[overtime, #teamna, #wch, #nervous]",fear,0.604,"[[-0.74993813, -0.17208055, -0.07791391, -0.13..."
432,20432,"[terrorism, intimidate, populace, case, held, ...",fear,0.562,"[[0.16953777, 0.2876624, -0.24679744, -0.06413..."
885,20885,"[ending, met, mother, dreadful]",fear,0.333,"[[-0.22394809, -0.0057124486, -0.115094736, -0..."


In [17]:
df1.features[0].shape

(15, 768)

In [18]:

df1.head(10)

Unnamed: 0,Id,tweet,emotion,score,features
481,20481,"[peoples, deepest, passions, often, scare, muc...",fear,0.521,"[[-0.10522045, 0.3467327, -0.115954235, 0.0466..."
779,20779,"[worry, emphasis, keeping, family, together, g...",fear,0.396,"[[0.02802974, 0.16173235, -0.29791534, -0.2053..."
1111,21111,"[mattmilne, thank, letting, us, know, please, ...",fear,0.146,"[[-0.44774464, 0.15478161, 0.22762844, -0.2787..."
713,20713,"[says, teufel, reassigned, within, organizatio...",fear,0.417,"[[-0.993639, -0.15127987, -0.2219348, -0.25596..."
753,20753,"[follow, amazing, australian, author, #fiction...",fear,0.396,"[[-0.003132152, 0.0018000547, -0.03818727, 0.2..."
125,20125,"[focal, points, war, lie, #terrorism, #un, nee...",fear,0.75,"[[-0.3313101, 0.19597748, -0.13628939, -0.1737..."
939,20939,"[walk, right, see, way, past, dont, even, hesi...",fear,0.312,"[[-0.12020782, 0.26181188, -0.15412205, -0.371..."
320,20320,"[first, take, room, wanna, beat, #bully]",fear,0.604,"[[-0.29258454, -0.11945913, 0.15943666, -0.004..."
358,20358,"[im, shy, girl]",fear,0.583,"[[-0.39446625, 0.20099491, -0.23982096, -0.095..."
274,20274,"[librarians, scare]",fear,0.646,"[[-0.3052356, 0.12111623, -0.3220107, -0.06530..."


In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features = df1['features'].tolist()
padded_features = pad_sequences(features, padding='post')
padded_df = df1.copy()
padded_df['features'] = padded_features.tolist()

In [20]:
x = np.stack(padded_df['features'])

In [21]:
y = np.array(padded_df['score'])    
print("Input shape:", x.shape)
print("Output shape:", y.shape)

Input shape: (1147, 41, 768)
Output shape: (1147,)


In [24]:
x = np.reshape(x, (1147, 41 * 768))  
y = np.reshape(y, (1147,))

In [25]:
y

array([0.521, 0.396, 0.146, ..., 0.562, 0.333, 0.271])

In [26]:
X=x.copy()  
Y=y.copy()  

In [27]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

alpha = 2000
model = Ridge(alpha=alpha)
model.fit(X, Y)

Ridge(alpha=2000)

In [28]:
y_pred = model.predict(X)
mse = mean_squared_error(Y, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.024659146675577955


In [29]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(Y, y_pred)

print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.12857036427267948


In [30]:
import joblib
save_path = 'regression_model.pkl'
joblib.dump(model, save_path)


['regression_model.pkl']

In [31]:
df2= pd.read_csv(r"C:\Users\i\Downloads\fear-ratings-0to1.dev.gold.txt", delimiter='\t', header=None)
df2.columns = ['Id', 'tweet', 'emotion', 'score']

In [32]:
df2

Unnamed: 0,Id,tweet,emotion,score
0,21147,I know this is going to be one of those nights...,fear,0.771
1,21148,This is #horrible: Lewis Dunk has begun networ...,fear,0.479
2,21149,"@JeffersonLake speaking of ex cobblers, saw Ri...",fear,0.417
3,21150,@1johndes ball watching &amp; Rojo'd header wa...,fear,0.475
4,21151,"Really.....#Jumanji 2....w/ The Rock, Jack Bla...",fear,0.542
...,...,...,...,...
105,21252,Staff on @ryainair FR1005. Asked for info and ...,fear,0.312
106,21253,Staff on @ryainair FR1005. Asked for info and ...,fear,0.271
107,21254,An adviser to the #European #Union’s top #cour...,fear,0.500
108,21255,So about 18mths ago i signed up to @Lumo_Energ...,fear,0.479


In [33]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df2['tweet'] = df2['tweet'].apply(preprocess_tweet)

In [34]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df2['tweet'] = df2['tweet'].apply(remove_mentions)


In [35]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df2['tweet'] = df2['tweet'].apply(clean_text)


In [36]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text
df2['tweet'] = df2['tweet'].apply(convert_emoji)

In [37]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df2['tweet'] = df2['tweet'].apply(tokenize_tweets)

In [38]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'all',
    'about', 'above', 'after', 'again', 'against', 'ain', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
    'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no',
    'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']


def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens
df2['tweet'] = df2['tweet'].apply(remove_stopwords)

In [39]:
import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


def extract_features(tweet):
    
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features
df2['features'] = df2['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features2 = df2['features'].tolist()
padded_features2 = pad_sequences(features2, padding='post')
padded_df2 = df2.copy()
padded_df2['features'] = padded_features2.tolist()

In [41]:
X = np.stack(padded_df2['features'])
print('Input feature shape:', X.shape)

Input feature shape: (110, 40, 768)


In [42]:
print('Input feature shape:', X.shape)

Input feature shape: (110, 40, 768)


In [43]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_sequence_length=41
X_=pad_sequences(X,maxlen=max_sequence_length,padding="post",truncating='post')

In [44]:
X_ = np.reshape(X_, (110, 41 * 768))  

In [45]:
saved_model_path = 'regression_model.pkl'
loaded_model = joblib.load(saved_model_path)

In [46]:
predictions = loaded_model.predict(X_)
predictions

array([0.47317431, 0.51554596, 0.51838605, 0.48937823, 0.46480545,
       0.4940073 , 0.46339023, 0.46041275, 0.51100973, 0.57354942,
       0.54629037, 0.50375776, 0.54503522, 0.51344387, 0.51606033,
       0.50013619, 0.5203113 , 0.49215348, 0.47690336, 0.53233144,
       0.45409032, 0.40398657, 0.53771556, 0.51697993, 0.5133957 ,
       0.49163083, 0.5535666 , 0.52349757, 0.48908679, 0.49028713,
       0.46914896, 0.49873005, 0.4455776 , 0.53374283, 0.52689141,
       0.49410597, 0.53443457, 0.49314568, 0.50862165, 0.53405874,
       0.49104613, 0.52270607, 0.47841745, 0.51828496, 0.45073503,
       0.53704695, 0.50829522, 0.53490113, 0.49431941, 0.48801483,
       0.50556369, 0.52735146, 0.50163971, 0.4270643 , 0.47559902,
       0.48699087, 0.48636851, 0.48628448, 0.52353165, 0.50077538,
       0.48705047, 0.4627847 , 0.38544998, 0.48306581, 0.50358646,
       0.48745773, 0.47107445, 0.52047463, 0.49071267, 0.49484818,
       0.51687043, 0.48300814, 0.49893933, 0.50645369, 0.51864

In [47]:
df2['predict']=pd.DataFrame(predictions)

In [48]:
mse=mean_squared_error(df2['score'],df2['predict'])
print("mean_square_error:",mse)

mean_square_error: 0.030899198410693333


In [49]:
mae = mean_absolute_error(df2['score'],df2['predict'])
print("Mean Absolute Error:", mae)


Mean Absolute Error: 0.1430721146543378


In [50]:
df2

Unnamed: 0,Id,tweet,emotion,score,features,predict
0,21147,"[know, going, one, nights, takes, act, god, fa...",fear,0.771,"[[-0.3144856, 0.15769926, 0.43722436, -0.39113...",0.473174
1,21148,"[#horrible, lewis, dunk, begun, networking, ne...",fear,0.479,"[[-0.25642213, 0.11772158, 0.17571282, -0.0315...",0.515546
2,21149,"[speaking, ex, cobblers, saw, ricky, holmes, c...",fear,0.417,"[[-0.11143019, 0.11400332, 0.12179603, -0.3693...",0.518386
3,21150,"[ball, watching, amp, rojod, header, equally, ...",fear,0.475,"[[-0.43671992, 0.057505384, -0.19090268, -0.25...",0.489378
4,21151,"[really, #jumanji, rock, jack, black, kevin, h...",fear,0.542,"[[-0.3343676, 0.090099744, -0.1592446, -0.0180...",0.464805
...,...,...,...,...,...,...
105,21252,"[staff, fr, asked, info, told, look, online, g...",fear,0.312,"[[-0.1450364, 0.06649841, 0.24776919, -0.07256...",0.473111
106,21253,"[staff, fr, asked, info, told, look, online, g...",fear,0.271,"[[-0.14554901, 0.13766387, 0.14240709, -0.0481...",0.496907
107,21254,"[adviser, #european, #unions, top, #court, sai...",fear,0.500,"[[-0.66618294, 0.1709193, -0.10170758, -0.2963...",0.526565
108,21255,"[mths, ago, signed, velocity, ff, deal, months...",fear,0.479,"[[-0.97019094, -0.03449127, 0.4789242, -0.0327...",0.505192


In [51]:
df3= pd.read_csv(r"C:\Users\i\Downloads\fear-ratings-0to1.test.target.txt", delimiter='\t', header=None)
df3.columns = ['Id', 'tweet', 'emotion', 'score']

In [52]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df3['tweet'] = df3['tweet'].apply(preprocess_tweet)

In [53]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df3['tweet'] = df3['tweet'].apply(remove_mentions)

In [54]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df3['tweet'] = df3['tweet'].apply(clean_text)


In [55]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text
df3['tweet'] = df3['tweet'].apply(convert_emoji)

In [56]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df3['tweet'] = df3['tweet'].apply(tokenize_tweets)

In [57]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'all',
    'about', 'above', 'after', 'again', 'against', 'ain', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
    'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no',
    'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']


def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens
df3['tweet'] = df3['tweet'].apply(remove_stopwords)

In [58]:
import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


def extract_features(tweet):
    
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features
df3['features'] = df3['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
df3['features'][713].shape

(7, 768)

In [60]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features3 = df3['features'].tolist()
padded_features3 = pad_sequences(features3, padding='post')
padded_df3 = df3.copy()
padded_df3['features'] = padded_features3.tolist()

In [61]:
X = np.stack(padded_df3['features'])
print('Input feature shape:', X.shape)

Input feature shape: (995, 38, 768)


In [62]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_sequence_length=41
X_=pad_sequences(X,maxlen=max_sequence_length,padding="post",truncating='post')

In [63]:
X_ = np.reshape(X_, (995, 41 * 768))  

In [64]:
saved_model_path = 'regression_model.pkl'
loaded_model = joblib.load(saved_model_path)

In [65]:
predicttest = loaded_model.predict(X_)
predicttest

array([0.56163864, 0.51306322, 0.4911582 , 0.48265492, 0.50799626,
       0.48488318, 0.55193031, 0.49922372, 0.53275937, 0.59387914,
       0.53834522, 0.50110795, 0.51439792, 0.53761572, 0.5607375 ,
       0.52954809, 0.49246717, 0.50880134, 0.44178479, 0.49924436,
       0.46037391, 0.44879661, 0.49210093, 0.49768345, 0.5224496 ,
       0.50648248, 0.51934193, 0.52158265, 0.47562154, 0.48426472,
       0.43132165, 0.55888839, 0.52327494, 0.47467362, 0.45285242,
       0.52749383, 0.48436272, 0.53138231, 0.58127494, 0.55932536,
       0.4564952 , 0.5140102 , 0.51263206, 0.55578844, 0.51086681,
       0.56617973, 0.45498749, 0.48492568, 0.51306675, 0.44088841,
       0.51016353, 0.5084152 , 0.51109518, 0.52650817, 0.5334673 ,
       0.53578055, 0.50776358, 0.48758918, 0.50693228, 0.45353677,
       0.40191551, 0.53343769, 0.51720829, 0.52467952, 0.56076556,
       0.53943856, 0.51978126, 0.45764097, 0.50910672, 0.483982  ,
       0.43510129, 0.51312987, 0.5722104 , 0.45628035, 0.47976

In [66]:
df3['predicttest']=pd.DataFrame(predicttest)

In [67]:
df3=df3.drop(['score','features'],axis=1)

In [68]:
df3

Unnamed: 0,Id,tweet,emotion,predicttest
0,21257,"[#matthew, ncould, somebody, shoot, #video, it...",fear,0.561639
1,21258,"[really, sucks, typing, mobile, device, always...",fear,0.513063
2,21259,"[#afraid, #quiet, ones, ones, actually, #think]",fear,0.491158
3,21260,"[hes, horrible, person, gag, see, people, quote]",fear,0.482655
4,21261,"[fear, usually, need, tim, ferriss, #inspiring...",fear,0.507996
...,...,...,...,...
990,22247,"[vs, atlanta, yr, vs, rockies, dbacks, yr, tha...",fear,0.451459
991,22248,"[im, shaking]",fear,0.507103
992,22249,"[guys, dating, yet, #trans, #nervous, #blowjob...",fear,0.591805
993,22250,"[listening, eurythmicsnme, polish, gothic, met...",fear,0.526628
