In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df1= pd.read_csv(r"C:\Users\i\Downloads\sadness-ratings-0to1.train.txt", delimiter='\t', header=None)
df1.columns = ['Id', 'tweet', 'emotion', 'score']

In [3]:
print(df1)


        Id                                              tweet  emotion  score
0    40000                      Depression sucks! #depression  sadness  0.958
1    40001            Feeling worthless as always #depression  sadness  0.958
2    40002                       Feeling worthless as always   sadness  0.958
3    40003  My #Fibromyalgia has been really bad lately wh...  sadness  0.946
4    40004  Im think ima lay in bed all day and sulk. Life...  sadness  0.934
..     ...                                                ...      ...    ...
781  40781  @VivienLloyd Thank you so much! Just home - st...  sadness  0.104
782  40782              Just put the winter duvet on ☃️❄️🌬☔️   sadness  0.104
783  40783  @SilkInSide @TommyJoeRatliff that's so pretty!...  sadness  0.088
784  40784  @BluesfestByron second artist announcement loo...  sadness  0.083
785  40785  I can literally eat creamy pesto pasta topped ...  sadness  0.083

[786 rows x 4 columns]


In [4]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import emoji
import re

In [5]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df1['tweet'] = df1['tweet'].apply(preprocess_tweet)

In [6]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df1['tweet'] = df1['tweet'].apply(remove_mentions)


In [7]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text) 
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df1['tweet'] = df1['tweet'].apply(clean_text)



In [8]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,40000,depression sucks #depression,sadness,0.958
1,40001,feeling worthless as always #depression,sadness,0.958
2,40002,feeling worthless as always,sadness,0.958
3,40003,my #fibromyalgia has been really bad lately wh...,sadness,0.946
4,40004,im think ima lay in bed all day and sulk life ...,sadness,0.934
...,...,...,...,...
781,40781,thank you so much just home stunned but so hap...,sadness,0.104
782,40782,just put the winter duvet on,sadness,0.104
783,40783,thats so pretty i love the sky in the backgrou...,sadness,0.088
784,40784,second artist announcement looking good #blues...,sadness,0.083


In [9]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text

In [10]:
df1['tweet'] = df1['tweet'].apply(convert_emoji)

In [11]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df1['tweet'] = df1['tweet'].apply(tokenize_tweets)

In [12]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,40000,"[depression, sucks, #depression]",sadness,0.958
1,40001,"[feeling, worthless, as, always, #depression]",sadness,0.958
2,40002,"[feeling, worthless, as, always]",sadness,0.958
3,40003,"[my, #fibromyalgia, has, been, really, bad, la...",sadness,0.946
4,40004,"[im, think, ima, lay, in, bed, all, day, and, ...",sadness,0.934
...,...,...,...,...
781,40781,"[thank, you, so, much, just, home, stunned, bu...",sadness,0.104
782,40782,"[just, put, the, winter, duvet, on]",sadness,0.104
783,40783,"[thats, so, pretty, i, love, the, sky, in, the...",sadness,0.088
784,40784,"[second, artist, announcement, looking, good, ...",sadness,0.083


In [13]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'all',
    'about', 'above', 'after', 'again', 'against', 'ain', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
    'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no',
    'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

df1['tweet'] = df1['tweet'].apply(remove_stopwords)

In [14]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,40000,"[depression, sucks, #depression]",sadness,0.958
1,40001,"[feeling, worthless, always, #depression]",sadness,0.958
2,40002,"[feeling, worthless, always]",sadness,0.958
3,40003,"[#fibromyalgia, really, bad, lately, good, men...",sadness,0.946
4,40004,"[im, think, ima, lay, bed, day, sulk, life, hi...",sadness,0.934
...,...,...,...,...
781,40781,"[thank, much, home, stunned, happy, dont, thin...",sadness,0.104
782,40782,"[put, winter, duvet]",sadness,0.104
783,40783,"[thats, pretty, love, sky, background, purple,...",sadness,0.088
784,40784,"[second, artist, announcement, looking, good, ...",sadness,0.083


In [15]:
import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def extract_features(tweet):
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features

df1['features'] = df1['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
from sklearn.utils import shuffle
df1 = shuffle(df1)
df1

Unnamed: 0,Id,tweet,emotion,score,features
250,40250,"[blame, manager, watching, players, play, abys...",sadness,0.604,"[[-0.18551016, 0.14533976, 0.095107906, -0.145..."
275,40275,"[dont, let, behavior, others, destroy, ur, inn...",sadness,0.583,"[[-0.3205233, 0.28793436, -0.044567768, -0.051..."
130,40130,"[weeks, full, mondays, end, #disheartened]",sadness,0.688,"[[0.016893499, 0.10395066, 0.3136695, -0.03307..."
644,40644,"[bows, moment, rather, sulk, im, going, opinio...",sadness,0.312,"[[-0.28913236, 0.28231645, -0.009142242, -0.26..."
79,40079,"[god, full, shilling, seriously, need, major, ...",sadness,0.750,"[[-0.11578786, 0.16990525, 0.34702885, -0.2188..."
...,...,...,...,...,...
684,40684,"[#pessimist, complains, wind, #optimist, expec...",sadness,0.271,"[[-0.57726145, 0.15732256, -0.009420659, -0.09..."
542,40542,"[hard, tell, pic, mistake, either, way, nothin...",sadness,0.375,"[[-0.48719376, -0.17063761, 0.1746943, -0.0378..."
530,40530,"[sulky, pants]",sadness,0.396,"[[-0.28299236, 0.0910921, -0.27471957, 0.04074..."
523,40523,"[theres, sitcom, better, cant, laugh, sheldon,...",sadness,0.396,"[[-0.41907048, -0.24152792, 0.14949818, -0.070..."


In [17]:
df1.features[0].shape

(6, 768)

In [18]:

df1.head(10)

Unnamed: 0,Id,tweet,emotion,score,features
250,40250,"[blame, manager, watching, players, play, abys...",sadness,0.604,"[[-0.18551016, 0.14533976, 0.095107906, -0.145..."
275,40275,"[dont, let, behavior, others, destroy, ur, inn...",sadness,0.583,"[[-0.3205233, 0.28793436, -0.044567768, -0.051..."
130,40130,"[weeks, full, mondays, end, #disheartened]",sadness,0.688,"[[0.016893499, 0.10395066, 0.3136695, -0.03307..."
644,40644,"[bows, moment, rather, sulk, im, going, opinio...",sadness,0.312,"[[-0.28913236, 0.28231645, -0.009142242, -0.26..."
79,40079,"[god, full, shilling, seriously, need, major, ...",sadness,0.75,"[[-0.11578786, 0.16990525, 0.34702885, -0.2188..."
174,40174,"[dont, see, difference, courting, appealing, w...",sadness,0.667,"[[-0.70383626, 0.20942569, -0.16147777, -0.057..."
514,40514,"[pessimist, someone, opportunity, knocks, comp...",sadness,0.396,"[[-0.028500024, 0.4877816, -0.030809958, -0.59..."
529,40529,"[episode, today, whilst, editing, power, outag...",sadness,0.396,"[[-0.3621486, -0.09056032, 0.17661268, -0.1874..."
423,40423,"[history, repeating, itselfgaa, culture, dare,...",sadness,0.458,"[[-0.40033904, -0.03608309, 0.027729334, -0.31..."
592,40592,"[rich, fumes, sullen, sences, cheerd]",sadness,0.342,"[[0.006814859, 0.261758, -0.032302003, -0.0964..."


In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features = df1['features'].tolist()
padded_features = pad_sequences(features, padding='post')
padded_df = df1.copy()
padded_df['features'] = padded_features.tolist()

In [20]:
x = np.stack(padded_df['features'])

In [21]:
y = np.array(padded_df['score'])    
print("Input shape:", x.shape)
print("Output shape:", y.shape)

Input shape: (786, 41, 768)
Output shape: (786,)


In [22]:
x = np.reshape(x, (786, 41* 768))  
y = np.reshape(y, (786,))

In [23]:
y

array([0.604, 0.583, 0.688, 0.312, 0.75 , 0.667, 0.396, 0.396, 0.458,
       0.342, 0.354, 0.461, 0.458, 0.667, 0.458, 0.812, 0.833, 0.417,
       0.188, 0.771, 0.667, 0.458, 0.417, 0.333, 0.438, 0.438, 0.583,
       0.333, 0.354, 0.583, 0.125, 0.562, 0.438, 0.708, 0.562, 0.375,
       0.479, 0.438, 0.312, 0.604, 0.458, 0.417, 0.278, 0.417, 0.521,
       0.46 , 0.214, 0.208, 0.321, 0.667, 0.854, 0.146, 0.458, 0.271,
       0.646, 0.438, 0.24 , 0.333, 0.625, 0.542, 0.369, 0.729, 0.688,
       0.312, 0.604, 0.25 , 0.375, 0.375, 0.604, 0.646, 0.375, 0.646,
       0.375, 0.146, 0.44 , 0.5  , 0.188, 0.613, 0.771, 0.271, 0.375,
       0.729, 0.167, 0.396, 0.583, 0.479, 0.25 , 0.354, 0.771, 0.604,
       0.833, 0.75 , 0.87 , 0.583, 0.438, 0.479, 0.521, 0.542, 0.417,
       0.52 , 0.625, 0.438, 0.812, 0.604, 0.669, 0.792, 0.132, 0.396,
       0.396, 0.271, 0.958, 0.625, 0.417, 0.688, 0.458, 0.562, 0.375,
       0.541, 0.604, 0.708, 0.271, 0.354, 0.667, 0.271, 0.312, 0.792,
       0.688, 0.188,

In [24]:
X=x.copy()  
Y=y.copy()  

In [25]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

alpha = 2000
model = Ridge(alpha=alpha)
model.fit(X, Y)

Ridge(alpha=2000)

In [26]:
y_pred = model.predict(X)
mse = mean_squared_error(Y, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.02436696906053771


In [27]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(Y, y_pred)

print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.12935629765480813


In [28]:
import joblib
save_path = 'regression_model.pkl'
joblib.dump(model, save_path)


['regression_model.pkl']

In [29]:
df2= pd.read_csv(r"C:\Users\i\Downloads\sadness-ratings-0to1.dev.gold.txt", delimiter='\t', header=None)
df2.columns = ['Id', 'tweet', 'emotion', 'score']

In [30]:
df2

Unnamed: 0,Id,tweet,emotion,score
0,40786,@1johndes ball watching &amp; Rojo'd header wa...,sadness,0.583
1,40787,"A pessimist is someone who, when opportunity k...",sadness,0.188
2,40788,A .500 season is all I'm looking for at this p...,sadness,0.688
3,40789,"Stars, when you shine,\nYou know how I feel.\n...",sadness,0.292
4,40790,All I want to do is watch some netflix but I a...,sadness,0.667
...,...,...,...,...
69,40855,Common app just randomly logged me out as I wa...,sadness,0.833
70,40856,"I'd rather laugh with the rarest genius, in be...",sadness,0.688
71,40857,If you #invest in my new #film I will stop ask...,sadness,0.458
72,40858,"Just watched Django Unchained, Other people ma...",sadness,0.333


In [31]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df2['tweet'] = df2['tweet'].apply(preprocess_tweet)

In [32]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df2['tweet'] = df2['tweet'].apply(remove_mentions)


In [33]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df2['tweet'] = df2['tweet'].apply(clean_text)


In [34]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text
df2['tweet'] = df2['tweet'].apply(convert_emoji)

In [35]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df2['tweet'] = df2['tweet'].apply(tokenize_tweets)

In [36]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'all',
    'about', 'above', 'after', 'again', 'against', 'ain', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
    'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no',
    'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']


def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens
df2['tweet'] = df2['tweet'].apply(remove_stopwords)

In [37]:
import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


def extract_features(tweet):
    
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features
df2['features'] = df2['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features2 = df2['features'].tolist()
padded_features2 = pad_sequences(features2, padding='post')
padded_df2 = df2.copy()
padded_df2['features'] = padded_features2.tolist()

In [39]:
X = np.stack(padded_df2['features'])
print('Input feature shape:', X.shape)

Input feature shape: (74, 30, 768)


In [40]:
print('Input feature shape:', X.shape)

Input feature shape: (74, 30, 768)


In [44]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_sequence_length=41
X_=pad_sequences(X,maxlen=max_sequence_length,padding="post",truncating='post')

In [46]:
X_ = np.reshape(X_, (74, 41 * 768))  

In [47]:
saved_model_path = 'regression_model.pkl'
loaded_model = joblib.load(saved_model_path)

In [48]:
predictions = loaded_model.predict(X_)
predictions

array([0.48784655, 0.4817829 , 0.51502776, 0.48867738, 0.51152269,
       0.47268669, 0.51427968, 0.54073416, 0.49474977, 0.45404355,
       0.47473635, 0.53372453, 0.5461583 , 0.51911273, 0.50145796,
       0.49031687, 0.46854585, 0.50248791, 0.46383519, 0.50515464,
       0.50919774, 0.5064765 , 0.5306432 , 0.47929434, 0.45840868,
       0.4404831 , 0.47490827, 0.52702248, 0.48262534, 0.45128623,
       0.5590954 , 0.56246497, 0.48169456, 0.50034924, 0.45543836,
       0.55754581, 0.55863748, 0.47730018, 0.51305815, 0.52449995,
       0.50874266, 0.50665062, 0.51627427, 0.44890362, 0.4688324 ,
       0.52229636, 0.56065039, 0.46657713, 0.4651449 , 0.46823075,
       0.51649416, 0.49475265, 0.44925808, 0.50765783, 0.48376244,
       0.49233456, 0.49040889, 0.51422773, 0.51234926, 0.48375273,
       0.55012716, 0.44118504, 0.47085625, 0.49575477, 0.46674028,
       0.5094721 , 0.44222625, 0.49247741, 0.46084586, 0.49587525,
       0.49701455, 0.44745434, 0.45570542, 0.52423579])

In [49]:
df2['predict']=pd.DataFrame(predictions)

In [50]:
mse=mean_squared_error(df2['score'],df2['predict'])
print("mean_square_error:",mse)

mean_square_error: 0.02818928876422047


In [51]:
mae = mean_absolute_error(df2['score'],df2['predict'])
print("Mean Absolute Error:", mae)


Mean Absolute Error: 0.14187912392873117


In [52]:
df2

Unnamed: 0,Id,tweet,emotion,score,features,predict
0,40786,"[ball, watching, amp, rojod, header, equally, ...",sadness,0.583,"[[-0.43671992, 0.057505384, -0.19090268, -0.25...",0.487847
1,40787,"[pessimist, someone, opportunity, knocks, comp...",sadness,0.188,"[[-0.04478655, 0.30576694, 0.06919098, -0.2915...",0.481783
2,40788,"[season, im, looking, point, #depressing, #roy...",sadness,0.688,"[[-0.3687506, 0.162658, -0.02078579, -0.126300...",0.515028
3,40789,"[stars, shinenyou, know, feelnscent, pine, nyo...",sadness,0.292,"[[-0.2151165, 0.2684997, -0.24530822, 0.244578...",0.488677
4,40790,"[want, watch, netflix, stuck, class, #depressing]",sadness,0.667,"[[0.16816223, -0.09243305, 0.078344375, 0.0814...",0.511523
...,...,...,...,...,...,...
69,40855,"[common, app, randomly, logged, writing, last,...",sadness,0.833,"[[-0.33207858, -0.17362213, -0.4620683, -0.113...",0.495875
70,40856,"[id, rather, laugh, rarest, genius, beautiful,...",sadness,0.688,"[[-0.18101707, 0.03217506, 0.012613542, -0.036...",0.497015
71,40857,"[#invest, new, #film, stop, asking, invest, ne...",sadness,0.458,"[[0.06594525, 0.13042212, 0.05061286, -0.07066...",0.447454
72,40858,"[watched, django, unchained, people, may, frow...",sadness,0.333,"[[0.15952902, 0.13285297, 0.2045169, 0.0801301...",0.455705


In [53]:
df3= pd.read_csv(r"C:\Users\i\Downloads\sadness-ratings-0to1.test.gold.txt", delimiter='\t', header=None)
df3.columns = ['Id', 'tweet', 'emotion', 'score']

In [54]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df3['tweet'] = df3['tweet'].apply(preprocess_tweet)

In [55]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df3['tweet'] = df3['tweet'].apply(remove_mentions)

In [56]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df3['tweet'] = df3['tweet'].apply(clean_text)


In [57]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text
df3['tweet'] = df3['tweet'].apply(convert_emoji)

In [58]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df3['tweet'] = df3['tweet'].apply(tokenize_tweets)

In [59]:
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your', 'so', 'all',
    'about', 'above', 'after', 'again', 'against', 'ain', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
    'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no',
    'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']


def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens
df3['tweet'] = df3['tweet'].apply(remove_stopwords)

In [60]:
import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


def extract_features(tweet):
    
    tokenized_text = ' '.join(tweet)
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states.squeeze(0).numpy()
    
    return features
df3['features'] = df3['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

features3 = df3['features'].tolist()
padded_features3 = pad_sequences(features3, padding='post')
padded_df3 = df3.copy()
padded_df3['features'] = padded_features3.tolist()

In [63]:
X = np.stack(padded_df3['features'])
print('Input feature shape:', X.shape)

Input feature shape: (673, 37, 768)


In [64]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_sequence_length=41
X_=pad_sequences(X,maxlen=max_sequence_length,padding="post",truncating='post')

In [66]:
X_ = np.reshape(X_, (673, 41 * 768))  

In [67]:
saved_model_path = 'regression_model.pkl'
loaded_model = joblib.load(saved_model_path)

In [68]:
predicttest = loaded_model.predict(X_)
predicttest

array([0.46550617, 0.48496906, 0.4413564 , 0.51697883, 0.53039954,
       0.50574699, 0.4649814 , 0.4939384 , 0.47088382, 0.51133294,
       0.55577054, 0.48435694, 0.49318285, 0.46069021, 0.53101276,
       0.4482887 , 0.46273076, 0.49654763, 0.44395651, 0.55308603,
       0.49224111, 0.49849717, 0.48515881, 0.53256576, 0.47714708,
       0.47669385, 0.47740875, 0.47837025, 0.47489953, 0.48593391,
       0.57561818, 0.46628537, 0.47929332, 0.46262995, 0.467198  ,
       0.48732086, 0.49067047, 0.50705313, 0.48771711, 0.49097021,
       0.45419565, 0.4538678 , 0.51537679, 0.51195464, 0.45544081,
       0.53256127, 0.51689947, 0.49481387, 0.46616372, 0.5317275 ,
       0.48900705, 0.48223959, 0.47944081, 0.53620596, 0.49536154,
       0.49845128, 0.49308951, 0.48557988, 0.47463475, 0.50573635,
       0.48139452, 0.44471647, 0.55929044, 0.5142178 , 0.47361906,
       0.55544939, 0.56867547, 0.48858857, 0.4981999 , 0.50127756,
       0.43687922, 0.5166149 , 0.49274696, 0.4222919 , 0.50296

In [69]:
df3['predicttest']=pd.DataFrame(predicttest)

In [70]:
df3=df3.drop(['score','features'],axis=1)

In [71]:
df3

Unnamed: 0,Id,tweet,emotion,predicttest
0,40860,"[teens, sons, left, car, get, haircuts, im, pr...",sadness,0.465506
1,40861,"[teens, sons, left, car, get, haircuts, im, pr...",sadness,0.484969
2,40862,"[hartramseysuplift, youre, still, discouraged,...",sadness,0.441356
3,40863,"[nearly, dropped, phone, sink, hahahaha]",sadness,0.516979
4,40864,"[whenever, im, feeling, sad, listen, monsta, h...",sadness,0.530400
...,...,...,...,...
668,41528,"[candice, constantly, pout, #gbbo]",sadness,0.516822
669,41529,"[#unhappy, #redbus, cc, talked, week, still, d...",sadness,0.486785
670,41530,"[pull, afew, weeks, ago, sadly, theres, game, ...",sadness,0.437649
671,41531,"[im, buying, art, supplies, im, debating, seri...",sadness,0.534854
