In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df1= pd.read_csv("Desktop/emoint tweet/joy-ratings-0to1.train.txt", delimiter='\t', header=None)
df1.columns = ['Id', 'tweet', 'emotion', 'score']

In [3]:
print(df1)


        Id                                              tweet emotion  score
0    30000  Just got back from seeing @GaryDelaney in Burs...     joy  0.980
1    30001  Oh dear an evening of absolute hilarity I don'...     joy  0.958
2    30002  Been waiting all week for this game ❤️❤️❤️ #ch...     joy  0.940
3    30003  @gardiner_love : Thank you so much, Gloria! Yo...     joy  0.938
4    30004  I feel so blessed to work with the family that...     joy  0.938
..     ...                                                ...     ...    ...
818  30818  It's just the lack of company and liveliness o...     joy  0.058
819  30819             Quinn's short hair makes me sad. #glee     joy  0.040
820  30820  hate overthinking e v e r y t h i n g like i j...     joy  0.040
821  30821  People who cheer for sports teams completely o...     joy  0.020
822  30822  @DamnPatriot You're a POS for rejoicing in som...     joy  0.019

[823 rows x 4 columns]


In [4]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import emoji
import re

In [5]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    return tweet
df1['tweet'] = df1['tweet'].apply(preprocess_tweet)

In [6]:
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

df1['tweet'] = df1['tweet'].apply(remove_mentions)


In [7]:
def clean_text(text):
    text = re.sub(r'[^\w\s#@]', '', text)  # Remove all non-word characters except # and @ symbols
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df1['tweet'] = df1['tweet'].apply(clean_text)



In [8]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,30000,just got back from seeing in burslem amazing f...,joy,0.980
1,30001,oh dear an evening of absolute hilarity i dont...,joy,0.958
2,30002,been waiting all week for this game #cheer #fr...,joy,0.940
3,30003,thank you so much gloria youre so sweet and th...,joy,0.938
4,30004,i feel so blessed to work with the family that...,joy,0.938
...,...,...,...,...
818,30818,its just the lack of company and liveliness ou...,joy,0.058
819,30819,quinns short hair makes me sad #glee,joy,0.040
820,30820,hate overthinking e v e r y t h i n g like i j...,joy,0.040
821,30821,people who cheer for sports teams completely o...,joy,0.020


In [9]:
def convert_emoji(tweet):
    text = emoji.demojize(tweet)
    return text

In [10]:
df1['tweet'] = df1['tweet'].apply(convert_emoji)

In [11]:
def tokenize_tweets(tweet):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(tweet)

df1['tweet'] = df1['tweet'].apply(tokenize_tweets)

In [12]:
df1

Unnamed: 0,Id,tweet,emotion,score
0,30000,"[just, got, back, from, seeing, in, burslem, a...",joy,0.980
1,30001,"[oh, dear, an, evening, of, absolute, hilarity...",joy,0.958
2,30002,"[been, waiting, all, week, for, this, game, #c...",joy,0.940
3,30003,"[thank, you, so, much, gloria, youre, so, swee...",joy,0.938
4,30004,"[i, feel, so, blessed, to, work, with, the, fa...",joy,0.938
...,...,...,...,...
818,30818,"[its, just, the, lack, of, company, and, livel...",joy,0.058
819,30819,"[quinns, short, hair, makes, me, sad, #glee]",joy,0.040
820,30820,"[hate, overthinking, e, v, e, r, y, t, h, i, n...",joy,0.040
821,30821,"[people, who, cheer, for, sports, teams, compl...",joy,0.020


In [13]:

# Define a list of stopwords
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is','this','all'
    'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your','so'
]

# Function to remove stopwords and single-letter words from a list of tokens
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
    return filtered_tokens

# Apply the remove_stopwords function to the 'tweet' column
df1['tweet'] = df1['tweet'].apply(remove_stopwords)



In [14]:
df1


Unnamed: 0,Id,tweet,emotion,score
0,30000,"[just, got, back, seeing, burslem, amazing, fa...",joy,0.980
1,30001,"[oh, dear, evening, absolute, hilarity, dont, ...",joy,0.958
2,30002,"[been, waiting, all, week, game, #cheer, #friday]",joy,0.940
3,30003,"[thank, much, gloria, youre, sweet, thoughtful...",joy,0.938
4,30004,"[feel, blessed, work, family, nanny, nothing, ...",joy,0.938
...,...,...,...,...
818,30818,"[just, lack, company, liveliness, out, here, m...",joy,0.058
819,30819,"[quinns, short, hair, makes, me, sad, #glee]",joy,0.040
820,30820,"[hate, overthinking, like, jus, wanna, happy, ...",joy,0.040
821,30821,"[people, who, cheer, sports, teams, completely...",joy,0.020


In [15]:
import torch
from transformers import BertModel, BertTokenizer


# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to extract features from a tokenized tweet
def extract_features(tweet):
    # Convert the list of tokens to a string
    tokenized_text = ' '.join(tweet)
    
    # Tokenize the text using BERT tokenizer
    input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    
    # Pass the input tensor through the BERT model to extract features
    with torch.no_grad():
        outputs = model(input_ids)
    
    # Get the last layer hidden states
    last_hidden_states = outputs.last_hidden_state
    
    # Extract the features from the last hidden states
    features = last_hidden_states.squeeze(0).numpy()
    
    return features

# Apply feature extraction to the 'tweet' column of the DataFrame
df1['features'] = df1['tweet'].apply(extract_features)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
df1

Unnamed: 0,Id,tweet,emotion,score,features
0,30000,"[just, got, back, seeing, burslem, amazing, fa...",joy,0.980,"[[-0.25902194, 0.021936538, 0.5708762, -0.6156..."
1,30001,"[oh, dear, evening, absolute, hilarity, dont, ...",joy,0.958,"[[0.11824173, 0.13200244, 0.048643403, -0.2167..."
2,30002,"[been, waiting, all, week, game, #cheer, #friday]",joy,0.940,"[[-0.10390823, 0.012376993, -0.11558945, 0.009..."
3,30003,"[thank, much, gloria, youre, sweet, thoughtful...",joy,0.938,"[[0.011071219, -0.05531338, 0.16319497, -0.004..."
4,30004,"[feel, blessed, work, family, nanny, nothing, ...",joy,0.938,"[[-0.08090295, 0.37672976, 0.34673566, -0.3677..."
...,...,...,...,...,...
818,30818,"[just, lack, company, liveliness, out, here, m...",joy,0.058,"[[0.2985258, 0.44204718, 0.40646386, -0.412263..."
819,30819,"[quinns, short, hair, makes, me, sad, #glee]",joy,0.040,"[[-0.06435747, -0.11854539, 0.3118093, -0.0855..."
820,30820,"[hate, overthinking, like, jus, wanna, happy, ...",joy,0.040,"[[-0.2011238, 0.43611714, -0.2035487, 0.024425..."
821,30821,"[people, who, cheer, sports, teams, completely...",joy,0.020,"[[0.09072961, 0.124670625, -0.05488845, -0.106..."


In [17]:
df1.features[0].shape

(17, 768)

In [18]:

df1.head(10)

Unnamed: 0,Id,tweet,emotion,score,features
0,30000,"[just, got, back, seeing, burslem, amazing, fa...",joy,0.98,"[[-0.25902194, 0.021936538, 0.5708762, -0.6156..."
1,30001,"[oh, dear, evening, absolute, hilarity, dont, ...",joy,0.958,"[[0.11824173, 0.13200244, 0.048643403, -0.2167..."
2,30002,"[been, waiting, all, week, game, #cheer, #friday]",joy,0.94,"[[-0.10390823, 0.012376993, -0.11558945, 0.009..."
3,30003,"[thank, much, gloria, youre, sweet, thoughtful...",joy,0.938,"[[0.011071219, -0.05531338, 0.16319497, -0.004..."
4,30004,"[feel, blessed, work, family, nanny, nothing, ...",joy,0.938,"[[-0.08090295, 0.37672976, 0.34673566, -0.3677..."
5,30005,"[today, reached, subscribers, yt, #goodday, #t...",joy,0.926,"[[-0.05542418, -0.051494613, 0.16199583, -0.01..."
6,30006,"[good, morning, love, happy, first, day, fall,...",joy,0.924,"[[-0.112404265, -0.09642104, 0.22093175, -0.08..."
7,30007,"[#bridgetjonesbaby, best, thing, ive, seen, ag...",joy,0.922,"[[-0.34465936, 0.017047953, -0.1802155, -0.035..."
8,30008,"[just, got, back, seeing, burslem, amazing, fa...",joy,0.92,"[[0.10194833, 0.3232219, 0.32963663, -0.629902..."
9,30009,"[thought, holidays, could, not, get, any, more...",joy,0.917,"[[-0.08916461, 0.0760634, 0.11607354, -0.01644..."


In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming df1 is your DataFrame
features = df1['features'].tolist()

# Pad the nested arrays
padded_features = pad_sequences(features, padding='post')

# Convert the padded features back to a DataFrame
padded_df = df1.copy()
padded_df['features'] = padded_features.tolist()

In [20]:
X = np.stack(padded_df['features'])

# Print the shape of the input feature array
print('Input feature shape:', X.shape)

Input feature shape: (823, 42, 768)


In [21]:
y = np.array(padded_df['score'])     # Output variable

# Print the shape of the input and output sets
print("Input shape:", X.shape)
print("Output shape:", y.shape)

Input shape: (823, 42, 768)
Output shape: (823,)


In [22]:
X = np.reshape(X, (823, 42 * 768))  # Reshape input to 2D
y = np.reshape(y, (823,))

In [23]:
# import sklearn
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# print(type(df1['tweet']))

In [24]:
# TypeError                                 Traceback (most recent call last)
# <ipython-input-31-c5644810a03e> in <module>
#       1 tweets = df1['tweet'].tolist()
# ----> 2 joined_tweets = ' '.join(tweets)
#       3 
#       4 vectorizer_tfidf = TfidfVectorizer()
#       5 X_tfidf = vectorizer_tfidf.fit_transform([joined_tweets])

# TypeError: sequence item 0: expected str instance, list found
# import numpy as np

# input_data = df1.features  # Replace with your input data
# train_X = np.array(input_data)

In [25]:
# # Check dimensions of X_tfidf and emotions
# print("X_tfidf shape:", X_tfidf.shape)
# print("emotions shape:", emotions.shape)




In [26]:


# train_X = X_tfidf.copy()
     

# train_X.to_numpy()
     


In [27]:
X.shape

(823, 32256)

In [28]:
Y=df1['score'].copy()

     

In [29]:
#cols = ['id', 'tweet', 'emotion', 'score']

In [30]:
# df2 = pd.read_csv("Desktop/emoint tweet/joy-ratings-0to1.dev.gold.txt", header=None, sep='\t', names=cols, index_col=0)

# df2.head()

In [31]:
# def preprocess_tweet(tweet):
#     tweet = tweet.lower()
#     tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
#     return tweet
# df2['tweet'] = df2['tweet'].apply(preprocess_tweet)

In [32]:
# def remove_mentions(tweet):
#     return re.sub(r'@\w+', '', tweet)

# df2['tweet'] = df2['tweet'].apply(remove_mentions)


In [33]:
# def clean_text(text):
#     text = re.sub(r'[^\w\s#@]', '', text)  # Remove all non-word characters except # and @ symbols
#     text = re.sub(r'\d+', '', text)  # Remove numbers
#     text = re.sub(r'\s+', ' ', text).strip()
#     return text

# df2['tweet'] = df2['tweet'].apply(clean_text)


In [34]:
# def convert_emoji(tweet):
#     text = emoji.demojize(tweet)
#     return text

In [35]:
# df2['tweet'] = df2['tweet'].apply(convert_emoji)

In [36]:
# def tokenize_tweets(tweet):
#     tokenizer = TweetTokenizer()
#     return tokenizer.tokenize(tweet)
# df2['tweet'] = df2['tweet'].astype(str)
# df2['tweet'] = df2['tweet'].apply(tokenize_tweets)


In [37]:

# # Define a list of stopwords
# stop_words = [
#     'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is','this','all'
#     'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'i', 'you', 'your','so'
# ]

# # Function to remove stopwords and single-letter words from a list of tokens
# def remove_stopwords(tokens):
#     filtered_tokens = [token for token in tokens if len(token) > 1 and token.lower() not in stop_words]
#     return filtered_tokens

# # Apply the remove_stopwords function to the 'tweet' column
# df2['tweet'] = df2['tweet'].apply(remove_stopwords)



In [38]:
# print(df2)

In [39]:
# import sklearn
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
# vectorizer_tfidf = TfidfVectorizer()

# vectorizer_tfidf.fit(df1)
# X_tfidf = vectorizer_tfidf.fit_transform(df2)
# X_tfidf = pd.DataFrame.sparse.from_spmatrix(X_tfidf)
     

In [41]:
# val_X = X_Dev_tfidf.copy()

In [42]:
# X_Dev_tfidf = vectorizer_tfidf.transform(df2)
# #X_Dev_tfidf = pd.DataFrame.sparse.from_spmatrix(X_Dev_tfidf)

   

In [43]:
#val_X = X_Dev_tfidf.copy()

In [44]:
# val_X.to_numpy()
#      

In [45]:

# val_Y=df1['score'].copy()
# val_Y.to_numpy()
     


In [46]:
# val_Y.to_numpy()
     

In [47]:
# import torch
# from transformers import BertModel, BertTokenizer


# # Load the pre-trained BERT model and tokenizer
# model_name = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertModel.from_pretrained(model_name)

# # Function to extract features from a tokenized tweet
# def extract_features(tweet):
#     # Convert the list of tokens to a string
#     tokenized_text = ' '.join(tweet)
    
#     # Tokenize the text using BERT tokenizer
#     input_ids = torch.tensor(tokenizer.encode(tokenized_text, add_special_tokens=True)).unsqueeze(0)
    
#     # Pass the input tensor through the BERT model to extract features
#     with torch.no_grad():
#         outputs = model(input_ids)
    
#     # Get the last layer hidden states
#     last_hidden_states = outputs.last_hidden_state
    
#     # Extract the features from the last hidden states
#     features = last_hidden_states.squeeze(0).numpy()
    
#     return features

# # Apply feature extraction to the 'tweet' column of the DataFrame
# df2['features'] = df2['tweet'].apply(extract_features)

In [48]:
# df2

In [49]:
import tensorflow as tf

In [50]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
model = Sequential()
model.add(Dense(32256, input_shape=(32256,), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

optimiser = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='mae', optimizer=optimiser ,metrics=["mae"])
model.summary()
     


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32256)             1040481792
                                                                 
 dropout (Dropout)           (None, 32256)             0         
                                                                 
 dense_1 (Dense)             (None, 512)               16515584  
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                32832     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 6

In [51]:
# Convert the lists to strings
df1['tweet'] = df1['tweet'].apply(lambda tokens: ' '.join(tokens))

# Convert DataFrame to NumPy array
#Convert DataFrame column to NumPy array
array = df1['tweet'].values

# Convert NumPy array to TensorFlow tensor
tensor = tf.convert_to_tensor(array)




In [None]:
history = model.fit(X,Y,batch_size=32,epochs=10,shuffle=True,verbose=1)#callbacks=stopping_criterions


Epoch 1/10
