# Tweet Generation using NLP

## Importing necessary libraries

In [25]:
import pandas as pd
import re
import csv
!pip install keras
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Dataset Reading

In [27]:
# Opening Sentiment140 dataset in text mode to handle encoding errors
with open('Sentiment140.csv', 'rt',encoding='ISO-8859-1') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')

    # Skipping the problematic lines and creating a new list of valid rows
    rows = []
    for i, row in enumerate(reader):
        if len(row) == 6:
            rows.append(row)
        else:
            print(f'Skipping row {i}: {row}')

df = pd.DataFrame(rows, columns=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head()



Skipping row 14064: ['0', '1676065473', 'Fri May 01 21:00:36 PDT 2009', 'NO_QUERY', 'trslovenk I\'ll live the rest of my life driving a mini van "']
Skipping row 35219: ['0', 'ock Band "']
Skipping row 42266: ['0', '1685408188', 'Sun May 03 00:15:39 PDT 2009', 'NO_QUERY', 'ParsnipP', 'I fell asleep in my carry case... but woke up right where I started!! I thought I was going on 1557216761"', 'Sun Apr 19 02:42:44 PDT 2009', 'NO_QUERY', 'fullidoclaire', '242AM just got home. in biggggggg trouble from mommy ']
Skipping row 56405: ['0', '16875642from']
Skipping row 63512: ['0', '1565164613', 'Mon Apr 20 06:04:17 PDT 2009', 'NO_QUERY', 'jeffreytgilbert', '@WSJ noooooooooooooooooooooooooooooooooooooooooooo66', 'Sun May 03 09:01:42 PDT 2009', 'NO_QUERY', 'bentriderro', '@zoeart we run out of chalk. Sorry ']
Skipping row 84696: ['0', '1750957377', 'Sat May 09 18:19:39 PDT 2009', 'NO_QUERY', 'EddieG,Tweetdeck isn\'t working. "']
Skipping row 91746: ['0', '1678144191', 'Sat May 02 05:40:36 PDT 2

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Considering any one user - ElleCTF

In [28]:
df.loc[df['user']=="ElleCTF"]

Unnamed: 0,target,id,date,flag,user,text
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
177172,0,1833373892,Sun May 17 23:05:15 PDT 2009,NO_QUERY,ElleCTF,Killlll me... I can't believe I have the flu
247115,0,1833373892,Sun May 17 23:05:15 PDT 2009,NO_QUERY,ElleCTF,Killlll me... I can't believe I have the flu
637092,0,2044263917,Fri Jun 05 09:06:19 PDT 2009,NO_QUERY,ElleCTF,"@kennypistol it hates me today, first it would..."
646904,0,2045682914,Fri Jun 05 11:05:01 PDT 2009,NO_QUERY,ElleCTF,"On the jurassic park ride, think its gonna rai..."
647293,0,2045915728,Fri Jun 05 11:25:34 PDT 2009,NO_QUERY,ElleCTF,Alright the rides have been suspended bc of th...
714166,0,2044263917,Fri Jun 05 09:06:19 PDT 2009,NO_QUERY,ElleCTF,"@kennypistol it hates me today, first it would..."
724043,0,2045682914,Fri Jun 05 11:05:01 PDT 2009,NO_QUERY,ElleCTF,"On the jurassic park ride, think its gonna rai..."
724432,0,2045915728,Fri Jun 05 11:25:34 PDT 2009,NO_QUERY,ElleCTF,Alright the rides have been suspended bc of th...
843258,0,2071542055,Sun Jun 07 19:33:54 PDT 2009,NO_QUERY,ElleCTF,"Vacation in 2 weeks, Ill be gone for 10 days. ..."


## Data Cleaning

In [29]:
# Dropping any rows with empty content
df.dropna(subset=['text'], inplace=True)

# Dropping duplicate entries
df.drop_duplicates(subset=['text'], inplace=True)

# Resetting index of the dataframe
df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Data Preprocessing

In [30]:
# Filtering tweets by a particular user
target_user = "ElleCTF"
user_tweets = df[df["user"] == target_user]["text"].values
print("User tweets=", user_tweets)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
stemmer=PorterStemmer()

def preprocess_tweet(tweet):
    # Removing URLs
    tweet = re.sub(r"http\S+", "", tweet)

    # Removing mentions
    tweet = re.sub(r"@[^\s]+", "", tweet)

    # Removing hashtags
    tweet = re.sub(r"#", "", tweet)

    # Tokenizing tweet
    tokens = word_tokenize(tweet.lower())

    # Removing stop words and punctuation
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]

    # Lemmatizing tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Stemming tokens
    tokens = [stemmer.stem(token) for token in tokens]

    # Joining tokens to form tweet
    tweet = " ".join(tokens)

    return tweet

# Apply preprocessing to tweets
preprocessed_tweets = [preprocess_tweet(tweet) for tweet in user_tweets]

preprocessed_tweets

User tweets= ['my whole body feels itchy and like its on fire '
 "Killlll me... I can't believe I have the flu "
 "@kennypistol it hates me today, first it wouldn't verify my pass, then the mummy is closed, and now wolverine isn't out here  and its hot"
 'On the jurassic park ride, think its gonna rain. Dnw to be on the edge  second row. Fml I just started to dry off'
 'Alright the rides have been suspended bc of the rain. I am sad '
 'Vacation in 2 weeks, Ill be gone for 10 days. Thats the longest me and megan will have gone without seeing each other since 8th grade '
 "@passionshaker seriously? I never got it  I'm sorry, my computer/phone is an asswhore I'm not mad though, takes a lot to piss me off"
 '@meganctf well, technically Its not a TRUE job. I do it because my corner got shut down  bad times and all'
 '@hurricanehalvo omg wait is it still alive!? im scared for you  lol you are what you have, thats what I tell people :]'
 '@meganctf GET ME SOME CREEPER POSTCARDS!!! Write to me

['whole bodi feel itchi like fire',
 'killlll ca believ flu',
 'hate today first would verifi pa mummi close wolverin hot',
 'jurass park ride think gon na rain dnw edg second row fml start dri',
 'alright ride suspend bc rain sad',
 'vacat week ill gone day that longest megan gone without see sinc grade',
 'serious never got sorri asswhor mad though take lot piss',
 'well technic true job corner got shut bad time',
 'omg wait still aliv im scare lol that tell peopl',
 'get creeper postcard write imi alreadi',
 'cameron would come lake send postcard make hot like fat guy thong someth',
 'clue got problem bite itch till bleed noth help one know wrong']

## Sequence Tokenization

In [19]:
from keras.preprocessing.text import Tokenizer
import tensorflow

tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_tweets)
sequences = tokenizer.texts_to_sequences(preprocessed_tweets)
print("tokenizer.word_index=",tokenizer.word_index)
# Padding sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = max([len(seq) for seq in sequences])
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding="pre")

vocab_size = len(tokenizer.word_index) + 1

# Model
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_sequence_length-1))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

# Training the model
from keras.utils import to_categorical

X = sequences_padded[:, :-1]
y = to_categorical(sequences_padded[:, -1], num_classes=vocab_size)
model.fit(X, y, epochs=100, verbose=2)

model.save('model.h5')

tokenizer.eord_index= {'got': 1, 'like': 2, 'would': 3, 'hot': 4, 'ride': 5, 'rain': 6, 'gone': 7, 'that': 8, 'postcard': 9, 'whole': 10, 'bodi': 11, 'feel': 12, 'itchi': 13, 'fire': 14, 'killlll': 15, 'ca': 16, 'believ': 17, 'flu': 18, 'hate': 19, 'today': 20, 'first': 21, 'verifi': 22, 'pa': 23, 'mummi': 24, 'close': 25, 'wolverin': 26, 'jurass': 27, 'park': 28, 'think': 29, 'gon': 30, 'na': 31, 'dnw': 32, 'edg': 33, 'second': 34, 'row': 35, 'fml': 36, 'start': 37, 'dri': 38, 'alright': 39, 'suspend': 40, 'bc': 41, 'sad': 42, 'vacat': 43, 'week': 44, 'ill': 45, 'day': 46, 'longest': 47, 'megan': 48, 'without': 49, 'see': 50, 'sinc': 51, 'grade': 52, 'serious': 53, 'never': 54, 'sorri': 55, 'asswhor': 56, 'mad': 57, 'though': 58, 'take': 59, 'lot': 60, 'piss': 61, 'well': 62, 'technic': 63, 'true': 64, 'job': 65, 'corner': 66, 'shut': 67, 'bad': 68, 'time': 69, 'omg': 70, 'wait': 71, 'still': 72, 'aliv': 73, 'im': 74, 'scare': 75, 'lol': 76, 'tell': 77, 'peopl': 78, 'get': 79, 'creepe

## Function for tweet generation

In [20]:
import numpy as np

def generate_tweet(model, seed_text, max_sequence_length, tokenizer, temperature):
    # Converting seed text to sequence
    seed_seq = tokenizer.texts_to_sequences([seed_text])[0]
    print("seed_seq=",seed_seq)
    # Padding sequence
    seed_seq = pad_sequences([seed_seq], maxlen=max_sequence_length-1, padding="pre")
    print("seed_seq after padding=",seed_seq)
    # Initializing generated tweet with seed text
    generated_tweet = seed_text
    
    # Generating next words using model and seed sequence
    for i in range(max_sequence_length-len(seed_seq)):
        preds = model.predict(seed_seq, verbose=0)[0]
        print("preds=", preds)
        # Applying temperature to adjust randomness
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        
        # Sample from probability distribution to get next word
        next_word_idx = np.random.choice(len(preds), p=preds)
        while next_word_idx+1 not in tokenizer.index_word:
          next_word_idx = np.random.choice(len(preds), p=preds)
        next_word = tokenizer.index_word[next_word_idx+1]
        print("next_word=",next_word)
        # Adding next word to generated tweet
        generated_tweet += " " + next_word
        print("generated_tweet=",generated_tweet)
        # Updating seed sequence with next word
        seed_seq = np.append(seed_seq[:, 1:], [[next_word_idx]], axis=1)
        print("seed_seq after update=",seed_seq)
    return generated_tweet


## Model Path

In [21]:
import os

model_filename = "model.h5"
model_path = os.path.abspath(model_filename)

print("Actual file path of pre-trained model: ", model_path)


Actual file path of pre-trained model:  /content/model.h5


## Loading the model

In [22]:
from keras.models import load_model

model = load_model(model_path)


In [23]:
df.loc[df['user']=="ElleCTF"]

Unnamed: 0,target,id,date,flag,user,text
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
119737,0,1833373892,Sun May 17 23:05:15 PDT 2009,NO_QUERY,ElleCTF,Killlll me... I can't believe I have the flu
352668,0,2044263917,Fri Jun 05 09:06:19 PDT 2009,NO_QUERY,ElleCTF,"@kennypistol it hates me today, first it would..."
355489,0,2045682914,Fri Jun 05 11:05:01 PDT 2009,NO_QUERY,ElleCTF,"On the jurassic park ride, think its gonna rai..."
355875,0,2045915728,Fri Jun 05 11:25:34 PDT 2009,NO_QUERY,ElleCTF,Alright the rides have been suspended bc of th...
452296,0,2071542055,Sun Jun 07 19:33:54 PDT 2009,NO_QUERY,ElleCTF,"Vacation in 2 weeks, Ill be gone for 10 days. ..."
648198,0,2240196468,Fri Jun 19 09:27:21 PDT 2009,NO_QUERY,ElleCTF,@passionshaker seriously? I never got it I'm ...
655834,0,2244206853,Fri Jun 19 14:26:37 PDT 2009,NO_QUERY,ElleCTF,"@meganctf well, technically Its not a TRUE job..."
675069,0,2250179207,Fri Jun 19 23:53:26 PDT 2009,NO_QUERY,ElleCTF,@hurricanehalvo omg wait is it still alive!? i...
739100,0,2284143377,Mon Jun 22 13:31:04 PDT 2009,NO_QUERY,ElleCTF,@meganctf GET ME SOME CREEPER POSTCARDS!!! Wri...


## Tweet Generation

In [24]:
# Seed text is usually taken as the phrase with which the user may want to start the tweet
seed_text = "my whole body"
new_tweet = generate_tweet(model, seed_text, max_sequence_length, tokenizer, temperature=0.4)

print("Generated tweet: ", new_tweet)


seed_seq= [10]
seed_seq after padding= [[ 0  0  0  0  0  0  0  0  0  0  0  0 10]]
preds= [2.07121434e-06 2.08825168e-06 4.20590368e-06 2.09294899e-06
 2.10685539e-03 6.67342510e-06 2.79036317e-06 2.20727452e-06
 4.94228061e-06 3.23786867e-06 6.18508466e-06 9.35447770e-06
 3.10998462e-06 7.60627131e-07 2.03485802e-01 3.21434163e-06
 3.88256058e-06 4.82291944e-06 6.59844995e-01 3.22487313e-06
 1.87293415e-06 7.74111686e-06 6.78608421e-06 7.98308429e-06
 1.03013394e-06 1.35527225e-05 1.20625668e-06 8.95101311e-06
 5.07185587e-06 9.42569659e-06 3.36821563e-06 5.65395624e-07
 2.30673595e-06 2.27284545e-06 3.48689605e-06 4.75991237e-06
 1.76426602e-06 9.88577358e-06 1.61584734e-03 2.19945355e-06
 2.16758144e-06 1.54722568e-06 5.43265119e-02 3.53840710e-06
 2.20612992e-06 1.41127407e-06 1.09999219e-05 3.89384832e-06
 8.77189359e-06 2.59747117e-06 8.11851078e-06 2.65352310e-06
 9.73235583e-05 1.66160487e-06 2.92146001e-06 2.48328683e-06
 5.90394302e-06 1.49628818e-06 7.71351188e-06 4.25675171e