In [1]:
!pip install tensorflow




In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  # Correct import for lemmatizer
import re
import pickle



In [3]:
def load_data(filepath):
    columns = ['target', 'id', 'date', 'flag', 'user','text']
    data = pd.read_csv(filepath, encoding = "latin-1", names=columns)
    return data

In [4]:
df = load_data(r"C:\Users\Vikas\OneDrive\Desktop\training.1600000.processed.noemoticon.csv")

In [5]:
df

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [7]:
df['text'][5]

'@Kwesidei not the whole crew '

In [8]:
df['text'][7]

"@LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?"

In [9]:
df['tweet']=df['text'].str.lower()

In [10]:
df['tweet'][7]

"@loltrish hey  long time no see! yes.. rains a bit ,only a bit  lol , i'm fine thanks , how's you ?"

In [11]:
df['tweet']

0          @switchfoot http://twitpic.com/2y1zl - awww, t...
1          is upset that he can't update his facebook by ...
2          @kenichan i dived many times for the ball. man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1599995    just woke up. having no school is the best fee...
1599996    thewdb.com - very cool to hear old walt interv...
1599997    are you ready for your mojo makeover? ask me f...
1599998    happy 38th birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @thenspcc @sparkscharity...
Name: tweet, Length: 1600000, dtype: object

In [12]:
df['tweet'] = df['tweet'].apply(lambda x:re.sub(r'[^a-zA-Z\s]',"",x))

In [13]:
df['tweet']

0          switchfoot httptwitpiccomyzl  awww thats a bum...
1          is upset that he cant update his facebook by t...
2          kenichan i dived many times for the ball manag...
3            my whole body feels itchy and like its on fire 
4          nationwideclass no its not behaving at all im ...
                                 ...                        
1599995    just woke up having no school is the best feel...
1599996    thewdbcom  very cool to hear old walt intervie...
1599997    are you ready for your mojo makeover ask me fo...
1599998    happy th birthday to my boo of alll time tupac...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: tweet, Length: 1600000, dtype: object

In [14]:
df['tweet'][5]

'kwesidei not the whole crew '

In [15]:
df['tweet'][7]

'loltrish hey  long time no see yes rains a bit only a bit  lol  im fine thanks  hows you '

In [16]:
df["tweet_tokens"] = df['tweet'].apply(lambda x:x.split())

In [17]:
df["tweet_tokens"][7]

['loltrish',
 'hey',
 'long',
 'time',
 'no',
 'see',
 'yes',
 'rains',
 'a',
 'bit',
 'only',
 'a',
 'bit',
 'lol',
 'im',
 'fine',
 'thanks',
 'hows',
 'you']

In [18]:
lemma = WordNetLemmatizer()

In [19]:

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
stop_words = set(stopwords.words('english'))

In [21]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [22]:
lemma = WordNetLemmatizer()


In [23]:
stop_words = set(stopwords.words('english'))

In [24]:
len(stop_words)

179

In [25]:
df['tweet_refine'] = df['tweet_tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [26]:
df['tweet_refine']


0          [switchfoot, httptwitpiccomyzl, awww, thats, b...
1          [upset, cant, update, facebook, texting, might...
2          [kenichan, dived, many, times, ball, managed, ...
3                    [whole, body, feels, itchy, like, fire]
4            [nationwideclass, behaving, im, mad, cant, see]
                                 ...                        
1599995                  [woke, school, best, feeling, ever]
1599996    [thewdbcom, cool, hear, old, walt, interviews,...
1599997                [ready, mojo, makeover, ask, details]
1599998    [happy, th, birthday, boo, alll, time, tupac, ...
1599999    [happy, charitytuesday, thenspcc, sparkscharit...
Name: tweet_refine, Length: 1600000, dtype: object

In [27]:

nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
df['tweet_refine'] = df['tweet_refine'].apply(lambda x: [lemma.lemmatize(word) for word in x])

In [29]:
from nltk.stem import PorterStemmer


In [30]:
stem = PorterStemmer()

In [31]:
df['tweet_refine'] = df['tweet_refine'].apply(lambda x: [stem.stem(word) for word in x])

In [32]:
df['tweet_refine'][5634]

['cyclesoci',
 'comment',
 'neg',
 'stori',
 'httptinyurlcomcgqajm',
 'uk',
 'societi',
 'seemingli',
 'doesnt',
 'want',
 'slow']

In [33]:
X = df["tweet_refine"]
y = df["target"]

In [37]:
len(X[2452])

12

In [38]:
tokenizer = Tokenizer(num_words=100000, oov_token= "<OOV>")

In [44]:
tokenizer.fit_on_texts(X)

In [45]:
X_tokenized = tokenizer.texts_to_sequences(X)

In [46]:
X_tokenized[2452]

[728, 1, 810, 218, 2073, 913, 34, 510, 1081, 41, 29, 60896]

In [47]:
X[2452]

['ahhh',
 'drafthous',
 'surpris',
 'world',
 'premier',
 'screen',
 'new',
 'star',
 'trek',
 'last',
 'night',
 'torchi']

In [49]:
X_padded = pad_sequences(X_tokenized, maxlen=50)

In [50]:
len(X_padded[534])

50

In [51]:
(X_padded)

array([[    0,     0,     0, ...,  7428,  1649,     5],
       [    0,     0,     0, ...,    12,   189,  1062],
       [    0,     0,     0, ...,   354,     3,  2788],
       ...,
       [    0,     0,     0, ...,  6320,   295,  1513],
       [    0,     0,     0, ..., 10283, 88186, 70438],
       [    0,     0,     0, ...,     1,     1,     1]])

In [54]:
y.unique()

array([0, 4], dtype=int64)

In [62]:
#Build LSTM model
def build_lstm_model(vocab_size, embedding_dim=100, max_len=50):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        LSTM(128, return_sequences=True),
        LSTM(64),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [63]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=5, batch_size=64):
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size = batch_size,
        validation_data=(X_val, y_val),
        verbose=1
    )
    return history

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [65]:
vocab_size = len(tokenizer.word_index) + 1

In [66]:
vocab_size

725483

In [67]:
model = build_lstm_model(vocab_size)



In [68]:
history = train_model(model, X_train, y_train, X_val, y_val)     

Epoch 1/5
[1m16000/16000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263346s[0m 16s/step - accuracy: 2.2037e-04 - loss: -56313.1992 - val_accuracy: 0.0000e+00 - val_loss: -423878.3438
Epoch 2/5
[1m16000/16000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15233s[0m 952ms/step - accuracy: 0.0000e+00 - loss: -654134.7500 - val_accuracy: 0.0000e+00 - val_loss: -1498344.3750
Epoch 3/5
[1m16000/16000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53101s[0m 3s/step - accuracy: 0.0000e+00 - loss: -1901366.5000 - val_accuracy: 0.0000e+00 - val_loss: -3195511.0000
Epoch 4/5
[1m16000/16000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29290s[0m 2s/step - accuracy: 0.0000e+00 - loss: -3766579.2500 - val_accuracy: 0.0000e+00 - val_loss: -5515876.5000
Epoch 5/5
[1m16000/16000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38626s[0m 2s/step - accuracy: 0.0000e+00 - loss: -6265702.5000 - val_accuracy: 0.0000e+00 - val_loss: -8447499.0000


In [70]:
model.save('sentiment_model.keras')  

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)