In [1]:
import tensorflow as tf

# Explicitly allow memory growth for GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(physical_devices[0], 'GPU')
        # Allow memory growth to avoid allocating all GPU memory at once
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print("GPU configured successfully")
    except RuntimeError as e:
        print(e)


GPU configured successfully


In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("twitter_sentiment.csv")
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [4]:
len(df)

75681

In [5]:
df=df.drop(['2401','Borderlands'],axis=1)

In [6]:
df.columns=['review','reviewText']
df

Unnamed: 0,review,reviewText
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
75676,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
75677,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
75678,Positive,Today sucked so it’s time to drink wine n play...
75679,Positive,Bought a fraction of Microsoft today. Small wins.


In [7]:
df.isnull().sum()

review          0
reviewText    686
dtype: int64

In [8]:
df.dropna(inplace=True)
len(df)

74995

In [9]:
df = df[df['reviewText'].apply(len)>1]
len(df)

74646

In [10]:
df['review'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [11]:
df['review'].value_counts()

review
Negative      22530
Positive      20843
Neutral       18285
Irrelevant    12988
Name: count, dtype: int64

In [12]:
df=df[df['review']!='Irrelevant']
df=df[df['review']!='Neutral']

In [13]:
len(df),df['review'].value_counts()

(43373,
 review
 Negative    22530
 Positive    20843
 Name: count, dtype: int64)

In [14]:
df=pd.get_dummies(df, columns=['review'])
df

Unnamed: 0,reviewText,review_Negative,review_Positive
0,I am coming to the borders and I will kill you...,False,True
1,im getting on borderlands and i will kill you ...,False,True
2,im coming on borderlands and i will murder you...,False,True
3,im getting on borderlands 2 and i will murder ...,False,True
4,im getting into borderlands and i can murder y...,False,True
...,...,...,...
75673,guess i'll broke.,False,True
75674,Please explain how this is possible! How can t...,True,False
75675,Good on Sony. As much as I want to see the new...,False,True
75678,Today sucked so it’s time to drink wine n play...,False,True


In [15]:
#pos=0,neg=1
df=df.drop(['review_Positive'],axis=1)
df['review_Negative']= df['review_Negative'].astype(int)
df

Unnamed: 0,reviewText,review_Negative
0,I am coming to the borders and I will kill you...,0
1,im getting on borderlands and i will kill you ...,0
2,im coming on borderlands and i will murder you...,0
3,im getting on borderlands 2 and i will murder ...,0
4,im getting into borderlands and i can murder y...,0
...,...,...
75673,guess i'll broke.,0
75674,Please explain how this is possible! How can t...,1
75675,Good on Sony. As much as I want to see the new...,0
75678,Today sucked so it’s time to drink wine n play...,0


In [16]:
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [17]:
## Lower All the cases
df['reviewText']=df['reviewText'].str.lower()
## Removing special characters
df['reviewText']=df['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',str(x)))
## Remove the stopswords
df['reviewText']=df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
## Remove url 
df['reviewText']=df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
## Remove html tags
df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
## Remove any additional spaces
df['reviewText']=df['reviewText'].apply(lambda x: " ".join(x.split()))

  df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


In [18]:
df

Unnamed: 0,reviewText,review_Negative
0,coming borders kill,0
1,im getting borderlands kill,0
2,im coming borderlands murder,0
3,im getting borderlands 2 murder,0
4,im getting borderlands murder,0
...,...,...
75673,guess ill broke,0
75674,please explain possible let companies overchar...,1
75675,good sony much want see new ps5 whats going ri...,0
75678,today sucked time drink wine n play borderland...,0


In [19]:
from nltk.stem import WordNetLemmatizer
lm=WordNetLemmatizer()

In [20]:
def lemmatize_words(text):
    return " ".join([lm.lemmatize(word,pos='v') for word in text.split()])

In [21]:
df['reviewText']=df['reviewText'].apply(lambda x:lemmatize_words(x))
df

Unnamed: 0,reviewText,review_Negative
0,come border kill,0
1,im get borderlands kill,0
2,im come borderlands murder,0
3,im get borderlands 2 murder,0
4,im get borderlands murder,0
...,...,...
75673,guess ill break,0
75674,please explain possible let company overcharge...,1
75675,good sony much want see new ps5 whats go right...,0
75678,today suck time drink wine n play borderlands ...,0


In [22]:
y=df.drop(['reviewText'],axis=1)
y

Unnamed: 0,review_Negative
0,0
1,0
2,0
3,0
4,0
...,...
75673,0
75674,1
75675,0
75678,0


In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['reviewText'],y,
                                              test_size=0.20)

In [24]:
len(X_train)

34698

In [25]:
import gensim
from gensim.models import KeyedVectors,Word2Vec
import gensim.downloader as api
wv=api.load("word2vec-google-news-300")

In [26]:
def preprocess(sentences, wv):
    vectors = []
    valid_indices = []
    for i, sentence in enumerate(sentences):
        tokens = sentence.split()  # Simple tokenization
        sentence_vector = []
        for token in tokens:
            if token in wv:
                sentence_vector.append(wv[token])
        if sentence_vector:  # Avoid empty vectors
            vectors.append(np.mean(sentence_vector, axis=0))
            valid_indices.append(i)
    return np.array(vectors, dtype='float32'), valid_indices

In [27]:
X_train_vectors, train_valid_indices  = preprocess(X_train.tolist(), wv)
X_test_vectors , test_valid_indices = preprocess(X_test.tolist(), wv)

In [28]:
X_train_vectors[0]

array([-8.33333358e-02, -9.02709961e-02,  1.49617508e-01,  3.75162773e-02,
       -1.34663895e-01,  1.51875811e-02,  6.13199882e-02, -1.50960281e-01,
        1.64698914e-01, -4.84212255e-03, -1.62726089e-01, -1.72648117e-01,
       -8.69038925e-02,  1.26139326e-02, -1.30452469e-01,  5.98449707e-02,
        8.13447237e-02, -6.98242188e-02, -1.76920578e-01, -7.20926896e-02,
        1.49739578e-01,  6.14420557e-03,  1.32232666e-01,  1.09659828e-01,
        4.98046875e-02, -8.08715820e-03, -8.94571915e-02,  6.46565780e-02,
        2.03682575e-02, -9.15527344e-02, -3.39762382e-02, -1.09232582e-01,
       -4.80041504e-02, -9.64355469e-02, -7.97640458e-02, -6.26042709e-02,
        1.17284141e-01,  2.11181641e-02,  3.98203544e-02,  5.45857735e-02,
       -1.85953770e-02, -9.82259139e-02,  6.31713867e-02, -2.44547520e-02,
       -2.07519531e-02, -4.71343994e-02,  2.99682617e-02,  9.53776017e-02,
       -3.87484245e-02, -6.37207031e-02,  1.40787764e-02, -4.69563790e-02,
       -4.46980782e-02, -

In [53]:
len(X_train_vectors),len(X_test_vectors)

(33925, 8500)

In [59]:
len(Y_train)

33925

In [30]:
len(X_train),len(X_test)

(34698, 8675)

In [54]:
len(Y_test)

8500

In [52]:
len(X_test_padded)

8675

In [56]:
len(X_test_seq)

8675

In [31]:
len(y_train),len(y_test)

(34698, 8675)

In [35]:
Y_train = y_train['review_Negative'].tolist()
Y_test = y_test['review_Negative'].tolist()
Y_train = [Y_train[i] for i in train_valid_indices]
Y_test = [Y_test[i] for i in test_valid_indices]

In [36]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB().fit(X_train_vectors,Y_train)

In [37]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [38]:
y_pred=model.predict(X_test_vectors)

In [39]:
print("accuracy: ",accuracy_score(Y_test,y_pred))

accuracy:  0.7112941176470589


In [40]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_vectors, Y_train)


In [41]:
y_pred_rf= rf_classifier.predict(X_test_vectors)
accuracy = accuracy_score(Y_test, y_pred_rf)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8985882352941177


In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = 50
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')


In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

vocab_size = 5000
embedding_dim = 64

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(32),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_padded, y_train, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2288f1539a0>

In [46]:
loss, accuracy = model.evaluate(X_test_padded,y_test)
print(f'Test Accuracy: {accuracy}')

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

In [45]:
# Build GRU model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

maxlen=50
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=maxlen))
model.add(GRU(128, return_sequences=True))
model.add(GRU(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=15, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Accuracy: 0.92


In [66]:
# Assuming your model expects sequences of length 50
max_sequence_length = 50  # Update this to the model's expected sequence length

# Tokenize and pad the input
input_text = [" you are a good human being"]
input_seq = tokenizer.texts_to_sequences(input_text)
input_padded = pad_sequences(input_seq, maxlen=max_sequence_length, padding='post')

# Now, make the prediction
res = model.predict(input_padded)




In [67]:
res

array([[0.00263715]], dtype=float32)

In [68]:
if res <=0.5:
    print("positive")
else:
    print("negative")

positive
