In [34]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [35]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Madhusowmya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [37]:
dataset = pd.read_csv("training.1600000.processed.noemoticon.csv" , encoding= 'ISO-8859-1')

In [38]:
dataset.head()

Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [39]:
col_names = ['target' , 'id' , 'date' , 'flag' , 'user' , 'text']
dataset.columns = col_names

In [40]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [41]:
dataset.shape

(1048572, 6)

In [42]:
#checking for missing values
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [43]:
# Distribution of tweets
dataset['target'].value_counts()

target
0    799996
4    248576
Name: count, dtype: int64

In [44]:
# Converting 0 to -ve and 4 to +ve
dataset['target'] = dataset['target'].map({0:0 , 4:1})

In [45]:
dataset['target'].value_counts()

target
0    799996
1    248576
Name: count, dtype: int64

In [46]:
# Stemming

stremmer = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content) # removing not a-z and A-Z
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [47]:
dataset['text'] = dataset['text'].apply(stemming)

In [48]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,kwesidei whole crew


In [49]:
x = dataset['text']
y = dataset['target']

In [50]:
# splitting the dataset
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)

In [51]:
# convert textual data to numerical data
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [52]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6198292 stored elements and shape (838857, 328138)>
  Coords	Values
  (0, 38891)	0.400775143648158
  (0, 14592)	0.36261851892418073
  (0, 310440)	0.23944674945165806
  (0, 213360)	0.33741977246344723
  (0, 106715)	0.1976367684924913
  (0, 78032)	0.39706301209607425
  (0, 238401)	0.5830788261117192
  (1, 310440)	0.16815634344198596
  (1, 234536)	0.48130448708128454
  (1, 317696)	0.20514010986264616
  (1, 271849)	0.24304190125482247
  (1, 94523)	0.43038800147556905
  (1, 122944)	0.4035858443173664
  (1, 198441)	0.2870371557279857
  (1, 8855)	0.2240883171081635
  (1, 68179)	0.2864104053098128
  (1, 224365)	0.27598082539483954
  (2, 308903)	0.3260091681570522
  (2, 215352)	0.4044438546293875
  (2, 277744)	0.3822383090103777
  (2, 35126)	0.39464818697445025
  (2, 121997)	0.19090014442691997
  (2, 291621)	0.2875143207107189
  (2, 55892)	0.20543830173559866
  (2, 277759)	0.5166987795580361
  :	:
  (838853, 37072)	0.3073996032889281

In [53]:
# Training the model
model = LogisticRegression()
model.fit(x_train , y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
# Testing the model
y_pred = model.predict(x_test)
print(accuracy_score(y_test , y_pred))

0.8331831294852537


In [55]:
# Function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z
    text = text.lower()
    text = text.split() 
    text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)   
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"

In [56]:
# Testing the model
print(predict_sentiment("I hate you"))
print(predict_sentiment("I love you"))

Negative
Positive


In [57]:
# Save the model
import pickle
pickle.dump(model , open('model.pkl' , 'wb'))

In [58]:
pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load dataset
dataset = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='ISO-8859-1')
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset.columns = col_names

# Keep only relevant columns
dataset = dataset[['text', 'target']]

# Map target values (0 = Negative, 4 = Positive)
dataset['target'] = dataset['target'].map({0: 0, 4: 1})

# Text preprocessing with stemming
stemmer = PorterStemmer()

def stemming(content):
    content = re.sub('[^a-zA-Z]', ' ', content)
    content = content.lower()
    content = content.split()
    content = [stemmer.stem(word) for word in content if word not in stop_words]
    return ' '.join(content)

dataset['text'] = dataset['text'].apply(stemming)

# Split data
x = dataset['text']
y = dataset['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Vectorization
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Dictionary to store models and accuracies
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}

accuracies = {}

# Train and evaluate each model
for name, clf in models.items():
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Identify best model
best_model_name = max(accuracies, key=accuracies.get)
print(f"\nBest Model: {best_model_name} with Accuracy: {accuracies[best_model_name]:.4f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Madhusowmya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Logistic Regression Accuracy: 0.8337
Naive Bayes Accuracy: 0.7807
Linear SVM Accuracy: 0.8296


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load and preprocess data
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='ISO-8859-1', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]
df['target'] = df['target'].map({0: 0, 4: 1})

# Text cleaning function
stemmer = PorterStemmer()
def clean(text):
    text = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join([stemmer.stem(word) for word in text if word not in stop_words])
df['text'] = df['text'].apply(clean)

# Reduce dataset for speed
df = df.sample(100000, random_state=42)

x = df['text']
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=100)
x_test = pad_sequences(x_test, maxlen=100)

# Model configurations
dl_models = {}
dl_accuracies = {}

# Callback to stop training when validation accuracy stops improving
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

def compile_and_train(model, name):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=100, batch_size=64, validation_data=(x_test, y_test), verbose=1, callbacks=[early_stopping])
    acc = model.evaluate(x_test, y_test, verbose=0)[1]
    dl_models[name] = model
    dl_accuracies[name] = acc

# Model 1: Simple LSTM
model1 = Sequential([
    Embedding(5000, 128, input_length=100),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
compile_and_train(model1, "Simple_LSTM")

# Model 2: GRU
model2 = Sequential([
    Embedding(5000, 128, input_length=100),
    GRU(64),
    Dense(1, activation='sigmoid')
])
compile_and_train(model2, "GRU")

# Model 3: Bidirectional LSTM
model3 = Sequential([
    Embedding(5000, 128, input_length=100),
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid')
])
compile_and_train(model3, "BiLSTM")

# Model 4: CNN
model4 = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid')
])
compile_and_train(model4, "CNN")

# Model 5: CNN + LSTM
model5 = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
compile_and_train(model5, "CNN_LSTM")

# Find best model
best_dl_model_name = max(dl_accuracies, key=dl_accuracies.get)
best_dl_model = dl_models[best_dl_model_name]

# Save best model and tokenizer
best_dl_model.save("best_dl_model.h5")
with open("dl_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

best_dl_model_name, dl_accuracies[best_dl_model_name]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Madhusowmya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='ISO-8859-1', header=None)


Epoch 1/100




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 20ms/step - accuracy: 0.6340 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.6340 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.6374 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 34ms/step - accuracy: 0.6337 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 21ms/step - accuracy: 0.6348 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.6382 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 1/100




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 25ms/step - accuracy: 0.6332 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 35ms/step - accuracy: 0.6369 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 35ms/step - accuracy: 0.6381 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 34ms/step - accuracy: 0.6326 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 34ms/step - accuracy: 0.6355 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 34ms/step - accuracy: 0.6329 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 1/100




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 45ms/step - accuracy: 0.6359 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 45ms/step - accuracy: 0.6337 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 43ms/step - accuracy: 0.6354 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 43ms/step - accuracy: 0.6359 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 43ms/step - accuracy: 0.6332 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 42ms/step - accuracy: 0.6382 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 1/100




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - accuracy: 0.6332 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.6347 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.6307 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.6370 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.6336 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.6370 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 1/100




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 22ms/step - accuracy: 0.6336 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.6396 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step - accuracy: 0.6363 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 15ms/step - accuracy: 0.6366 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.6352 - loss: nan - val_accuracy: 0.6337 - val_loss: nan
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 18ms/step - accuracy: 0.6395 - loss: nan - val_accuracy: 0.6337 - val_loss: nan




('Simple_LSTM', 0.633650004863739)

In [None]:
import re
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load stopwords once to avoid calling stopwords.words repeatedly
stop_words = set(stopwords.words('english'))

# Load your trained deep learning model
dl_model = load_model('best_dl_model.h5')

# Assuming the tokenizer is saved as 'dl_tokenizer.pkl'
import pickle
with open('dl_tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

def predict_sentiment_dl(text):
    # Text preprocessing
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
    text = text.lower().split()  # Convert to lowercase and split into words
    text = [stemmer.stem(word) for word in text if word not in stop_words]  # Stem and remove stopwords
    
    # Rejoin the list of words back into a string
    text = ' '.join(text)
    
    # Vectorize the text using the tokenizer (you should already have 'tokenizer' trained during model training)
    text_vectorized = tokenizer.texts_to_sequences([text])  # Convert text to sequence
    text_vectorized = pad_sequences(text_vectorized, maxlen=100)  # Pad to match the input shape of the model
    
    # Predict sentiment using the DL model
    prediction = dl_model.predict(text_vectorized)
    
    # If prediction is greater than or equal to 0.5, classify as Positive
    return "Positive" if prediction >= 0.5 else "Negative"

# Test the function
print(predict_sentiment_dl("I love this project"))
print(predict_sentiment_dl("I hate waiting in lines"))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Negative


In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Example model (you'll already have yours)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(100,)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

# Save the model
model.save("dl_model.h5")




In [None]:
import pennylane as qml
import numpy as np
import pickle

# Define the quantum device
n_qubits = 4  # Update this if you are using a different number
dev = qml.device("default.qubit", wires=n_qubits)

# Define a deeper quantum circuit
@qml.qnode(dev)
def quantum_circuit(params, x):
    # Encode classical data into quantum states
    for i in range(n_qubits):
        qml.RY(x[i], wires=i)
    
    # Apply multiple layers of parameterized gates and entanglement
    for layer in range(len(params) // (2 * n_qubits)):
        for i in range(n_qubits):
            qml.RY(params[layer * 2 * n_qubits + i], wires=i)
            qml.RZ(params[layer * 2 * n_qubits + i + n_qubits], wires=i)
        for i in range(n_qubits - 1):
            qml.CNOT(wires=[i, i + 1])
    
    # Measure expectation value
    return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]

# Update the cost function
def cost(params, X, Y):
    loss = 0
    for x, y in zip(X, Y):
        predictions = quantum_circuit(params, x)
        loss += np.sum((predictions - y) ** 2)
    return loss / len(X)

# Increase the number of layers
num_layers = 3
params = qml.numpy.random.rand(num_layers * 2 * n_qubits, requires_grad=True)

# Optimize the parameters
opt = qml.AdamOptimizer(stepsize=0.01)
for i in range(200):  # Increase the number of iterations
    params = opt.step(lambda p: cost(p, x_train, y_train), params)
    if i % 10 == 0:
        print(f"Step {i}: Cost = {cost(params, x_train, y_train)}")

print("Optimized parameters:", params)

# Save the optimized model
with open("optimized_quantum_model.pkl", "wb") as f:
    pickle.dump(params, f)

print("Optimized model saved successfully!")


Step 0: Cost = 0.8591621665974379
Step 10: Cost = 0.7383211002771717
Step 20: Cost = 0.702513372145349
Step 30: Cost = 0.6902951135752973
Step 40: Cost = 0.6815711635436168
Step 50: Cost = 0.6776818342428622
Step 60: Cost = 0.6738998258208582
Step 70: Cost = 0.6711289740704949
Step 80: Cost = 0.6684963868437728
Step 90: Cost = 0.6662713125121701
Step 100: Cost = 0.6643740567189699
Step 110: Cost = 0.6627639826622537
Step 120: Cost = 0.6614455270022898
Step 130: Cost = 0.6604164104234838
Step 140: Cost = 0.6596622998500052
Step 150: Cost = 0.659143986903786
Step 160: Cost = 0.65880224839487
Step 170: Cost = 0.6585759367719106
Step 180: Cost = 0.6584163522729028
Step 190: Cost = 0.6582911563268883
Optimized parameters: [ 0.27097343  0.44500062  0.76051767 -0.2559001   0.93190422  0.51005678
  1.03540644 -0.02567562  0.25721029  0.18952773  0.46432666  0.29428221
  0.16507352  1.08810032  0.51587984  0.09591538  1.38078517 -0.06961413
  0.32231442  0.13947141  0.54933753  0.63848113  0.96

In [None]:
# QDL Hybrid Model: Quantum-Deep Learning Model Integration

from tensorflow.keras import layers
from tensorflow.keras.models import Model
import pennylane as qml
from pennylane import numpy as pnp
import tensorflow as tf

# Quantum layer
def quantum_layer(inputs):
    dev = qml.device("default.qubit", wires=2)

    @qml.qnode(dev)
    def qnode(inputs, weights):
        qml.templates.AngleEmbedding(inputs, wires=[0, 1])
        qml.templates.BasicEntanglerLayers(weights, wires=[0, 1])
        return qml.expval(qml.PauliZ(0))  # Output as expectation value of PauliZ
    
    weights = pnp.array([[0.1, 0.2], [0.3, 0.4]], requires_grad=True)
    results = []
    for sample in inputs:
        result = qnode(sample, weights)
        results.append(result)
    return pnp.array(results)

# QDL Model (Hybrid Model)
class QDLModel(tf.keras.Model):
    def __init__(self, num_features):
        super(QDLModel, self).__init__()
        self.embedding = layers.Embedding(input_dim=10000, output_dim=128, input_length=100)
        self.lstm = layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)
        self.quantum_layer = layers.Lambda(lambda x: quantum_layer(x))  # Adding quantum layer
        self.dense = layers.Dense(1, activation='sigmoid')
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.lstm(x)
        q_output = self.quantum_layer(x)  # Quantum processing step
        x = tf.concat([x, q_output], axis=-1)  # Combining classical and quantum outputs
        x = self.dense(x)
        return x

# Preprocess Data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Build and compile the QDL model
qdl_model = QDLModel(num_features=100)
qdl_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the QDL model
qdl_model.fit(X_train, y_train, epochs=2, batch_size=128, validation_split=0.1)

# Evaluate the QDL model
loss, qdl_acc = qdl_model.evaluate(X_test, y_test, verbose=0)
print(f"\nQDL (Quantum-Deep Learning) Accuracy: {qdl_acc:.4f}")

# Save the QDL model
qdl_model.save("best_qdl_model.h5")


NameError: name 'X_pad' is not defined