In [5]:
!pip install gensim



In [4]:
!pip install keras tensorflow



In [3]:
!pip install keras tensorflow gensim scikit-learn pandas numpy



In [1]:
from google.colab import files
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split

# 1. Upload file from local machine to Colab
uploaded = files.upload()  # This will prompt you to upload

# 2. Get the filename (only the first uploaded file is used, e.g., Reviews.csv)
# If nothing was uploaded (for example the dialog was dismissed) there is no
# first key to take; stop here with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the reviews CSV.")
filename = next(iter(uploaded))

# 3. Download stopwords for cleaning
nltk.download('stopwords')
from nltk.corpus import stopwords
# A set gives constant-time membership checks inside clean_text
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one review into a space-separated string of kept tokens.

    The value is converted with str() and lower-cased, every character that is
    not a-z or whitespace is deleted (not replaced by a space), and the
    remaining whitespace-separated tokens that appear in the module-level
    ``stop_words`` collection are dropped.

    Returns the surviving tokens joined by single spaces ('' if none survive).
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    kept_tokens = filter(lambda token: token not in stop_words, letters_only.split())
    return ' '.join(kept_tokens)

# 4. Load and preprocess the CSV file
df = pd.read_csv(filename)
# Keep only the two needed columns and drop neutral (score 3) reviews.
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['Score'] != 3, ['Text', 'Score']].copy()
df['Sentiment'] = (df['Score'] > 3).astype(int)  # 1=Positive, 0=Negative
df['CleanText'] = df['Text'].apply(clean_text)

# 5. Train-test split and save for next steps
X = df['CleanText']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Persist plain NumPy arrays (values only, no pandas index); the later cells
# read this archive back with np.load(..., allow_pickle=True).
np.savez(
    'sentiment_data.npz',
    X_train=X_train.to_numpy(),
    X_test=X_test.to_numpy(),
    y_train=y_train.to_numpy(),
    y_test=y_test.to_numpy(),
)

print("Upload and preprocessing complete. Data saved as 'sentiment_data.npz'.")

Saving Reviews.csv.zip to Reviews.csv.zip


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Upload and preprocessing complete. Data saved as 'sentiment_data.npz'.


In [7]:
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load preprocessed data from the previous step.
# np.load returns an NpzFile that keeps the archive open; reading the arrays
# inside a with-block guarantees the handle is closed afterwards.
# allow_pickle=True lets NumPy unpickle the stored arrays, so only use it on
# a file this notebook wrote itself.
with np.load('sentiment_data.npz', allow_pickle=True) as data:
    X_train, X_test = data['X_train'], data['X_test']
    y_train, y_test = data['y_train'], data['y_test']

# TF-IDF Vectorization: fit on the training text only, then apply the fitted
# vectorizer unchanged to the test text
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression Classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Evaluation
y_pred = clf.predict(X_test_tfidf)
print("TF-IDF + Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the vectorizer and model for deployment/use in Streamlit app.
# Both files are needed at inference time: the classifier only accepts
# features produced by this fitted vectorizer.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(clf, 'tfidf_model.pkl')

TF-IDF + Logistic Regression Results
Accuracy: 0.9350151669313351
              precision    recall  f1-score   support

           0       0.86      0.70      0.77     16379
           1       0.95      0.98      0.96     88784

    accuracy                           0.94    105163
   macro avg       0.90      0.84      0.87    105163
weighted avg       0.93      0.94      0.93    105163



['tfidf_model.pkl']

In [8]:
import numpy as np
import gensim
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load preprocessed data.
# np.load returns an NpzFile that keeps the archive open; reading the arrays
# inside a with-block guarantees the handle is closed afterwards.
with np.load('sentiment_data.npz', allow_pickle=True) as data:
    X_train, X_test = data['X_train'], data['X_test']
    y_train, y_test = data['y_train'], data['y_test']

# Tokenize text: the saved reviews are the cleaned strings, so a plain
# whitespace split is used
tokenized_train = [str(x).split() for x in X_train]
tokenized_test = [str(x).split() for x in X_test]

# Train Word2Vec embeddings on the training reviews only.
# NOTE(review): no seed is passed and workers=4, so the learned vectors (and
# the downstream scores) may differ slightly between runs.
w2v = gensim.models.Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=2, workers=4)
w2v.save('word2vec.model')

def avg_word_vec(words, model, vocab, num_features):
    """Average the vectors of the in-vocabulary words of one document.

    Args:
        words: iterable of tokens.
        model: object indexable by token (``model[token]``) that yields the
            token's vector; the callers in this notebook pass ``w2v.wv``.
        vocab: container used for the membership test ``token in vocab``.
        num_features: length of the returned vector.

    Returns:
        The mean of ``model[token]`` over the tokens found in ``vocab``, or a
        float32 zero vector of length ``num_features`` when none are found.
    """
    known = [token for token in words if token in vocab]
    total = np.zeros((num_features,), dtype="float32")
    for token in known:
        total = np.add(total, model[token])
    if not known:
        return total
    return np.divide(total, len(known))

# A set makes the per-token membership test in avg_word_vec constant time
vocab = set(w2v.wv.index_to_key)
# Take the feature width from the trained embeddings instead of repeating the
# literal 100, so it cannot drift from the vector_size used for training
num_features = w2v.wv.vector_size
X_train_w2v = np.array([avg_word_vec(x, w2v.wv, vocab, num_features) for x in tokenized_train])
X_test_w2v = np.array([avg_word_vec(x, w2v.wv, vocab, num_features) for x in tokenized_test])

# Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_w2v, y_train)
y_pred = clf.predict(X_test_w2v)

# Evaluation
print("Word2Vec + Random Forest Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the model for later use (the embeddings themselves are saved separately
# as 'word2vec.model' and are needed alongside it at inference time)
joblib.dump(clf, 'word2vec_rf_model.pkl')

Word2Vec + Random Forest Results
Accuracy: 0.9347774407348592
              precision    recall  f1-score   support

           0       0.93      0.63      0.75     16379
           1       0.93      0.99      0.96     88784

    accuracy                           0.93    105163
   macro avg       0.93      0.81      0.86    105163
weighted avg       0.93      0.93      0.93    105163



['word2vec_rf_model.pkl']

In [10]:
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import accuracy_score, classification_report

# Load preprocessed data.
# np.load returns an NpzFile that keeps the archive open; reading the arrays
# inside a with-block guarantees the handle is closed afterwards.
with np.load('sentiment_data.npz', allow_pickle=True) as data:
    X_train, X_test = data['X_train'], data['X_test']
    y_train, y_test = data['y_train'], data['y_test']

# Tokenization
# One constant feeds both the tokenizer cap and the embedding table size so
# the two cannot drift apart.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)  # vocabulary is learned from training text only
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding: every sequence is padded/truncated to the same length
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Model Definition
# The deprecated `input_length` argument is omitted; the sequence length is
# taken from the padded input when the model is first called.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training
# NOTE(review): the test split doubles as validation data. With a fixed epoch
# count nothing is tuned on it, but carve out a separate validation split
# before adding early stopping or checkpoint selection.
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=2, batch_size=128)

# Evaluation: threshold the sigmoid output at 0.5 to get 0/1 labels
y_pred = (model.predict(X_test_pad) > 0.5).astype('int32').flatten()
print("RNN/LSTM Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model and tokenizer (both are required to score new text)
model.save('rnn_model.h5')
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)



Epoch 1/2
[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 10ms/step - accuracy: 0.9146 - loss: 0.2201 - val_accuracy: 0.9442 - val_loss: 0.1478
Epoch 2/2
[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.9517 - loss: 0.1278 - val_accuracy: 0.9507 - val_loss: 0.1356
[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step




RNN/LSTM Results
Accuracy: 0.9507241139944658
              precision    recall  f1-score   support

           0       0.89      0.78      0.83     16379
           1       0.96      0.98      0.97     88784

    accuracy                           0.95    105163
   macro avg       0.92      0.88      0.90    105163
weighted avg       0.95      0.95      0.95    105163



In [11]:
import numpy as np
import joblib
import gensim
import pickle
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Load test data.
# np.load returns an NpzFile that keeps the archive open; reading the arrays
# inside a with-block guarantees the handle is closed afterwards.
with np.load('sentiment_data.npz', allow_pickle=True) as data:
    X_test = data['X_test']
    y_test = data['y_test']

# Maps a display name to {"accuracy", "f1", "report"} for each model
results = {}

# --- TF-IDF + Logistic Regression ---
vectorizer = joblib.load('tfidf_vectorizer.pkl')
tfidf_model = joblib.load('tfidf_model.pkl')
X_test_tfidf = vectorizer.transform(X_test)
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)
results["TF-IDF + LogisticRegression"] = {
    "accuracy": accuracy_score(y_test, y_pred_tfidf),
    "f1": f1_score(y_test, y_pred_tfidf),
    "report": classification_report(y_test, y_pred_tfidf)
}

# --- Word2Vec + RandomForest ---
w2v = gensim.models.Word2Vec.load('word2vec.model')
rf_model = joblib.load('word2vec_rf_model.pkl')
# A set makes the per-token membership test in avg_word_vec constant time
vocab = set(w2v.wv.index_to_key)
def avg_word_vec(words, model, vocab, num_features):
    """Return the mean vector of the tokens in ``words`` that are in ``vocab``.

    ``model`` is indexed by token (``model[token]``) to obtain each vector; the
    caller in this cell passes ``w2v.wv``. Tokens missing from ``vocab`` are
    skipped. When no token matches, a float32 zero vector of length
    ``num_features`` is returned.
    """
    acc = np.zeros((num_features,), dtype="float32")
    hits = 0
    for token in words:
        if token not in vocab:
            continue
        acc = np.add(acc, model[token])
        hits += 1
    return np.divide(acc, hits) if hits > 0 else acc
X_test_tokenized = [str(x).split() for x in X_test]
# Read the feature width from the loaded embeddings instead of repeating the
# literal 100, so it always matches the vectors being averaged
X_test_w2v = np.array([avg_word_vec(x, w2v.wv, vocab, w2v.wv.vector_size) for x in X_test_tokenized])
y_pred_rf = rf_model.predict(X_test_w2v)
results["Word2Vec + RandomForest"] = {
    "accuracy": accuracy_score(y_test, y_pred_rf),
    "f1": f1_score(y_test, y_pred_rf),
    "report": classification_report(y_test, y_pred_rf)
}

# --- RNN/LSTM ---
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pkl
# that this notebook produced.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
maxlen = 100  # same padding length the training cell used
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)
rnn_model = load_model('rnn_model.h5')
# Threshold the sigmoid output at 0.5 to get 0/1 labels
y_pred_rnn = (rnn_model.predict(X_test_pad) > 0.5).astype('int32').flatten()
results["RNN/LSTM"] = {
    "accuracy": accuracy_score(y_test, y_pred_rnn),
    "f1": f1_score(y_test, y_pred_rnn),
    "report": classification_report(y_test, y_pred_rnn)
}

# --- Display Results and Recommendation ---
print("="*40)
print("Comparative Analysis of Sentiment Models\n")
for model_name, res in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {res['accuracy']:.4f}")
    print(f"F1-Score: {res['f1']:.4f}")
    print(res["report"])
    print("="*40)

# Pick the entry with the highest F1. max() returns the first maximum, so ties
# still go to the earlier model, and a model is named even if every F1 is 0.
# NOTE(review): f1_score is called with its defaults here, i.e. it scores the
# positive class (label 1) only; with the class imbalance visible in the
# reports a macro-averaged F1 may be a fairer selection criterion.
best_model, best_entry = max(results.items(), key=lambda item: item[1]["f1"])
best_f1 = best_entry["f1"]

print(f"Recommended model for deployment: ***{best_model}*** (highest F1-score: {best_f1:.4f})")



[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step
Comparative Analysis of Sentiment Models

Model: TF-IDF + LogisticRegression
Accuracy: 0.9350
F1-Score: 0.9621
              precision    recall  f1-score   support

           0       0.86      0.70      0.77     16379
           1       0.95      0.98      0.96     88784

    accuracy                           0.94    105163
   macro avg       0.90      0.84      0.87    105163
weighted avg       0.93      0.94      0.93    105163

Model: Word2Vec + RandomForest
Accuracy: 0.9348
F1-Score: 0.9625
              precision    recall  f1-score   support

           0       0.93      0.63      0.75     16379
           1       0.93      0.99      0.96     88784

    accuracy                           0.93    105163
   macro avg       0.93      0.81      0.86    105163
weighted avg       0.93      0.93      0.93    105163

Model: RNN/LSTM
Accuracy: 0.9507
F1-Score: 0.9711
              precision    recall  f1-scor

In [13]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m112.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m133.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [14]:
import re

import nltk
import numpy as np
import streamlit as st
from nltk.corpus import stopwords

st.title("Amazon Review Sentiment Analysis")

# Set this according to your best model: 'tfidf', 'word2vec', or 'rnn'
MODEL_TYPE = 'tfidf'  # <-- change as needed

# Every model was trained on text that had been lower-cased, stripped of
# non-letters and filtered of English stop words. Raw user input has to go
# through the same steps, otherwise its tokens (e.g. "don't" vs. the trained
# "dont") do not line up with the fitted vocabularies.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Corpus not installed yet: fetch it once
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Apply the training-time normalisation to one review string."""
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join([word for word in text.split() if word not in stop_words])


if MODEL_TYPE == 'tfidf':
    import joblib
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
    clf = joblib.load('tfidf_model.pkl')

    def predict_sentiment(text):
        """Classify one review with the TF-IDF + logistic regression pipeline."""
        x = vectorizer.transform([clean_text(text)])
        pred = clf.predict(x)[0]
        return 'Positive' if pred == 1 else 'Negative'

elif MODEL_TYPE == 'word2vec':
    import joblib
    import gensim
    w2v = gensim.models.Word2Vec.load('word2vec.model')
    clf = joblib.load('word2vec_rf_model.pkl')
    vocab = set(w2v.wv.index_to_key)

    def avg_word_vec(words, model, vocab, num_features):
        """Average model[word] over the words found in vocab (zeros if none)."""
        feature_vec = np.zeros((num_features,), dtype="float32")
        nwords = 0
        for word in words:
            if word in vocab:
                nwords += 1
                feature_vec = np.add(feature_vec, model[word])
        if nwords > 0:
            feature_vec = np.divide(feature_vec, nwords)
        return feature_vec

    def predict_sentiment(text):
        """Classify one review with averaged Word2Vec features + random forest."""
        words = clean_text(text).split()
        # Feature width comes from the loaded embeddings rather than a literal
        vec = avg_word_vec(words, w2v.wv, vocab, w2v.wv.vector_size).reshape(1, -1)
        pred = clf.predict(vec)[0]
        return 'Positive' if pred == 1 else 'Negative'

elif MODEL_TYPE == 'rnn':
    import pickle
    from keras.models import load_model
    from keras.preprocessing.sequence import pad_sequences
    # NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pkl
    # produced by the training notebook.
    with open('tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)
    model = load_model('rnn_model.h5')
    maxlen = 100  # same padding length the training cell used

    def predict_sentiment(text):
        """Classify one review with the LSTM model (sigmoid output > 0.5 is positive)."""
        seq = tokenizer.texts_to_sequences([clean_text(text)])
        pad = pad_sequences(seq, maxlen=maxlen)
        pred = (model.predict(pad) > 0.5).astype('int32')[0][0]
        return 'Positive' if pred == 1 else 'Negative'

else:
    # Without this, a typo in MODEL_TYPE only surfaces later as a NameError on
    # predict_sentiment when the button is pressed.
    raise ValueError(f"Unknown MODEL_TYPE {MODEL_TYPE!r}; expected 'tfidf', 'word2vec' or 'rnn'.")

user_input = st.text_area("Enter a review for sentiment analysis:")
if st.button("Predict"):
    if user_input.strip():
        sentiment = predict_sentiment(user_input)
        st.write(f"Sentiment: **{sentiment}**")
    else:
        st.write("Please enter a review.")

2025-10-15 09:33:28.953 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-10-15 09:33:29.014 Session state does not function when running a script without `streamlit run`


In [18]:
# Source of the standalone Streamlit script that a later cell writes to app.py.
# It is a raw string so the backslash in the regex reaches the file unchanged.
# The script repeats the training-time text cleaning: the TF-IDF model was
# fitted on cleaned text, so raw input must be normalised the same way.
code = r'''
import re

import joblib
import nltk
import numpy as np
import streamlit as st
from nltk.corpus import stopwords

st.title("Amazon Review Sentiment Analysis")

# Only the TF-IDF pipeline is bundled in this script.
MODEL_TYPE = 'tfidf'

# The model was trained on lower-cased, letters-only text with English stop
# words removed; user input must be normalised the same way before scoring.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Apply the training-time normalisation to one review string."""
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join([word for word in text.split() if word not in stop_words])


if MODEL_TYPE == 'tfidf':
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
    clf = joblib.load('tfidf_model.pkl')

    def predict_sentiment(text):
        x = vectorizer.transform([clean_text(text)])
        pred = clf.predict(x)[0]
        return 'Positive' if pred == 1 else 'Negative'
else:
    raise ValueError(f"Unsupported MODEL_TYPE {MODEL_TYPE!r}; this script only bundles 'tfidf'.")

user_input = st.text_area("Enter a review for sentiment analysis:")
if st.button("Predict"):
    if user_input.strip():
        sentiment = predict_sentiment(user_input)
        st.write(f"Sentiment: **{sentiment}**")
    else:
        st.write("Please enter a review.")
'''

In [19]:
# Write the Streamlit script held in `code` to app.py in the working directory
with open('app.py', mode='w') as app_file:
    app_file.write(code)

In [22]:
from google.colab import files

# Hand the generated Streamlit script to the browser as a download
script_name = 'app.py'
files.download(script_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
from google.colab import files

# app.py loads both the fitted vectorizer and the classifier, so download the
# pair; the classifier alone cannot score text.
for artifact in ('tfidf_vectorizer.pkl', 'tfidf_model.pkl'):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>