In [None]:
!pip install -q streamlit scikit-learn pandas numpy sentence-transformers nltk
!npm install -g localtunnel

In [None]:
# Source of the reusable text-cleaning module. It is kept as a string and
# written to preprocessing.py so that other cells can import it.
# NOTE: this is a regular (non-raw) string, so every regex backslash is doubled.
preprocess_code = '''
"""Text normalisation helpers for review sentiment analysis."""
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# quiet=True keeps the download chatter out of whatever imports this module.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

# Negation words are kept: CONTRACTIONS expands "don't" to "do not", and
# dropping "not" afterwards would erase the polarity of the review.
NEGATIONS = {"no", "nor", "not"}
stop_words = set(stopwords.words("english")) - NEGATIONS
lemmatizer = WordNetLemmatizer()

CONTRACTIONS = {
    "don't": "do not", "can't": "cannot", "won't": "will not",
    "i'm": "i am", "it's": "it is", "you're": "you are",
    "they're": "they are", "we're": "we are", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "doesn't": "does not", "didn't": "did not", "hasn't": "has not",
    "haven't": "have not", "hadn't": "had not", "couldn't": "could not",
    "wouldn't": "would not", "shouldn't": "should not", "mightn't": "might not",
    "mustn't": "must not"
}

def clean_text(text: str) -> str:
    """Normalise one review into space-separated, lemmatised tokens.

    Lower-cases the text, removes URLs and e-mail addresses, expands the
    contractions listed in CONTRACTIONS, replaces every non-letter with a
    space, collapses whitespace, drops stop words (negations are kept) and
    lemmatises the remaining words.

    Args:
        text: The raw review. Empty or non-string values (None, NaN, ...)
            yield an empty string.

    Returns:
        The cleaned review, possibly empty.
    """
    # NaN is a truthy float, so "not text" alone would let it reach .lower().
    if not isinstance(text, str) or not text:
        return ""
    text = text.lower()
    # Typographic apostrophes would otherwise bypass the CONTRACTIONS lookup.
    text = text.replace("\\u2019", "'")
    text = re.sub(r"http\\S+|www\\S+|https\\S+", "", text)
    text = re.sub(r"\\S+@\\S+", "", text)
    for c, f in CONTRACTIONS.items():
        text = text.replace(c, f)
    # Replace with a space (not nothing) so "great.love" stays two words.
    text = re.sub(r"[^a-z\\s]", " ", text)
    text = re.sub(r"\\s+", " ", text).strip()
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)
'''
# Explicit encoding so the generated module does not depend on the platform default.
with open("preprocessing.py", "w", encoding="utf-8") as f:
    f.write(preprocess_code)

In [None]:
import pickle
import re

import numpy as np
import pandas as pd
from google.colab import files
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from preprocessing import clean_text
from preprocessing import lemmatizer, stop_words

In [None]:
import io

# Opens Colab's upload widget; the result maps each uploaded file name to its bytes.
uploaded = files.upload()

# A cancelled upload leaves the mapping empty; fail with a clear message instead
# of the bare StopIteration that next() would raise.
if not uploaded:
    raise ValueError("No file was uploaded; upload the reviews TSV and re-run this cell.")

# Parse the first uploaded file as tab-separated values.
df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))), sep="\t")

In [None]:
# Contraction expansions, built once here instead of on every clean_text call.
_CONTRACTIONS = {
    "don't": "do not", "can't": "cannot", "won't": "will not",
    "i'm": "i am", "it's": "it is", "you're": "you are",
    "they're": "they are", "we're": "we are", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "doesn't": "does not", "didn't": "did not", "hasn't": "has not",
    "haven't": "have not", "hadn't": "had not", "couldn't": "could not",
    "wouldn't": "would not", "shouldn't": "should not", "mightn't": "might not",
    "mustn't": "must not"
}

# Kept even when they appear in stop_words: the expansion above produces "not",
# and dropping it would turn "do not like" into "like".
_NEGATIONS = frozenset({"no", "nor", "not"})


def clean_text(text):
    """Normalise one review into space-separated, lemmatised tokens.

    Steps: lower-case, strip URLs and e-mail addresses, expand common
    contractions, replace every non-letter with a space, collapse whitespace,
    drop stop words (negations are kept) and lemmatise what remains.

    Args:
        text: The raw review. Null values (None/NaN) give ""; any other
            non-string value is converted with str() first.

    Returns:
        The cleaned review as a single string (possibly empty).

    Note:
        ``re``, ``lemmatizer`` and ``stop_words`` must be available in the
        notebook namespace when this function is called.
    """
    if pd.isnull(text):
        return ""

    # str() so that a stray numeric cell does not fail on .lower().
    text = str(text).lower()

    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    text = re.sub(r"\S+@\S+", "", text)

    for short_form, expanded in _CONTRACTIONS.items():
        text = text.replace(short_form, expanded)

    text = re.sub(r"[^a-z\s]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word in _NEGATIONS or word not in stop_words
    ]

    return " ".join(tokens)

df["verified_reviews"] = df["verified_reviews"].apply(clean_text)

# Drop reviews that became empty after cleaning.
df = df[df["verified_reviews"].str.strip() != ""]

# Accept both textual and numeric encodings of the label. The training cell
# further down feeds df["feedback"] straight to the classifier and the app treats
# a prediction of 1 as "Positive", so the column may already hold 0/1; a filter
# that only knows "positive"/"negative" would then discard every row.
_FEEDBACK_CODES = {
    "positive": 1, "negative": 0,
    "1": 1, "0": 0,
    "1.0": 1, "0.0": 0,
}
_feedback_codes = (
    df["feedback"].astype(str).str.strip().str.lower().map(_FEEDBACK_CODES)
)

# assign/dropna/astype each return a new frame, so nothing is written into a
# slice of the previous one (avoids SettingWithCopyWarning).
df = (
    df.assign(feedback=_feedback_codes)
      .dropna(subset=["feedback"])
      .astype({"feedback": int})
)

# NOTE: this target is inverted relative to the feedback column
# (y == 1 where feedback == 0).
y = 1 - df["feedback"]

In [None]:
# Sentence-embedding model, referenced by its model name.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every cleaned review in one call, showing a progress bar while it runs.
X_embed = embedder.encode(
    df["verified_reviews"].to_list(),
    show_progress_bar=True,
)

In [None]:
print("Raw feedback unique values:", df["feedback"].unique())

df["feedback"] = df["feedback"].astype(str).str.lower().str.strip()

print("Normalized feedback values:", df["feedback"].unique())

positive_keywords = ["positive", "pos", "1", "yes", "true"]
negative_keywords = ["negative", "neg", "0", "no", "false"]


def _to_binary_label(value):
    """Map one normalised feedback string to 1, 0 or None (unrecognised).

    Numeric strings must equal 1 or 0 exactly. Anything else is split into
    alphanumeric tokens and compared token-by-token with the keyword lists,
    positive keywords first. Plain substring tests are avoided because they
    label "10" as positive (contains "1") and "unknown" as negative
    (contains "no").
    """
    try:
        number = float(value)
    except ValueError:
        tokens = set(re.findall(r"[a-z0-9]+", value))
        if tokens.intersection(positive_keywords):
            return 1
        if tokens.intersection(negative_keywords):
            return 0
        return None
    # float() also parses "nan" and "inf"; neither equals 1 or 0, so both are rejected.
    if number == 1:
        return 1
    if number == 0:
        return 0
    return None


label_map = {}
for val in df["feedback"].unique():
    label = _to_binary_label(val)
    if label is not None:
        label_map[val] = label

print("Generated label_map:", label_map)

# Values missing from label_map become NaN here and are filtered out next.
df["feedback"] = df["feedback"].map(label_map)

# .copy() because columns of the filtered frame are assigned to below.
df = df[df["feedback"].isin([0, 1])].copy()
# map() yields floats whenever some value was unmapped; restore integer labels.
df["feedback"] = df["feedback"].astype(int)

df["verified_reviews"] = df["verified_reviews"].fillna("").apply(clean_text)
df = df[df["verified_reviews"].str.strip() != ""]

# NOTE: this target is inverted relative to the feedback column
# (y == 1 where feedback == 0).
y = 1 - df["feedback"]

print(f"Samples remaining after cleaning: {len(df)}")
print("Unique feedback values after mapping:", df["feedback"].unique())

In [None]:
import os
import pickle
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

# Start from a clean slate: discard artefacts left behind by an earlier run.
for stale_artifact in ("model.pkl", "embedder.pkl"):
    if not os.path.exists(stale_artifact):
        continue
    os.remove(stale_artifact)

# Read the raw reviews from the tab-separated file in the working directory.
df = pd.read_csv("amazon_alexa.tsv", sep="\t")

def clean_text(text):
    """Return ``text`` lower-cased, restricted to a-z/0-9 and single-spaced.

    The value is converted with str() first, every character that is not a
    lower-case letter, digit or whitespace is removed, runs of whitespace are
    collapsed to one space and the ends are trimmed.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()

df["verified_reviews"] = df["verified_reviews"].fillna("").apply(clean_text)

# Reviews that are empty after cleaning carry no signal for the embedder.
df = df[df["verified_reviews"].str.strip() != ""]

# The feedback column is used as the target as-is; the app maps a prediction
# of 1 to "Positive".
y = df["feedback"]

model_embed = SentenceTransformer("all-MiniLM-L6-v2")
X_embed = model_embed.encode(df["verified_reviews"].tolist())

# Stratified split keeps the class ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_embed, y, test_size=0.2, stratify=y, random_state=42
)

model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# The held-out 20% was previously created but never used; report it so a
# regression in the pipeline is visible before the model is deployed.
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f"Held-out accuracy: {test_accuracy:.3f}")

# Persist both artefacts; the Streamlit app loads these exact file names.
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("embedder.pkl", "wb") as f:
    pickle.dump(model_embed, f)

# Round-trip check: make sure both files can be read back.
# NOTE(review): pickle executes code on load; only load files written by this notebook.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

with open("embedder.pkl", "rb") as f:
    model_embed = pickle.load(f)

print("✅ Model and embedder saved successfully!")

In [None]:
import os

# The Streamlit app written further down opens model.pkl and embedder.pkl, so
# deleting them unconditionally here breaks a top-to-bottom run of the notebook.
# Flip this flag to True only when a retrain from scratch is intended.
RESET_ARTIFACTS = False

if RESET_ARTIFACTS:
    for file in ["model.pkl", "embedder.pkl"]:
        if os.path.exists(file):
            os.remove(file)

In [None]:
import pandas as pd
from google.colab import files

uploaded = files.upload()

# Read the file that was just uploaded rather than assuming its name; fall back
# to the conventional name when nothing new was uploaded in this run.
# NOTE(review): relies on upload() keying its result by the saved file name — confirm.
tsv_path = next(iter(uploaded), "amazon_alexa.tsv")
df_raw = pd.read_csv(tsv_path, sep="\t")
print(df_raw.shape)
print(df_raw.head())

In [None]:
# Print the response of loca.lt/mytunnelpassword in the cell output.
# NOTE(review): presumably this is the password that the localtunnel landing
# page asks for before forwarding to the app — confirm.
!curl https://loca.lt/mytunnelpassword

In [None]:
# Source of the Streamlit front end, written to app.py below.
# NOTE: this is a regular (non-raw) string, so regex backslashes are doubled;
# a single "\s" here is an invalid escape sequence in the notebook cell itself.
app_code = '''
import streamlit as st
import pickle
from sentence_transformers import SentenceTransformer
import re

def clean_text(text):
    """Lower-case, keep only a-z/0-9/whitespace and collapse whitespace.

    Mirrors the cleaning applied to the training reviews so that the text
    embedded at prediction time is normalised the same way.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\\s]", "", text)
    text = re.sub(r"\\s+", " ", text)
    return text.strip()

@st.cache_resource
def load_model():
    """Load the pickled classifier and embedder once per Streamlit process."""
    # pickle executes code on load: only use files produced by the training notebook.
    with open("model.pkl", "rb") as f:
        model = pickle.load(f)
    with open("embedder.pkl", "rb") as f:
        embedder = pickle.load(f)
    return model, embedder

model, embedder = load_model()

st.title("Amazon Alexa Review Sentiment Analysis")
st.write("Predict whether a review is **Positive** or **Negative**.")

user_input = st.text_area("Enter your review:")

if st.button("Predict"):
    cleaned = clean_text(user_input)
    if cleaned == "":
        st.warning("Please enter valid text.")
    else:
        emb = embedder.encode([cleaned])
        pred = model.predict(emb)
        sentiment = "Positive" if pred[0] == 1 else "Negative"
        st.success(f"Prediction: **{sentiment}**")
'''

# Explicit encoding so the generated script does not depend on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

# Start the Streamlit app on port 8501 in the background: nohup plus the trailing
# "&" detach it from this cell, and its output is redirected to /tmp/logs.txt.
!nohup streamlit run app.py --server.port 8501 &>/tmp/logs.txt &

# Run localtunnel against the same port (8501).
# NOTE(review): presumably this prints the public URL that forwards to the app —
# confirm in the cell output.
!npx localtunnel --port 8501