In [27]:
import pandas as pd  # Make sure this import is at the top
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from textblob import TextBlob
import shutil  # For zipping files
import pickle  # For saving models

In [28]:
# --- LOAD DATASET ---
# Location of the reviews CSV under Kaggle's input mount. Change this to match
# the folder name of the dataset attached to your own notebook.
csv_file_path = "/kaggle/input/amazon-datset/245_1.csv"


In [29]:
import pandas as pd  # Make sure this import is at the top
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from textblob import TextBlob
import shutil  # For zipping files
import pickle  # For saving models
# Read the CSV at `csv_file_path` into `df`.
#
# On failure the error message is printed and the exception is re-raised.
# Calling exit() inside a notebook asks the kernel to shut down, which throws
# away the session state and hides the traceback; re-raising stops execution
# at this cell and shows exactly what went wrong.
try:
    df = pd.read_csv(csv_file_path)  # Remove the compression if your file isn't gzipped
except FileNotFoundError:
    print(f"Error: '{csv_file_path}' not found. Make sure it's in the correct directory.")
    raise
except Exception as e:
    print(f"Error loading the dataset: {e}")
    raise
else:
    # Only reached when the read succeeded, so a display problem is never
    # misreported as a loading problem.
    print(f"Successfully loaded dataset from: {csv_file_path}")
    print(df.head())  # Check the first few rows

Successfully loaded dataset from: /kaggle/input/amazon-datset/245_1.csv
                     id            brand  \
0  AV13O1A8GV-KLJ3akUyj  Universal Music   
1  AV14LG0R-jtxr-f38QfS         Lundberg   
2  AV14LG0R-jtxr-f38QfS         Lundberg   
3  AV16khLE-jtxr-f38VFn              K-Y   
4  AV16khLE-jtxr-f38VFn              K-Y   

                                          categories             dateAdded  \
0  Movies, Music & Books,Music,R&b,Movies & TV,Mo...  2017-07-25T00:52:42Z   
1  Food,Packaged Foods,Snacks,Crackers,Snacks, Co...  2017-07-25T05:16:03Z   
2  Food,Packaged Foods,Snacks,Crackers,Snacks, Co...  2017-07-25T05:16:03Z   
3  Personal Care,Medicine Cabinet,Lubricant/Sperm...  2017-07-25T16:26:19Z   
4  Personal Care,Medicine Cabinet,Lubricant/Sperm...  2017-07-25T16:26:19Z   

            dateUpdated          ean  \
0  2018-02-05T08:36:58Z  6.02537E+11   
1  2018-02-05T11:27:45Z  73416000391   
2  2018-02-05T11:27:45Z  73416000391   
3  2018-02-05T11:25:51Z  679819344

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [30]:
# --- ASSUMPTIONS AND ADJUSTMENTS (CRITICAL!) ---
# Column names the rest of the notebook looks up in `df`; edit them to match
# your CSV header.
TEXT_COLUMN = 'reviews.text'          # column holding the review text
USERNAME_COLUMN = 'reviews.username'  # username column; set to None to skip the username model

In [31]:
# --- HANDLE MISSING VALUES ---
# Swap missing review texts for the empty string so no NaN reaches the
# sentiment labelling or the vectorizer.
df[TEXT_COLUMN] = df[TEXT_COLUMN].fillna(value='')

In [32]:
# --- PERFORM SENTIMENT ANALYSIS ---
def analyze_sentiment(text):
    """Label a piece of text as 'positive', 'negative' or 'neutral'.

    The label comes from the TextBlob polarity of ``str(text)``:
    above 0.1 is 'positive', below -0.1 is 'negative', anything else is
    'neutral'. If anything raises along the way, the error is printed and
    'neutral' is returned instead of propagating the exception.
    """
    try:
        polarity_score = TextBlob(str(text)).sentiment.polarity
        if polarity_score > 0.1:
            return 'positive'
        if polarity_score < -0.1:
            return 'negative'
        return 'neutral'
    except Exception as e:
        # Best-effort labelling: report the problem and fall back to neutral.
        print(f"Sentiment analysis error: {e}")
        return 'neutral'

# Derive one sentiment label per review; these labels become the training target.
df['sentiment'] = df[TEXT_COLUMN].map(analyze_sentiment)


In [33]:
# --- DEFINE FEATURES AND TARGETS ---
# X is the review text; y_sentiment is the label column produced above.
X, y_sentiment = df[TEXT_COLUMN], df['sentiment']

In [34]:
# --- CHECK AND PREPARE USERNAME COLUMN ---
# Default: no username target. It is only set when USERNAME_COLUMN is
# configured and actually present in the dataframe.
y_username = None
if USERNAME_COLUMN and (USERNAME_COLUMN in df.columns):
    # Missing usernames are grouped under the placeholder label 'Unknown'.
    df[USERNAME_COLUMN] = df[USERNAME_COLUMN].fillna(value='Unknown')
    y_username = df[USERNAME_COLUMN]

In [35]:
# --- SPLIT DATA ---
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_sentiment_train, y_sentiment_test = train_test_split(
    X,
    y_sentiment,
    test_size=0.2,
    random_state=42,
)


In [36]:
# --- VECTORIZE TEXT ---
# TF-IDF features, capped at 5000 via max_features. The vectorizer is fitted on
# the training split only; the test split is transformed with that same fitted
# vectorizer so it never influences the learned vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [37]:
# --- TRAIN SENTIMENT MODEL ---
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix. The
# targets are the labels produced by analyze_sentiment, so the model learns to
# reproduce those automatic labels rather than human annotations.
sentiment_model = MultinomialNB()
sentiment_model.fit(X_train_vec, y_sentiment_train)

In [38]:
# --- TRAIN USERNAME MODEL ---
# Both names stay None unless a username target was prepared; later cells
# check them before predicting or saving.
vectorizer_username = None
username_model = None

if y_username is not None:
    # Same test_size and random_state as the sentiment split.
    (
        X_train_username,
        X_test_username,
        y_username_train,
        y_username_test,
    ) = train_test_split(X, y_username, test_size=0.2, random_state=42)

    # Separate TF-IDF vectorizer, fitted on the username training split.
    vectorizer_username = TfidfVectorizer(max_features=5000)
    X_train_username_vec = vectorizer_username.fit_transform(X_train_username)
    X_test_username_vec = vectorizer_username.transform(X_test_username)

    username_model = MultinomialNB()
    username_model.fit(X_train_username_vec, y_username_train)

In [39]:
# --- PREDICTION FUNCTION ---
def predict_review(review_text):
    """Predict the sentiment label and, when available, the username for a review.

    Uses the notebook-level ``vectorizer`` / ``sentiment_model`` and, if both
    are set, ``vectorizer_username`` / ``username_model``.

    Args:
        review_text: A single review as a string.

    Returns:
        tuple: ``(sentiment, username)``. ``username`` is ``'N/A'`` when the
        username model or its vectorizer is not available.
    """
    review_vec = vectorizer.transform([review_text])
    sentiment = sentiment_model.predict(review_vec)[0]

    username = 'N/A'
    # Check against None explicitly: truth-testing an estimator depends on
    # whether its class defines __len__/__bool__, which says nothing about
    # whether a model was trained. The vectorizer is checked too, matching the
    # condition used when the username artifacts are saved.
    if username_model is not None and vectorizer_username is not None:
        review_vec_username = vectorizer_username.transform([review_text])
        username = username_model.predict(review_vec_username)[0]

    return sentiment, username

In [40]:
# --- EXAMPLE USAGE ---
# Smoke-test the prediction helper on one hand-written review.
# NOTE(review): the output recorded for this cell is "positive" even though the
# sample reads as clearly negative - worth checking the label distribution and
# scoring the model on the held-out split before relying on it.
new_review = "This product is absolutely terrible! It broke after one use."
predicted_sentiment, predicted_username = predict_review(new_review)
print(f"Predicted sentiment: {predicted_sentiment}")
# Falls back to 'N/A' when no username model is set.
print(f"Predicted username: {predicted_username if username_model else 'N/A'}")

Predicted sentiment: positive
Predicted username: An anonymous customer


In [42]:
# --- SAVE PICKLE FILES USING 'with' METHOD ---
def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path``; the file is closed on exit."""
    with open(path, "wb") as out_file:
        pickle.dump(obj, out_file)


try:
    # Sentiment artifacts are always written.
    _dump_pickle(vectorizer, "vectorizer.pkl")
    _dump_pickle(sentiment_model, "sentiment_model.pkl")

    # Username artifacts are written only when both were created.
    if username_model and vectorizer_username:
        _dump_pickle(vectorizer_username, "vectorizer_username.pkl")
        _dump_pickle(username_model, "username_model.pkl")

    print("Pickle files saved!")
except Exception as e:
    # Report the failure without interrupting the notebook.
    print(f"Error during pickling: {e}")


Pickle files saved!
