In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('/content/amazon_alexa.tsv',delimiter = '\t',quoting = 3)

In [6]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"""Sometimes while playing a game, you can answe...",1
3,5,31-Jul-18,Charcoal Fabric,"""I have had a lot of fun with this thing. My 4...",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


##Preprocessing

In [7]:
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
all_stopword = stopwords.words('english')
all_stopword.remove('not')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
corpus = []
for i in range(0,3150):
  review = re.sub('[^a-zA-Z]'," ",str(df["verified_reviews"][i])) #Removing Special character
  review = review.lower() #for uniformity
  review = review.split() #splits into list of words
  review = [ps.stem(x) for x in review if x not in set(all_stopword)]
  review = " ".join(review)
  corpus.append(review)


In [9]:
len(corpus)

3150

##Data Transformation

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

In [11]:
X = vectorizer.fit_transform(corpus)
y = df['feedback']

In [12]:

df['feedback'].value_counts()  # Assuming 'feedback' is the column containing labels (0 for negative, 1 for positive)

Unnamed: 0_level_0,count
feedback,Unnamed: 1_level_1
1,2893
0,257


##SMOTE for balancing dataset

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Split the balanced dataset into training (80%) and testing (20%)
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

# Check class distribution after SMOTE and splitting
print("Training class distribution:", Counter(y_train_bal))
print("Testing class distribution:", Counter(y_test_bal))


Training class distribution: Counter({1: 2314, 0: 2314})
Testing class distribution: Counter({1: 579, 0: 579})


In [14]:
from collections import Counter

# Count the number of 0s and 1s after SMOTE
feedback_counts = Counter(y_balanced)

print("Feedback count after SMOTE:", feedback_counts)


Feedback count after SMOTE: Counter({1: 2893, 0: 2893})


##Random Forest

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [16]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_bal, y_train_bal)
y_pred_rf = rf_model.predict(X_test_bal)

##Model Performance

In [17]:
print("Random Forest Accuracy:", accuracy_score(y_test_bal, y_pred_rf))
print(classification_report(y_test_bal, y_pred_rf))

Random Forest Accuracy: 0.9740932642487047
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       579
           1       1.00      0.95      0.97       579

    accuracy                           0.97      1158
   macro avg       0.98      0.97      0.97      1158
weighted avg       0.98      0.97      0.97      1158



In [18]:
import re

# Function to preprocess user input
def preprocess_text(text):
    review = re.sub('[^a-zA-Z]', " ", text)  # Remove special characters
    review = review.lower()  # Convert to lowercase
    review = review.split()  # Tokenization
    review = [ps.stem(word) for word in review if word not in set(all_stopword)]
    return " ".join(review)

# Take user input
user_review = input("Enter a review: ")

# Preprocess the input text
processed_review = preprocess_text(user_review)

# Convert input text into TF-IDF features
review_tfidf = vectorizer.transform([processed_review])

# Predict using the trained Random Forest model
prediction = rf_model.predict(review_tfidf)[0]

# Interpret the result
if prediction == 1:
    print("Sentiment: Happy 😊")
else:
    print("Sentiment: Sad 😔")


Enter a review: its not good
Sentiment: Sad 😔
