In [27]:
import pandas as pd

In [28]:
# Read the raw review export from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/mobile-reviews-sentiment-and-specification/Mobile Reviews Sentiment.csv'
)

In [29]:
# Show the column labels of the raw frame.
data.columns

Index(['review_id', 'customer_name', 'age', 'brand', 'model', 'price_usd',
       'price_local', 'currency', 'exchange_rate_to_usd', 'rating',
       'review_text', 'sentiment', 'country', 'language', 'review_date',
       'verified_purchase', 'battery_life_rating', 'camera_rating',
       'performance_rating', 'design_rating', 'display_rating',
       'review_length', 'word_count', 'helpful_votes', 'source'],
      dtype='object')

In [30]:
# Columns removed before modelling: identifiers, customer/product metadata,
# pricing, locale information and the numeric rating fields.
columns_to_drop = [
    'review_id',
    'customer_name',
    'age',
    'brand',
    'model',
    'price_usd',
    'price_local',
    'currency',
    'exchange_rate_to_usd',
    'rating',
    'country',
    'language',
    'review_date',
    'battery_life_rating',
    'camera_rating',
    'performance_rating',
    'design_rating',
    'display_rating',
    'source',
]

# `columns=` already targets the column axis, so no separate `axis` is needed.
# drop() returns a new frame; `data` itself is left untouched.
df = data.drop(columns=columns_to_drop)


In [31]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,review_text,sentiment,verified_purchase,review_length,word_count,helpful_votes
0,Not worth the money spent. Wouldn’t recommend.,Negative,True,46,7,1
1,Absolutely love this phone! The camera is next...,Positive,True,74,12,5
2,Loving the clean UI and fast updates. Loving i...,Positive,True,55,11,8
3,Build quality feels solid and durable. No regr...,Positive,False,66,11,3
4,Not bad for daily use but could be optimized. ...,Neutral,True,73,12,0


In [32]:
# Columns whose missing-value counts we want to inspect.
columns = [
    'review_text',
    'sentiment',
    'verified_purchase',
    'review_length',
    'word_count',
    'helpful_votes',
]

# Count the nulls for all listed columns in one pass, then print one
# "<column> <count>" line per column in the order given above.
null_counts = df[columns].isnull().sum()
for column_name, n_missing in null_counts.items():
    print(column_name, n_missing)

review_text 0
sentiment 0
verified_purchase 0
review_length 0
word_count 0
helpful_votes 0


In [33]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch every NLTK resource the cleaning step relies on so the notebook also
# runs on a machine where they are not pre-installed:
#   - 'stopwords' : the English stop list used below
#   - 'punkt'     : tokenizer models used by word_tokenize
#   - 'punkt_tab' : tokenizer tables that newer NLTK releases load instead of
#                   'punkt' (harmless to request on older releases)
#   - 'wordnet'   : corpus WordNetLemmatizer looks words up in; without it the
#                   first lemmatize() call raises LookupError
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# A set gives O(1) membership checks when filtering tokens.
stopword = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def Clean(text):
    """Normalize one raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, remove URLs and @mentions, delete every
    character that is not a letter or whitespace, collapse whitespace runs
    (including line breaks) to a single space, tokenize, drop English
    stopwords, lemmatize each remaining token.

    Args:
        text: Raw review text. A value that is not a ``str`` (for example a
            NaN coming from an empty cell) is treated as an empty review.

    Returns:
        str: The cleaned tokens joined by single spaces; may be ``''``.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # URLs and mentions have to be removed *before* punctuation is stripped:
    # once '@', ':' , '/' and '.' are gone these patterns have nothing left to
    # anchor on and the mention text would leak into the tokens.
    text = re.sub(r'http\S+|www\.\S+|@\S+', ' ', text)
    # The text is already lowercase, so a-z is sufficient. This also removes
    # '#', digits and apostrophes (e.g. "wouldn't" becomes "wouldnt").
    text = re.sub(r'[^a-z\s]', '', text)
    # Line breaks are whitespace too: turning them into a space (instead of
    # deleting them) keeps the words on either side from being glued together.
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    # NOTE(review): the stop list appears to drop negations such as "not"
    # (a review starting "Not worth ..." comes out as "worth ..."); consider
    # whether that is desirable for sentiment classification.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopword]
    return ' '.join(tokens)

# Store the normalized text next to the raw review in `df`.
df['Cleaned'] = df['review_text'].map(Clean)

# Keep only the model input (cleaned text) and the target label.
df_cleaned = df.loc[:, ['Cleaned', 'sentiment']]
df_cleaned.head()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Cleaned,sentiment
0,worth money spent wouldnt recommend,Negative
1,absolutely love phone camera next level absolu...,Positive
2,loving clean ui fast update loving far,Positive
3,build quality feel solid durable regret buying...,Positive
4,bad daily use could optimized average experien...,Neutral


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Features: TF-IDF weights over at most 10,000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before any
# train/test split, so held-out rows contribute to the vocabulary and IDF.
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df_cleaned['Cleaned'])

# Target: sentiment strings mapped to integer codes.
le = LabelEncoder()
y = le.fit_transform(df_cleaned['sentiment'])

In [35]:
# Show the labels learned by the encoder; a label's position in this array is
# the integer code it was assigned.
print(le.classes_)


['Negative' 'Neutral' 'Positive']


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Split the cleaned *text* rather than an already-vectorized matrix, so the
# TF-IDF vocabulary and IDF weights are learned from the training rows only
# and nothing about the held-out rows leaks into the features.
# `stratify=y` keeps the class proportions the same in both partitions.
text_train, text_test, y_train, y_test = train_test_split(
    df_cleaned['Cleaned'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Re-fit the shared vectorizer on the training text; the test text is only
# transformed. Later cells that call `vectorizer.transform(...)` therefore use
# exactly the feature space the models were trained on.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)


In [37]:
from sklearn.metrics import accuracy_score
# Score the random forest on the held-out split and report accuracy in percent.
# NOTE(review): a perfect score on held-out data is unusual; worth checking
# whether identical review texts appear in both the train and test partitions.
y_pred = rf.predict(X_test)
rf_accuracy_pct = 100 * accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", rf_accuracy_pct)

Random Forest Accuracy: 100.0


In [38]:
# Linear-kernel support vector classifier trained on the same TF-IDF features.
# `probability=True` is requested so the fitted model can also produce class
# probability estimates.
svm = SVC(
    kernel='linear',
    probability=True,
    random_state=42,
)
svm.fit(X_train, y_train)

In [39]:
# Score the SVM on the held-out split and report accuracy in percent.
y_pred = svm.predict(X_test)
svm_accuracy_pct = 100 * accuracy_score(y_test, y_pred)
print("SVM Accuracy:", svm_accuracy_pct)

SVM Accuracy: 100.0


In [43]:
sentence = ['very bad product ']
# Apply the same Clean() preprocessing that the training text went through
# (stopword removal, lemmatization) so the sentence is vectorized consistently
# with what the model saw during fitting.
sentence_vec = vectorizer.transform([Clean(s) for s in sentence])

# Predict the encoded class, then map it back to its sentiment label.
pred = rf.predict(sentence_vec)
le.inverse_transform(pred)

array(['Negative'], dtype=object)

In [41]:
sentence = ['love this product ']
# Apply the same Clean() preprocessing that the training text went through
# (stopword removal, lemmatization) so the sentence is vectorized consistently
# with what the model saw during fitting.
sentence_vec = vectorizer.transform([Clean(s) for s in sentence])

# Predict the encoded class, then map it back to its sentiment label.
pred = svm.predict(sentence_vec)
le.inverse_transform(pred)

array(['Positive'], dtype=object)