# In [5]:  (Jupyter notebook cell prompt)
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud


# Load the data
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# literal text instead of being interpreted as field delimiters.
data = pd.read_csv(r"amazon_alexa.tsv", delimiter = '\t', quoting = 3)

print(f"Dataset shape : {data.shape}")

# Preprocess data: lower-case the reviews and strip punctuation.
# - fillna/astype guard against missing reviews, which would otherwise reach
#   the vectorizer and spaCy as float NaN.
# - regex=True is required: pandas >= 2.0 treats the pattern as a literal
#   string by default, which would leave all punctuation in place.
# - the raw string avoids invalid-escape warnings for \w and \s.
data['verified_reviews'] = (
    data['verified_reviews']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Define target variable and features
X = data['verified_reviews']
y = data['feedback']  # Assuming feedback column is used for sentiment classification (0 = negative, 1 = positive)

# Split the dataset (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the reviews.
# The vocabulary is fitted on the training split only and then reused for the
# test split, so no test-set information leaks into the features.
cv = CountVectorizer(stop_words='english', max_features=1000)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# Train a Logistic Regression model
# max_iter is raised above the default so the solver has room to converge
# instead of stopping at the iteration cap.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_cv, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test_cv)
print("Logistic Regression")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Train a Support Vector Machine model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_cv, y_train)

# Evaluate the SVM model (same report layout as above so the two models can
# be compared side by side)
y_pred_svm = svm_model.predict(X_test_cv)
print("Support Vector Machine (linear kernel)")
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))

# Aspect Extraction using SpaCy
# Loaded once at module level; extract_aspects() reuses this pipeline.
nlp = spacy.load('en_core_web_sm')

def extract_aspects(text):
    """Return the noun chunks found in ``text`` as a list of strings.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    surface text of every noun chunk is collected in document order.

    Non-string values (e.g. ``None`` or a float NaN from a missing review)
    and blank strings yield an empty list instead of being passed to spaCy,
    which only accepts text input.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)
    return [chunk.text for chunk in doc.noun_chunks]

# Extract the noun-chunk aspects of every review (one spaCy parse per row).
data['aspects'] = data['verified_reviews'].apply(extract_aspects)


def _show_wordcloud(text, title):
    """Draw a word cloud of ``text`` under ``title`` and return the cloud.

    Returns ``None`` without plotting when ``text`` is blank, because
    ``WordCloud.generate`` raises ``ValueError`` when it has no words to draw.
    """
    if not text.strip():
        print(f"Skipping '{title}': no review text available")
        return None
    cloud = WordCloud(background_color='white', max_words=50).generate(text)
    plt.figure(figsize=(10,10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, fontsize=10)
    plt.axis('off')
    plt.show()
    return cloud


# Word Cloud for Positive Reviews
# dropna/astype keep str.join from failing on missing (NaN) reviews.
pos_reviews = " ".join(
    data.loc[data['feedback'] == 1, 'verified_reviews'].dropna().astype(str)
)
wc = _show_wordcloud(pos_reviews, 'Wordcloud for Positive Reviews')

# Word Cloud for Negative Reviews
neg_reviews = " ".join(
    data.loc[data['feedback'] == 0, 'verified_reviews'].dropna().astype(str)
)
wc = _show_wordcloud(neg_reviews, 'Wordcloud for Negative Reviews')

# Aspect frequency visualization
# explode() gives one row per extracted aspect; reviews with no aspects become
# NaN, which value_counts() ignores.
aspect_counts = data['aspects'].explode().value_counts()
if aspect_counts.empty:
    print("No aspects were extracted; skipping the aspect bar chart")
else:
    aspect_counts.head(10).plot(kind='bar', figsize=(10,6), title="Top 10 Aspects Mentioned in Reviews")
    plt.show()


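# Output produced when this cell was run:
# ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
# NOTE: this error is raised while importing a compiled package (e.g. pandas,
# scikit-learn or spaCy) that was built against a different NumPy version than
# the one installed. Reinstalling or upgrading the affected packages so they
# match the installed NumPy typically resolves it.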