In [1]:
import pandas as pd

In [2]:
data_path = "/content/train_data.csv"
data = pd.read_csv(data_path)

In [3]:
print("Training Data Snapshot:")
display(data.head())

Training Data Snapshot:


Unnamed: 0,Name,Brand,Categories,PrimaryCategories,Reviews.Date,Reviews.Text,Reviews.Title,Sentiment,Full_Review,Sentiment_Score,Sentiment_Category
0,Gaming HP,HP,Computers,Gaming,2024-01-08,Mediocre experience.,Average,Neutral,Mediocre experience. Average,0.42,Neutral
1,Laptops Sony,Sony,Computers,Laptops,2024-05-26,Worth every penny.,Fantastic Buy,Positive,Worth every penny. Fantastic Buy,0.94,Positive
2,Microphones LG,LG,Audio,Microphones,2024-01-18,Exceeded expectations.,Highly Recommend,Positive,Exceeded expectations. Highly Recommend,0.91,Positive
3,Tables Dell,Dell,Furniture,Tables,2024-11-01,Amazing quality!,Great Product,Positive,Amazing quality! Great Product,0.92,Positive
4,Gaming HP,HP,Computers,Gaming,2024-11-16,It's fine.,It's okay,Neutral,It's fine. It's okay,0.4,Neutral


In [4]:
print("\nDataset Information:")
data.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                2000 non-null   object 
 1   Brand               2000 non-null   object 
 2   Categories          2000 non-null   object 
 3   PrimaryCategories   2000 non-null   object 
 4   Reviews.Date        2000 non-null   object 
 5   Reviews.Text        2000 non-null   object 
 6   Reviews.Title       2000 non-null   object 
 7   Sentiment           2000 non-null   object 
 8   Full_Review         2000 non-null   object 
 9   Sentiment_Score     2000 non-null   float64
 10  Sentiment_Category  2000 non-null   object 
dtypes: float64(1), object(10)
memory usage: 172.0+ KB


In [5]:
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
Name                  0
Brand                 0
Categories            0
PrimaryCategories     0
Reviews.Date          0
Reviews.Text          0
Reviews.Title         0
Sentiment             0
Full_Review           0
Sentiment_Score       0
Sentiment_Category    0
dtype: int64


In [7]:
print("\nDescriptive Statistics:")
print(data.describe())

print("\nCorrelation Matrix:")
print(data.corr())


Descriptive Statistics:
       Sentiment_Score
count      2000.000000
mean          0.538260
std           0.284799
min           0.000000
25%           0.290000
50%           0.560000
75%           0.782500
max           1.000000

Correlation Matrix:


ValueError: could not convert string to float: 'Gaming HP'

In [8]:
for col in data.select_dtypes(include=['number']):
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mean())


In [9]:
for col in data.select_dtypes(include=['object']):
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mode()[0])

print("\nMissing Values after imputation:")
print(data.isnull().sum())


Missing Values after imputation:
Name                  0
Brand                 0
Categories            0
PrimaryCategories     0
Reviews.Date          0
Reviews.Text          0
Reviews.Title         0
Sentiment             0
Full_Review           0
Sentiment_Score       0
Sentiment_Category    0
dtype: int64


In [10]:
print("\nValue counts for a categorical column (replace 'column_name' with your column):")
if 'column_name' in data.columns:
    print(data['column_name'].value_counts())


Value counts for a categorical column (replace 'column_name' with your column):


In [13]:
try:
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        first_categorical_col = categorical_cols[0]
        print(f"\nValue counts for '{first_categorical_col}':")
        print(data[first_categorical_col].value_counts())
    else:
        print("\nNo categorical columns found in the dataset.")
except FileNotFoundError:
    print(f"Error: File not found at {data_path}. Please check the file path.")
except pd.errors.ParserError:
    print(f"Error: Could not parse the CSV file at {data_path}. Please check the file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



Value counts for 'Name':
Name
Kitchen Samsung    32
Gaming Apple       24
Cameras Samsung    24
Cleaning ASUS      22
Beds Samsung       21
                   ..
Speakers Apple      8
Cleaning Sony       7
Tables Sony         7
Chairs Apple        7
Laptops LG          6
Name: count, Length: 135, dtype: int64


In [18]:
from collections import Counter

if 'Reviews.Text' in data.columns:
    data['word_counts'] = data['Reviews.Text'].apply(lambda x: len(str(x).split()))
    word_counts_distribution = Counter(data['word_counts'])
    print("\nWord Count Distribution in Reviews:")
    print(word_counts_distribution)

    most_common_counts = word_counts_distribution.most_common(10)
    print("\nMost Common Word Counts:")
    print(most_common_counts)
else:
    print("\nError: 'reviews' column not found in the dataset.")


Word Count Distribution in Reviews:
Counter({2: 1294, 4: 453, 3: 253})

Most Common Word Counts:
[(2, 1294), (4, 453), (3, 253)]


In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [23]:
if 'Reviews.Text' in data.columns:
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def preprocess_text(text):

        text = re.sub(r'[^\w\s]', '', str(text).lower())
        words = nltk.word_tokenize(text)
        words = [stemmer.stem(w) for w in words if w not in stop_words and w.isalnum()]
        return " ".join(words)

    data['Reviews.Text'] = data['Reviews.Text'].fillna("")

    data['processed_text'] = data['Reviews.Text'].apply(preprocess_text)
else:
    print("Error: 'Reviews.Text' column not found in the dataset.")

In [24]:
data['char_count'] = data['processed_text'].apply(len)
data['word_count'] = data['processed_text'].apply(lambda x: len(x.split()))


In [25]:
data['avg_word_length'] = data['processed_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if len(x.split()) > 0 else 0)

In [27]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
data['vader_compound'] = data['Reviews.Text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

In [31]:
print(data[['Reviews.Text', 'processed_text', 'char_count', 'word_count', 'avg_word_length', 'vader_compound']].head())

             Reviews.Text     processed_text  char_count  word_count  \
0    Mediocre experience.     mediocr experi          14           2   
1      Worth every penny.  worth everi penni          17           3   
2  Exceeded expectations.      exceed expect          13           2   
3        Amazing quality!       amaz qualiti          12           2   
4              It's fine.               fine           4           1   

   avg_word_length  vader_compound  
0              6.5          0.0000  
1              5.0          0.2263  
2              6.0          0.0000  
3              5.5          0.6239  
4              4.0          0.2023  


In [None]:
# prompt:  Train a Sentiment Prediction Model

import pandas as pd


# Assuming 'data' DataFrame from the previous code block is available

# Prepare the data for sentiment prediction


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [35]:
if 'Sentiment_Category' in data.columns and 'Full_Review' in data.columns:

    X = data['Full_Review']
    y = data['Sentiment_Category']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vec, y_train)

    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

else:
    print("Error: 'Sentiment_Category' or 'Full_Review' columns not found in the dataset.")

Accuracy: 1.0
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00       128
     Neutral       1.00      1.00      1.00       129
    Positive       1.00      1.00      1.00       143

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [36]:
def aspect_based_sentiment(text, aspect):

    tokens = nltk.word_tokenize(text)
    try:
        aspect_index = tokens.index(aspect)
    except ValueError:
        return 'Neutral'
    window_size = 5
    start = max(0, aspect_index - window_size)
    end = min(len(tokens), aspect_index + window_size + 1)
    context = tokens[start:end]

    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(" ".join(context))
    compound_score = scores['compound']

    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

if 'processed_text' in data.columns:
  data['aspect_sentiment'] = data['processed_text'].apply(lambda x: aspect_based_sentiment(x, 'food'))
  print(data[['processed_text', 'aspect_sentiment']].head())
else:
  print("Error: 'processed_text' column not found. Make sure to run the preprocessing steps first.")

      processed_text aspect_sentiment
0     mediocr experi          Neutral
1  worth everi penni          Neutral
2      exceed expect          Neutral
3       amaz qualiti          Neutral
4               fine          Neutral


In [39]:
new_data_path = "/content/amazon_reviews_sample_50.csv"
try:
    new_data = pd.read_csv(new_data_path)

    if 'Reviews.Text' in new_data.columns:
        new_data['Reviews.Text'] = new_data['Reviews.Text'].fillna("")
        new_data['processed_text'] = new_data['Reviews.Text'].apply(preprocess_text)

        new_data['char_count'] = new_data['processed_text'].apply(len)
        new_data['word_count'] = new_data['processed_text'].apply(lambda x: len(x.split()))
        new_data['avg_word_length'] = new_data['processed_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if len(x.split()) > 0 else 0)
        new_data['vader_compound'] = new_data['Reviews.Text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

        if 'Full_Review' in new_data.columns:
            new_X = new_data['Full_Review']
            new_X_vec = vectorizer.transform(new_X)

            new_y_pred = model.predict(new_X_vec)
            new_data['predicted_sentiment'] = new_y_pred

            print("Predictions on new data:")
            print(new_data[['Full_Review', 'predicted_sentiment']].head())

            new_data['aspect_sentiment'] = new_data['processed_text'].apply(lambda x: aspect_based_sentiment(x, 'food'))
            print("\nAspect-Based Sentiment Analysis on new data:")
            print(new_data[['processed_text', 'aspect_sentiment']].head())
        else:
            print("Error: 'Full_Review' column not found in the new dataset.")

    else:
        print("Error: 'Reviews.Text' column not found in the new dataset.")

except FileNotFoundError:
    print(f"Error: File not found at {new_data_path}")
except Exception as e:
    print(f"An error occurred: {e}")

Predictions on new data:
                               Full_Review predicted_sentiment
0       Not bad, not great. Decent Quality             Neutral
1                     It's fine. It's okay             Neutral
2  Exceeded expectations. Highly Recommend            Positive
3                     It's fine. It's okay             Neutral
4       Not worth the price. Not Impressed            Negative

Aspect-Based Sentiment Analysis on new data:
  processed_text aspect_sentiment
0      bad great          Neutral
1           fine          Neutral
2  exceed expect          Neutral
3           fine          Neutral
4    worth price          Neutral
