<a href="https://colab.research.google.com/github/Arpita5188/CBT-CIP/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus that preprocess_text relies on.
nltk.download('stopwords')

# Load the reviews with explicit column names, skipping the file's own header
# row. The default quoting mode is required here: review text is quoted and
# often contains commas, and with QUOTE_NONE those commas are treated as
# field separators, so the affected rows end up with too many fields and are
# silently dropped by on_bad_lines='skip'.
data = pd.read_csv(
    '/content/Reviews.csv',
    names=[
        'Id', 'ProductId', 'UserId', 'ProfileName',
        'HelpfulnessNumerator', 'HelpfulnessDenominator',
        'Score', 'Time', 'Summary', 'Text',
    ],
    skiprows=1,
    on_bad_lines='skip',
)

# Display dataset structure
print(data.head())
# info() prints its report itself and returns None, so it is not wrapped in print().
data.info()

# The review body lives in 'Text' and the star rating in 'Score'.
reviews = data['Text']
ratings = data['Score']

# Data Preprocessing
# Matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
# English stop words; filled in on first use by _english_stop_words().
_STOP_WORDS = None

def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS

def preprocess_text(text):
    """Normalise a review for vectorisation.

    The text is lowercased, every character other than a-z and whitespace is
    removed, and English stop words are dropped. The remaining words are
    joined with single spaces.

    Args:
        text: The raw review. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty review; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned text, or an empty string when nothing remains.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercasing, then removing special characters and numbers
    words = _NON_LETTER_RE.sub('', text.lower()).split()
    if not words:
        return ''

    # Removing stopwords
    stop_words = _english_stop_words()
    return " ".join(word for word in words if word not in stop_words)

# Clean every review and keep the result next to the raw text.
cleaned_reviews = reviews.apply(preprocess_text)
data['CleanedText'] = cleaned_reviews

# Sentiment Labeling
# Assuming ratings 1-2 = Negative, 3 = Neutral, 4-5 = Positive
def label_sentiment(rating):
    """Translate a numeric rating into a sentiment label.

    Returns 'Neutral' for a rating equal to 3, 'Negative' for a rating of 2
    or less, and 'Positive' for every other value.
    """
    if rating == 3:
        return 'Neutral'
    return 'Negative' if rating <= 2 else 'Positive'

# Derive a sentiment label for every review from its rating.
data['Sentiment'] = ratings.apply(label_sentiment)

# Features (cleaned review text) and target (sentiment label)
X = data['CleanedText']
y = data['Sentiment']

# Split the raw text BEFORE vectorising, so the held-out reviews take no part
# in building the vocabulary or the IDF weights. Fitting the vectorizer on
# the full corpus would leak test-set statistics into the training features.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Converting text to numerical features using TF-IDF (fitted on the training split only)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Model Training
# max_iter is raised above the default of 100 so the solver has room to
# converge on the sparse TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Predicting sentiment of new reviews
def predict_sentiment(review):
    """Return the model's predicted label for a single raw review.

    The review is cleaned with preprocess_text, turned into features by the
    module-level ``vectorizer`` and classified by the module-level ``model``.
    """
    features = vectorizer.transform([preprocess_text(review)])
    predictions = model.predict(features)
    # Exactly one review was passed in, so the first prediction is the answer.
    return predictions[0]

# Try the classifier on a hand-written review and show the outcome.
new_review = "This product is amazing! It works like a charm."
predicted_sentiment = predict_sentiment(new_review)
print(
    f"Review: {new_review}",
    f"Predicted Sentiment: {predicted_sentiment}",
    sep="\n",
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Id   ProductId          UserId    ProfileName  HelpfulnessNumerator  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW     delmartian                     1   
1   2  B00813GRG4  A1D87F6ZCVE5NK         dll pa                     0   
2   4  B000UA0QIQ  A395BORC6FGVXV           Karl                     3   
3   9  B000E7L2R4  A1MZYO9TZK0BBI       R. James                     1   
4  10  B00171APVA  A21BT40VZCCYT4  Carol A. Reed                     0   

   HelpfulnessDenominator  Score        Time                Summary  \
0                       1      5  1303862400  Good Quality Dog Food   
1                       0      1  1346976000      Not as Advertised   
2                       3      2  1307923200         Cough Medicine   
3                       1      5  1322006400             Yay Barley   
4                       0      5  1351209600       Healthy Dog Food   

                                                Text  
0  I have bought several of the Vitality canned d...  
1  "Product arrive