<a href="https://colab.research.google.com/github/ColleyMo/AI-project/blob/main/sentiment-analysis1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from wordcloud import WordCloud
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch
import torch.nn as nn
import nltk
# Fetch the VADER lexicon resource; the SentimentIntensityAnalyzer created later
# in this notebook needs it (the call is a no-op when it is already present).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Importing the data

In [5]:
# Read adidasdata.csv into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv('adidasdata.csv')



In [6]:
# Show (number of rows, number of columns) of the loaded frame
df.shape

(1106, 10)

# Sentiment analysis with VADER

In [7]:
# Shared VADER analyzer instance, reused by every scoring cell below
sie = SentimentIntensityAnalyzer()

In [8]:
# Sanity check: score a clearly positive sentence (returns neg/neu/pos/compound)
sie.polarity_scores('I love this shoe')

{'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369}

In [9]:
# Sanity check: score a clearly negative sentence (returns neg/neu/pos/compound)
sie.polarity_scores("I hate this shoe, i cant walk in it")

{'neg': 0.381, 'neu': 0.619, 'pos': 0.0, 'compound': -0.5719}

In [10]:
# Force every review to text so it can be scored and tokenized.
# Missing values are replaced with an empty string first: a bare astype(str)
# would turn NaN into the literal word 'nan', which would then be treated
# as if it were a real review by the analyzer and the tokenizer.
df['Reviews'] = df['Reviews'].fillna('').astype(str)

In [11]:
# Score each review with VADER and keep only the 'compound' value per row
df['PolarityScores'] = df['Reviews'].map(lambda review: sie.polarity_scores(review)['compound'])

In [12]:
# Preview the first rows, including the newly added PolarityScores column
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Price,ColoursAvailable,ReviewTopic,Reviews,UserID,Date,VerifiedPurchaser,IncentivizedReview,PolarityScores
0,0,Samba OG Shoes,100,6,Nice quality shoes,They match practically with any outfit that I...,abdubs35,"September 13, 2023",False,False,0.0
1,1,Samba OG Shoes,100,6,Nice shoes,"Very nice shoes , just not as green as the pic...",THEMAN,"September 13, 2023",False,True,0.4754
2,2,Samba OG Shoes,100,6,Buy the shoes!,"Bang on trend, comfy and cool. Would recommend...",Huggsy,"September 12, 2023",True,True,0.5859
3,3,Samba OG Shoes,100,6,ALL GOODS SOLID! WORTH THE PRICE!,THAT WAS DOPE! medyo mahaba lng ng very little...,TOYOTABOY,"September 12, 2023",False,False,-0.4997
4,4,Samba OG Shoes,100,6,Never out of style,Love how light on your feet they are and comfo...,SangeBo,"September 12, 2023",False,False,0.8176


# Fine-tuning


In [25]:
# Seed PyTorch so the new classifier head and the batch shuffling are repeatable
torch.manual_seed(42)

# Load pre-trained BERT tokenizer and model.
# The classification head gets three outputs: 0 = negative, 1 = neutral, 2 = positive.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Tokenize and encode the text data (padded to the longest review, truncated
# to the model's maximum input length)
encoded_data = tokenizer(df['Reviews'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Build integer class ids from the VADER compound scores in 'PolarityScores'
# (computed earlier in the notebook). Casting the raw score with astype(int)
# would truncate it to -1/0/1, and -1 is not a valid class index, so the score
# is bucketed instead, using the same +/-0.2 cut-offs as the classification section.
compound = df['PolarityScores'].to_numpy()
label_ids = np.select([compound < -0.2, compound > 0.2], [0, 2], default=1)

# Create a TensorDataset for training (labels must be int64 class indices)
dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'],
                        torch.tensor(label_ids, dtype=torch.long))

# Define DataLoader for batching and shuffling data
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Define loss function and optimizer.
# NOTE: the loop below uses the loss the model computes itself when `labels`
# is passed; `loss_fn` is kept only so the name remains available.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# from_pretrained() returns the model in evaluation mode; enable dropout for training
model.train()

# Fine-tuning BERT on the sentiment analysis task
for epoch in range(1):  # Adjust number of epochs as needed
    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Classification


In [26]:
# Store the VADER compound score of every review in a 'SentimentScores' column
df['SentimentScores'] = [sie.polarity_scores(review)['compound'] for review in df['Reviews']]

# NOTE(review): X and y are reassigned in the train-test split cell further down
# before anything reads them, so these two assignments currently have no effect.
X = df[['SentimentScores']].values  # single-column feature matrix of compound scores
y = df['ReviewTopic'].values  # review topic text of each row

In [27]:
# Cut-offs on the polarity score: above the first is positive, below the second is negative
positive_threshold = 0.2
negative_threshold = -0.2


def classify_sentiment(score):
    """Map a polarity score to a sentiment label.

    Returns 'Positive' when ``score`` is strictly greater than
    ``positive_threshold``, 'Negative' when it is strictly less than
    ``negative_threshold``, and 'Neutral' otherwise (including scores that
    sit exactly on either threshold).
    """
    if score > positive_threshold:
        return 'Positive'
    if score < negative_threshold:
        return 'Negative'
    return 'Neutral'

# Label every row by passing its polarity score through classify_sentiment
df['Sentiment'] = df['PolarityScores'].map(classify_sentiment)

In [28]:
# Assemble features and target.
# NOTE(review): 'Sentiment' is derived from 'PolarityScores' by fixed thresholds,
# so the target is a deterministic function of the only feature; the scores
# reported below measure how well those thresholds are recovered, not how well
# sentiment is predicted from review text.
X = df[['PolarityScores']].values  # single-column feature matrix of polarity scores
y = df['Sentiment'].values  # sentiment label of each row

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression model on the training partition
classifier = LogisticRegression()
classifier.fit(X_train, y_train)


In [29]:
# Predicted sentiment labels for both partitions (training first, then test)
y_pred_train, y_pred_test = [classifier.predict(features) for features in (X_train, X_test)]

In [30]:
def _weighted_scores(y_true, y_pred):
    """Return (precision, recall, F1) for the labels, each computed with average='weighted'."""
    return (
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )


def _print_scores(heading, precision, recall, f1):
    """Print the heading followed by the three scores, one per line."""
    print(heading)
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")


# Scores on the training partition, then on the held-out test partition
train_precision, train_recall, train_f1_score = _weighted_scores(y_train, y_pred_train)
test_precision, test_recall, test_f1_score = _weighted_scores(y_test, y_pred_test)

# Report both sets of scores, separated by a blank line
_print_scores("Training Set Metrics:", train_precision, train_recall, train_f1_score)
print()
_print_scores("Test Set Metrics:", test_precision, test_recall, test_f1_score)


Training Set Metrics:
Precision: 0.9844537991404319
Recall: 0.9841628959276018
F1-Score: 0.9838292271995323

Test Set Metrics:
Precision: 0.9910143303925687
Recall: 0.990990990990991
F1-Score: 0.9908895908895909
