<a href="https://colab.research.google.com/github/3m6d/ML-techniques-practise/blob/main/AISentimentAnalysisEn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from google.colab import drive
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Fetch the NLTK resources used by this notebook: the 'punkt' tokenizer data,
# the stopword lists and the WordNet corpus.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Mount Google Drive at /content/drive so the dataset file below is reachable.
drive.mount('/content/drive')

# `lines=True` reads the file as JSON Lines: one JSON object per line, one row each.
df = pd.read_json('/content/drive/MyDrive/Beauty.jsonl', lines=True)

In [None]:
# Preview the first rows of the loaded reviews to see which columns are available.
df.head()

Deriving a binary Sentiment column from the rating column (neutral 3-star reviews are dropped; 4–5 stars become 1, 1–2 stars become 0)

In [None]:

# Build the labelled review table as one chained expression. Every step returns
# a new frame, so nothing is modified in place on a filtered slice (the pattern
# that makes pandas emit SettingWithCopyWarning).
df = (
    # Remove any 'neutral' ratings equal to 3
    df[df['rating'] != 3]
    # Drop every row that has a missing value in any column
    .dropna()
    # Encode 4s and 5s as 1 (positive sentiment) and 1s and 2s as 0 (negative sentiment)
    .assign(Sentiment=lambda reviews: np.where(reviews['rating'] > 3, 1, 0))
)

In [None]:

# Shape of the sub-frame holding the 1- and 2-star reviews; its first entry is
# the number of such reviews.
print("The Number of Reviews less than rating 3")
df.loc[df['rating'] < 3].shape

In [None]:

# Shape of the sub-frame holding the 4- and 5-star reviews; its first entry is
# the number of such reviews.
print("The Number of Reviews greater than 3")
df.loc[df['rating'] > 3].shape

In [None]:
# Report the (rows, columns) size of the labelled frame, then plot how the
# remaining star ratings are distributed.
print("The Size of Dataset", df.shape)
print('Distribution of Positive and Negative Reviews, Three being the threshold')
df.hist(column='rating')

In [None]:

# Model inputs are the review texts; targets are the binary Sentiment labels.
X, y = df['text'], df['Sentiment']

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer


# Stopwords that flip the meaning of a review ("not good") and therefore stay in the text.
_NEGATION_WORDS = frozenset({'not', 'no'})
# Filled on first use so the NLTK stopword list is read once, not once per review.
_STOPWORD_CACHE = {}


def _sentiment_stopwords():
    """Return the English stopword set without the negation words, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english")) - _NEGATION_WORDS
    return _STOPWORD_CACHE['english']


def cleanText(raw_text, remove_stopwords=True, stemming=False, split_text=False):
    '''
    Convert a raw review to a cleaned review.

    Steps, in order: strip HTML markup, replace every character that is not an
    ASCII letter with a space, lower-case, split on whitespace, optionally drop
    stopwords, optionally stem.

    Parameters
    ----------
    raw_text : str
        Review text, possibly containing HTML.
    remove_stopwords : bool
        Drop NLTK English stopwords. 'not' and 'no' are always kept.
    stemming : bool
        Stem every remaining word with the English Snowball stemmer.
    split_text : bool
        Return the list of words instead of one space-joined string.

    Returns
    -------
    str or list of str
        The cleaned review, joined by single spaces unless `split_text` is set.
    '''
    text = BeautifulSoup(raw_text, 'lxml').get_text()  # remove html
    letters_only = re.sub("[^a-zA-Z]", " ", text)  # keep ASCII letters only
    words = letters_only.lower().split()

    if remove_stopwords:
        # Set difference (instead of .remove) cannot fail if a negation word is
        # missing from the installed stopword list.
        stops = _sentiment_stopwords()
        words = [w for w in words if w not in stops]
    if stemming:
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]

    if split_text:
        return words
    return " ".join(words)

In [None]:
import nltk
nltk.download('stopwords')

# Run every review text through cleanText with its default options
# (stopword removal on, stemming off, space-joined string result).
X_cleaned = [cleanText(review) for review in X]

In [None]:
# Binary bag-of-words over unigrams and bigrams; terms that occur in fewer than
# 50 reviews are left out of the vocabulary.
countVect = CountVectorizer(
    min_df=50,
    ngram_range=(1, 2),
    strip_accents='unicode',
    binary=True,
)
X_all_countVect = countVect.fit_transform(X_cleaned)

feature_names = countVect.get_feature_names_out()
print("Number of features : %d \n" % len(feature_names))
# Every 1000th vocabulary entry is enough for a quick look.
print("Show some feature names : \n", feature_names[::1000])

In [None]:
# Hold out 20% of the vectorized reviews for testing. `stratify=y` keeps the
# positive/negative ratio the same in both partitions, and the fixed
# `random_state` makes the split repeatable.
X_train_countVect, X_test_countVect, y_train, y_test = train_test_split(
    X_all_countVect, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
def knn_classifier(X_train_countVect,y_train,X_test_countVect,y_test,target):
    """Fit a 5-nearest-neighbours classifier and print test and train diagnostics.

    Parameters
    ----------
    X_train_countVect, y_train
        Training features and labels.
    X_test_countVect, y_test
        Held-out features and labels.
    target
        Accepted so the signature matches the other *_classifier helpers;
        not used by this function.

    Returns
    -------
    float
        Accuracy on the test split.
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(X_train_countVect, y_train)

    y_pred = classifier.predict(X_test_countVect)
    y_pred_train = classifier.predict(X_train_countVect)

    # `accuracy_score` is imported directly at the top of the notebook; no
    # `metrics` module is ever imported, so `metrics.accuracy_score` would fail.
    test_accuracy = accuracy_score(y_test, y_pred)

    print('KNN Results:')
    print("KNN Accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix", confusion_matrix(y_test, y_pred))
    print("KNN Train Accuracy:", accuracy_score(y_train, y_pred_train))
    print(classification_report(y_train, y_pred_train))

    return test_accuracy

In [None]:
def svc_classifier(X_train_countVect,y_train,X_test_countVect,y_test,target_names):
    """Fit a linear-kernel SVC and print test and train diagnostics.

    Parameters
    ----------
    X_train_countVect, y_train
        Training features and labels.
    X_test_countVect, y_test
        Held-out features and labels.
    target_names
        Display names handed to `classification_report`.

    Returns
    -------
    float
        Accuracy on the test split.
    """
    # Imported locally so that `svm` is the scikit-learn module inside this
    # function even when a module-level variable called `svm` exists.
    from sklearn import svm

    clf = svm.SVC(kernel='linear')
    clf.fit(X_train_countVect, y_train)

    y_pred = clf.predict(X_test_countVect)
    y_pred_train = clf.predict(X_train_countVect)

    # `accuracy_score` is imported directly at the top of the notebook; no
    # `metrics` module is ever imported, so `metrics.accuracy_score` would fail.
    test_accuracy = accuracy_score(y_test, y_pred)

    print('SVM Results:')
    print("SVM Accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("Confusion Matrix", confusion_matrix(y_test, y_pred))
    print("SVM Train Accuracy:", accuracy_score(y_train, y_pred_train))
    print(classification_report(y_train, y_pred_train, target_names=target_names))

    return test_accuracy


In [None]:

# Naive Bayes classifier
def nb_classifier(X_train_countVect,y_train,X_test_countVect,y_test,target_names):
    """Fit a multinomial Naive Bayes model and print test and train diagnostics.

    Parameters
    ----------
    X_train_countVect, y_train
        Training features and labels.
    X_test_countVect, y_test
        Held-out features and labels.
    target_names
        Display names handed to `classification_report`.

    Returns
    -------
    float
        Accuracy on the test split.
    """
    clf = MultinomialNB()
    # Fit on the matrix as given: MultinomialNB accepts sparse input, so there
    # is no need to densify it (which can exhaust memory on a large vocabulary).
    clf.fit(X_train_countVect, y_train)

    y_pred = clf.predict(X_test_countVect)
    y_pred_train = clf.predict(X_train_countVect)

    # `accuracy_score` is imported directly at the top of the notebook; no
    # `metrics` module is ever imported, so `metrics.accuracy_score` would fail.
    test_accuracy = accuracy_score(y_test, y_pred)

    print('NB Results:')
    print("MNB Accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("Confusion Matrix", confusion_matrix(y_test, y_pred))
    print("MNB Train Accuracy:", accuracy_score(y_train, y_pred_train))
    print(classification_report(y_train, y_pred_train, target_names=target_names))

    return test_accuracy

In [None]:

# Logistic Regression
def lr_classifier(X_train_countVect,y_train,X_test_countVect,y_test,target_names):
    """Fit a default LogisticRegression and print test and train diagnostics.

    Parameters
    ----------
    X_train_countVect, y_train
        Training features and labels.
    X_test_countVect, y_test
        Held-out features and labels.
    target_names
        Display names handed to `classification_report`.

    Returns
    -------
    float
        Accuracy on the test split.
    """
    lr = LogisticRegression()
    # Fit on the matrix as given: LogisticRegression accepts sparse input, so
    # there is no need to densify it (which can exhaust memory on a large vocabulary).
    lr.fit(X_train_countVect, y_train)

    y_pred = lr.predict(X_test_countVect)
    y_pred_train = lr.predict(X_train_countVect)

    # `accuracy_score` is imported directly at the top of the notebook; no
    # `metrics` module is ever imported, so `metrics.accuracy_score` would fail.
    test_accuracy = accuracy_score(y_test, y_pred)

    print('LR Results:')
    print("LR Accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("Confusion Matrix", confusion_matrix(y_test, y_pred))
    print("LR Train Accuracy:", accuracy_score(y_train, y_pred_train))
    print(classification_report(y_train, y_pred_train, target_names=target_names))

    return test_accuracy

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler


# Per-fold test accuracies, one list per model.
knn = []
svm = []
dt = []
nb = []
lr = []

# Balance the two classes by randomly discarding majority-class reviews.
rus = RandomUnderSampler(random_state=777)
X_RUS, y_RUS = rus.fit_resample(X_all_countVect, y)

# The fold indices produced below are positions, so index the labels through a
# plain array; `Series[int_array]` would look the integers up as index labels.
y_RUS_values = np.asarray(y_RUS)

# classification_report pairs names with the labels in sorted order, and
# Sentiment encodes 0 = negative, 1 = positive.
target_names = ['Negative', 'Positive']

skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in skf.split(X_RUS, y_RUS_values):
    X_train_countVect = X_RUS[train_index]
    y_train = y_RUS_values[train_index]
    X_test_countVect = X_RUS[test_index]
    y_test = y_RUS_values[test_index]

    knn.append(knn_classifier(X_train_countVect, y_train, X_test_countVect, y_test, target_names))
    nb.append(nb_classifier(X_train_countVect, y_train, X_test_countVect, y_test, target_names))
    lr.append(lr_classifier(X_train_countVect, y_train, X_test_countVect, y_test, target_names))
    svm.append(svc_classifier(X_train_countVect, y_train, X_test_countVect, y_test, target_names))

# Mean accuracy over all folds, reported once after the loop. No decision-tree
# model is evaluated above, so `dt` is empty and must not be averaged
# (sum(dt)/len(dt) would divide by zero).
for model_label, fold_scores in (('KNN', knn), ('SVM', svm), ('DT', dt), ('MNB', nb), ('LR', lr)):
    if fold_scores:
        print('The Accuracy for %s:' % model_label, sum(fold_scores) / len(fold_scores))
    else:
        print('The Accuracy for %s: not evaluated' % model_label)

In [None]:
def preprocess_and_vectorize(X_train, X_test, vectorizer):
    """Fit `vectorizer` on the training texts, then transform both splits with it.

    The vectorizer is fitted on `X_train` only; `X_test` is transformed with the
    already-fitted vectorizer. Returns `(train_features, test_features)`.
    """
    train_features = vectorizer.fit_transform(X_train)
    return train_features, vectorizer.transform(X_test)

In [None]:
# Second hold-out split, this time on the review strings in `X` rather than the
# count-vectorized matrix. 20% of the rows go to the test set and the fixed
# `random_state` keeps the split repeatable. Rebinds y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# TF-IDF features with the vocabulary capped at 5000 terms. The helper fits the
# vectorizer on the training texts only and then transforms both splits.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = preprocess_and_vectorize(X_train, X_test, tfidf)

In [None]:
# Function to evaluate model performance
def evaluate_model(y_true, y_pred, y_probs=None):
    """Collect precision, recall and F1 (plus ROC-AUC when scores are given).

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    y_probs : optional scores for the positive class; when provided, the result
        also contains a 'ROC-AUC' entry.

    Returns
    -------
    dict mapping metric name to value.
    """
    scores = {}
    scores['Precision'] = precision_score(y_true, y_pred)
    scores['Recall'] = recall_score(y_true, y_pred)
    scores['F1-Score'] = f1_score(y_true, y_pred)
    if y_probs is None:
        return scores
    scores['ROC-AUC'] = roc_auc_score(y_true, y_probs)
    return scores

In [None]:
# 1. Naive Bayes Classifier
# fit() returns the fitted estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
# Column 1 of predict_proba is the probability of the second class label.
nb_probs = nb_model.predict_proba(X_test_tfidf)[:, 1]
nb_preds = nb_model.predict(X_test_tfidf)
print("Naive Bayes Metrics:", evaluate_model(y_test, nb_preds, nb_probs))

In [None]:
# 2. Support Vector Machines (SVM)
# probability=True lets the SVC expose predict_proba, which ROC-AUC needs.
svm_model = SVC(kernel='linear', probability=True, random_state=42).fit(X_train_tfidf, y_train)
# Column 1 of predict_proba is the probability of the second class label.
svm_probs = svm_model.predict_proba(X_test_tfidf)[:, 1]
svm_preds = svm_model.predict(X_test_tfidf)
print("SVM Metrics:", evaluate_model(y_test, svm_preds, svm_probs))

In [None]:
# 3. Logistic Regression
# fit() returns the fitted estimator, so construction and training are chained.
log_reg_model = LogisticRegression(random_state=42).fit(X_train_tfidf, y_train)
# Column 1 of predict_proba is the probability of the second class label.
log_reg_probs = log_reg_model.predict_proba(X_test_tfidf)[:, 1]
log_reg_preds = log_reg_model.predict(X_test_tfidf)
print("Logistic Regression Metrics:", evaluate_model(y_test, log_reg_preds, log_reg_probs))

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

# Function to preprocess and vectorize data
def preprocess_and_vectorize(X_train, X_test, vectorizer):
    """Return `(train_features, test_features)` produced by `vectorizer`.

    `fit_transform` is applied to `X_train` first, then `transform` to `X_test`,
    so the vectorizer is fitted on the training texts only.
    """
    fitted_train = vectorizer.fit_transform(X_train)
    projected_test = vectorizer.transform(X_test)
    return (fitted_train, projected_test)

# Function to get Word2Vec features
def get_w2v_features(tokens, model, vector_size):
    """Average the Word2Vec vectors of each sentence's known words.

    Parameters
    ----------
    tokens : iterable of token lists, one list per sentence.
    model : object exposing `model.wv.key_to_index` (vocabulary lookup) and
        `model.wv[word]` (vector lookup).
    vector_size : length of the word vectors stored in `model`.

    Returns
    -------
    numpy.ndarray of shape (number of sentences, vector_size)
        Row i is the mean vector of the in-vocabulary words of sentence i, or
        all zeros when the sentence has none. The feature axis is present even
        when `tokens` is empty, i.e. the shape is then (0, vector_size).
    """
    sentences = list(tokens)
    # Preallocating the 2-D result keeps the feature axis for empty input.
    features = np.zeros((len(sentences), vector_size))
    vocabulary = model.wv.key_to_index

    # Iterating a 2-D array yields row views, so the in-place updates below
    # write straight into `features`.
    for row, sentence in zip(features, sentences):
        known_words = 0
        for word in sentence:
            if word in vocabulary:
                row += model.wv[word]
                known_words += 1
        if known_words:
            row /= known_words
    return features

# Function to evaluate model
def evaluate_model(model, X_test_vec, y_test_enc):
    """Print accuracy, classification report and confusion matrix; return the accuracy.

    `model` must already be fitted. `X_test_vec` are the test features and
    `y_test_enc` the matching encoded labels. This definition rebinds the name
    `evaluate_model`, which an earlier cell of the notebook also defines with a
    different signature.
    """
    predictions = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test_enc, predictions)
    print(f"Accuracy: {accuracy:.2f}\n")

    detail_sections = (
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for heading, summarize in detail_sections:
        print(heading, summarize(y_test_enc, predictions))

    print("-" * 50)
    return accuracy

# Step 1: Create Dummy Data
# Each entry pairs a review text with its hand-assigned sentiment label, so the
# two stay visibly aligned.
labelled_reviews = [
    ("I love this product, it's amazing!", "positive"),
    ("This is the worst experience I've ever had.", "negative"),
    ("Absolutely fantastic! Highly recommend.", "positive"),
    ("Not great, would not buy again.", "negative"),
    ("The quality is superb, really satisfied.", "positive"),
    ("Terrible, broke after one use.", "negative"),
    ("Decent product for the price.", "neutral"),
    ("Awful customer service, very disappointed.", "negative"),
    ("Excellent value for money, very happy.", "positive"),
    ("It's okay, nothing special but works fine.", "neutral"),
]
data = {
    "text": [text for text, _ in labelled_reviews],
    "sentiment": [label for _, label in labelled_reviews],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Preprocessing
# Inputs are the raw sentences, targets the string sentiment labels.
X, y = df['text'], df['sentiment']

# Splitting the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Encoding labels for consistency
# Fit the encoder on the complete label column `y`, not only on the training
# labels: a class that happens to land exclusively in the test split would
# otherwise make `transform(y_test)` fail with an unseen-label error.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Both text vectorizers use the same settings: at most 500 features, built from
# unigrams and bigrams.
vectorizer_options = {"max_features": 500, "ngram_range": (1, 2)}

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
X_train_tfidf, X_test_tfidf = preprocess_and_vectorize(X_train, X_test, tfidf_vectorizer)

# Bag of Words (BoW) Vectorization
bow_vectorizer = CountVectorizer(**vectorizer_options)
X_train_bow, X_test_bow = preprocess_and_vectorize(X_train, X_test, bow_vectorizer)

# Word2Vec Vectorization
# Sentences are tokenized by plain whitespace splitting (no lower-casing and no
# punctuation removal).
X_train_tokens = [review.split() for review in X_train]
X_test_tokens = [review.split() for review in X_test]

# The embeddings are trained on the training sentences only; sg=1 selects the
# skip-gram training algorithm.
w2v_dim = 100
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=w2v_dim,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
X_train_w2v, X_test_w2v = (
    get_w2v_features(split_tokens, w2v_model, vector_size=w2v_dim)
    for split_tokens in (X_train_tokens, X_test_tokens)
)

# Step 3: Model Training and Evaluation
# One (train, test) feature pair per text representation.
methods = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf),
    "Bag of Words": (X_train_bow, X_test_bow),
    "Word2Vec": (X_train_w2v, X_test_w2v)
}

models = {}
results = {}

for method in methods:
    X_train_vec, X_test_vec = methods[method]
    print(f"Evaluating with {method}...")
    # fit() returns the fitted estimator, so construction and training are chained.
    model = LogisticRegression(max_iter=200).fit(X_train_vec, y_train_enc)
    accuracy = evaluate_model(model, X_test_vec, y_test_enc)
    models[method] = model
    results[method] = accuracy

# Compare Results
print("\nComparison of Accuracy:")
for method in results:
    accuracy = results[method]
    print(f"{method}: {accuracy:.2f}")

# Step 4: Predict Sentiment for User Input
print("\nEnter sentences to predict their sentiment:")
while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still quit.
        sentence = input("Enter a sentence (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input available, or the user interrupted: stop instead of
        # ending the cell with a traceback.
        break
    if sentence.lower() == 'exit':
        break
    if not sentence:
        # Nothing to classify; prompt again.
        continue

    # Vectorize the sentence once per representation, with the vectorizers and
    # embedding model fitted during training.
    sentence_vecs = {
        "TF-IDF": tfidf_vectorizer.transform([sentence]),
        "Bag of Words": bow_vectorizer.transform([sentence]),
        "Word2Vec": get_w2v_features([sentence.split()], w2v_model, vector_size=100)
    }

    print("Predictions:")
    for method, vec in sentence_vecs.items():
        pred_label = models[method].predict(vec)
        # Map the encoded class back to its original string label.
        sentiment = label_encoder.inverse_transform(pred_label)
        print(f"{method}: {sentiment[0]}")
    print("-" * 50)
