In [3]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Load dataset from CSV
file_path = r"C:\Users\Aravinnth\Desktop\cdwintern\all_kindle_review.csv"  # Update with your dataset path
df = pd.read_csv(file_path)

# Ensure the dataset has the required columns
df = df[['reviewText', 'rating']]
df.columns = ['text', 'label']  # Rename columns to match the expected names

# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

df['tokens'] = df['text'].apply(preprocess_text)

# Train Word2Vec
tokenized_texts = df['tokens'].tolist()
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, sg=0)

# Train TF-IDF
tfidf = TfidfVectorizer()
tfidf.fit(df['text'])
tfidf_vocab = tfidf.vocabulary_
idf_values = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

# Compute TF-IDF weighted Word2Vec sentence embeddings
def get_tfidf_weighted_vector(tokens, w2v_model, idf_values):
    weighted_vectors = []
    for token in tokens:
        if token in w2v_model.wv and token in idf_values:
            weighted_vectors.append(w2v_model.wv[token] * idf_values[token])
    return np.mean(weighted_vectors, axis=0) if weighted_vectors else np.zeros(w2v_model.vector_size)

df['vector'] = df['tokens'].apply(lambda x: get_tfidf_weighted_vector(x, w2v_model, idf_values))

# Prepare data
X = np.vstack(df['vector'].values)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate multiple models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

best_model_name = None
best_accuracy = 0

print("Model Evaluation Results:\n")
for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    
    # Update best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = model_name

# Output the best model
print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aravinnth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aravinnth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Evaluation Results:

Random Forest Accuracy: 0.4108
SVM Accuracy: 0.4496
Logistic Regression Accuracy: 0.4442

Best Model: SVM with Accuracy: 0.4496


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Load dataset from CSV
file_path = r"C:\Users\Aravinnth\Desktop\cdwintern\all_kindle_review.csv"  # Update with your dataset path
df = pd.read_csv(file_path)

# Ensure the dataset has the required columns
df = df[['reviewText', 'rating']]
df.columns = ['text', 'label']  # Rename columns to match the expected names

# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

df['tokens'] = df['text'].apply(preprocess_text)

# Train Word2Vec
tokenized_texts = df['tokens'].tolist()
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, sg=0)

# Train TF-IDF
tfidf = TfidfVectorizer()
tfidf.fit(df['text'])
tfidf_vocab = tfidf.vocabulary_
idf_values = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

# Compute TF-IDF weighted Word2Vec sentence embeddings
def get_tfidf_weighted_vector(tokens, w2v_model, idf_values):
    weighted_vectors = []
    for token in tokens:
        if token in w2v_model.wv and token in idf_values:
            weighted_vectors.append(w2v_model.wv[token] * idf_values[token])
    return np.mean(weighted_vectors, axis=0) if weighted_vectors else np.zeros(w2v_model.vector_size)

df['vector'] = df['tokens'].apply(lambda x: get_tfidf_weighted_vector(x, w2v_model, idf_values))

from sklearn.preprocessing import MinMaxScaler

# Prepare data
X = np.vstack(df['vector'].values)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data to be non-negative
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Evaluate multiple models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': MultinomialNB()
}

best_model_name = None
best_accuracy = 0

print("Model Evaluation Results:\n")
for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    
    # Update best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = model_name

# Output the best model
print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aravinnth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aravinnth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Evaluation Results:

Random Forest Accuracy: 0.4004
SVM Accuracy: 0.4400
Logistic Regression Accuracy: 0.4333
Naive Bayes Accuracy: 0.3346

Best Model: SVM with Accuracy: 0.4400


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
