In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
import pickle
import pandas as pd

In [3]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [4]:
# Load and prepare data
df = pd.read_csv("all_kindle_review .csv")
df = df[["reviewText", "rating"]]

In [5]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [6]:
# Convert to binary sentiment (0 for negative, 1 for positive)
df['rating'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)

In [7]:
# Enhanced text preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers (keeping basic punctuation)
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)
    
    # Expand contractions
    contractions = {
        "won't": "will not", "can't": "cannot", "n't": " not",
        "'re": " are", "'s": " is", "'d": " would",
        "'ll": " will", "'t": " not", "'ve": " have",
        "'m": " am"
    }
    for cont, exp in contractions.items():
        text = text.replace(cont, exp)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()
    
    # Handle negations
    text = re.sub(r'\b(not|no|never)\s+(\w+)', r'\1_\2', text)
    
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join back to string
    text = ' '.join(tokens)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['cleaned_review'] = df['reviewText'].apply(preprocess_text)

In [8]:
# Check class balance
print("Class distribution:\n", df['rating'].value_counts())

Class distribution:
 rating
1    8000
0    4000
Name: count, dtype: int64


In [9]:
df.head()

Unnamed: 0,reviewText,rating,cleaned_review
0,"Jace Rankin may be short, but he's nothing to ...",1,"jace rankin may short , he nothing mess , man ..."
1,Great short read. I didn't want to put it dow...,1,great short read . didnt want put read one sit...
2,I'll start by saying this is the first of four...,1,ill start saying first four book wasnt expecti...
3,Aggie is Angela Lansbury who carries pocketboo...,1,aggie angela lansbury carry pocketbook instead...
4,I did not expect this type of book to be in li...,1,not_expect type book library pleased find pric...


In [10]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'], df['rating'], test_size=0.2, random_state=42
)

In [11]:
# Feature extraction with TF-IDF and n-grams
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    max_features=10000,
    min_df=5,
    max_df=0.7
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_tfidf, y_train)

In [13]:
# Model Training - Ensemble Approach
# Logistic Regression
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    C=10,
    solver='liblinear'
)

In [14]:
# Support Vector Machine
svm = SVC(
    probability=True,
    kernel='linear',
    C=1,
    class_weight='balanced'
)

In [15]:
# Random Forest
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced_subsample',
    max_depth=None,
    min_samples_split=5,
    random_state=42
)

In [16]:
# Create ensemble model
ensemble = VotingClassifier(
    estimators=[('lr', lr), ('svm', svm), ('rf', rf)],
    voting='soft'  # uses predicted probabilities
)

In [17]:
# Train models
print("Training Logistic Regression...")
lr.fit(X_train_res, y_train_res)
print("Training SVM...")
svm.fit(X_train_res, y_train_res)
print("Training Random Forest...")
rf.fit(X_train_res, y_train_res)
print("Training Ensemble...")
ensemble.fit(X_train_res, y_train_res)

Training Logistic Regression...
Training SVM...
Training Random Forest...
Training Ensemble...


In [18]:
# Evaluation function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"\nModel: {type(model).__name__}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [19]:
# Evaluate all models
print("\nModel Evaluation:")
evaluate_model(lr, X_test_tfidf, y_test)
evaluate_model(svm, X_test_tfidf, y_test)
evaluate_model(rf, X_test_tfidf, y_test)
evaluate_model(ensemble, X_test_tfidf, y_test)


Model Evaluation:

Model: LogisticRegression
Accuracy: 0.8616666666666667
Confusion Matrix:
 [[ 638  165]
 [ 167 1430]]

Model: SVC
Accuracy: 0.85625
Confusion Matrix:
 [[ 647  156]
 [ 189 1408]]

Model: RandomForestClassifier
Accuracy: 0.83625
Confusion Matrix:
 [[ 576  227]
 [ 166 1431]]

Model: VotingClassifier
Accuracy: 0.8670833333333333
Confusion Matrix:
 [[ 632  171]
 [ 148 1449]]


In [33]:
# Save the best model (ensemble) and vectorizer
with open('best3_model.pkl', 'wb') as f:
    pickle.dump(ensemble, f)
    
with open('tfidf_vectorizer_3.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [34]:
# Example prediction
def predict_sentiment(text):
    # Load model and vectorizer
    with open('best3_model.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('tfidf_vectorizer_3.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    
    # Preprocess and vectorize
    cleaned_text = preprocess_text(text)
    text_vector = vectorizer.transform([cleaned_text])
    
    # Predict
    prediction = model.predict(text_vector)
    probability = model.predict_proba(text_vector)
    
    sentiment = "Positive" if prediction[0] == 1 else "Negative"
    confidence = max(probability[0])
    
    return f"Sentiment: {sentiment} (Confidence: {confidence:.2f})"

In [35]:
# Test prediction
sample_text = "one gave hope series first little bit cheesy one funnyand got interested rest series book still feel little short overall good read"
print("\nSample Prediction:")
print(predict_sentiment(sample_text))


Sample Prediction:
Sentiment: Positive (Confidence: 0.95)
