In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib
import re

# Load the dataset
df = pd.read_csv('dataset/file.csv', encoding='latin1')

# Drop every row that has a missing value in any column
df = df.dropna()

# Map the label column values
label_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Normalise case and surrounding whitespace first so that variants such as
# 'Positive' or ' neutral ' are still recognised by the mapping.
normalized_labels = df['label'].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(label_mapping)

# Series.map turns every value that is not a key of the dict into NaN. Because
# the dropna() above has already run, such NaN targets would otherwise flow
# unnoticed into the train/test split and the classifier, so fail here with a
# message that names the offending values.
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df.loc[unmapped_rows, 'label'].astype(str).unique())
    raise ValueError(
        f"Unrecognised value(s) in 'label' column: {unknown_labels}; "
        f"expected one of {sorted(label_mapping)}"
    )

# Apply the mapping (all values are known at this point, so an integer dtype is safe)
df['label'] = encoded_labels.astype('int64')

In [27]:
# Define cleaning functions
def remove_urls(text):
    """Delete URL-like fragments from ``text``.

    Any run of non-whitespace characters that begins with ``http`` or ``www``
    is removed, wherever in the string it starts. Surrounding whitespace is
    left untouched.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is 128 or higher dropped."""
    kept_chars = []
    for ch in text:
        # Code points 0..127 are the ASCII range
        if ord(ch) <= 127:
            kept_chars.append(ch)
    return ''.join(kept_chars)

def remove_digits(text):
    """Delete every run of digit characters from ``text``; nothing is inserted in their place."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_special_characters(text):
    """Strip punctuation and other symbols from ``text``.

    Word characters (letters, digits, underscore) and whitespace are kept;
    every other character is deleted without a replacement.
    """
    not_word_or_space = r'[^\w\s]'
    stripped = re.sub(not_word_or_space, '', text)
    return stripped

def normalize_case(text):
    """Return the lowercase form of ``text``."""
    lowered = text.lower()
    return lowered

def clean_text(text):
    """Run the full cleaning sequence on one comment string.

    The steps are applied in this order: URL removal, non-ASCII removal,
    digit removal, special-character removal, lowercasing. Finally, runs of
    whitespace are collapsed to single spaces and leading/trailing whitespace
    is trimmed.
    """
    # Order matters: URLs must be removed while their punctuation is still present.
    cleaning_steps = (
        remove_urls,
        remove_non_ascii,
        remove_digits,
        remove_special_characters,
        normalize_case,
    )
    for step in cleaning_steps:
        text = step(text)

    # split() with no argument splits on any whitespace run and drops empty pieces
    tokens = text.split()
    return ' '.join(tokens)

# Clean every comment with clean_text and overwrite the 'comment' column with the result
cleaned_comments = df['comment'].apply(clean_text)
df['comment'] = cleaned_comments

In [28]:
# Features are the comment text, targets are the encoded labels
X, y = df['comment'], df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [29]:


# Pipeline: TF-IDF features (English stop words removed) feeding a linear-kernel SVM
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
svc_step = ('svc', SVC(kernel='linear'))
pipeline = Pipeline(steps=[tfidf_step, svc_step])

# Fit the vectorizer and the classifier together on the training split
pipeline.fit(X_train, y_train)

# Predict the held-out split and report the fraction of correct predictions
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy}')

Model Accuracy: 0.874552543377642


In [30]:
# Persist the fitted pipeline (vectorizer and classifier together) to disk
model_path = 'sentiment_detection_model.pkl'
joblib.dump(pipeline, model_path)

['sentiment_detection_model.pkl']

In [31]:
# Load the model from the file.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load model files you created yourself or otherwise trust.
model = joblib.load('sentiment_detection_model.pkl')

# Create a DataFrame with new comments
data = {
    'comment': [
        'The product is fantastic and exceeded my expectations!',
        'I am not satisfied with the quality of the item.',
        'Great service, but the delivery was late.',
        'The experience was okay, nothing special.',
        'Absolutely love this! Will definitely recommend.',
        'The item arrived damaged and had to be returned.',
        'Customer support was very helpful and responsive.',
        'I am disappointed with the product. It did not match the description.',
        'Highly recommend this to anyone looking for a quality product.',
        'The product is decent but could be improved.',
        'Excellent quality and fast shipping.',
        'Not worth the price. I expected better.',
        'Perfect! Exactly what I needed.',
        'The service was slow, but the product is good.',
        'I will not buy this again. It was a waste of money.',
        'Very happy with the purchase. It was as described.',
        'The product exceeded my expectations. Great buy!',
        'I had a great shopping experience overall.'
    ]
}

new_data = pd.DataFrame(data)

# Clean the new comments with the same clean_text() that was applied to the
# training data. Re-implementing a subset of the steps here (as was done
# before) skips URL removal, non-ASCII removal and whitespace collapsing, so
# the model would see text prepared differently from what it was trained on.
new_data['comment'] = new_data['comment'].apply(clean_text)

# Predict using the loaded model; the outputs are the integer codes from
# label_mapping (0 = negative, 1 = neutral, 2 = positive)
new_predictions = model.predict(new_data['comment'])

# Add predictions to the new data
new_data['sentiment_prediction'] = new_predictions

print(new_data.head(20))


                                              comment  sentiment_prediction
0   the product is fantastic and exceeded my expec...                     2
1     i am not satisfied with the quality of the item                     2
2             great service but the delivery was late                     2
3             the experience was okay nothing special                     2
4      absolutely love this will definitely recommend                     2
5     the item arrived damaged and had to be returned                     0
6    customer support was very helpful and responsive                     2
7   i am disappointed with the product it did not ...                     0
8   highly recommend this to anyone looking for a ...                     2
9         the product is decent but could be improved                     2
10                excellent quality and fast shipping                     2
11              not worth the price i expected better                     2
12          