In [19]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from keras.models import Sequential , load_model
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

### Data Exploration

In [20]:
data = pd.read_csv('sentimentdataset (Project 1).csv')
print(data.head())

  Source  ID                                            Message  Target
0   Yelp   0                                 Crust is not good.       0
1   Yelp   1          Not tasty and the texture was just nasty.       0
2   Yelp   2  Stopped by during the late May bank holiday of...       1
3   Yelp   3  The selection on the menu was great and so wer...       1
4   Yelp   4     Now I am getting angry and I want my damn pho.       0


In [21]:
# Check distribution of samples in each class
class_distribution = data['Target'].value_counts()
print(class_distribution)

Target
1    1385
0    1360
Name: count, dtype: int64


### Data Preprocessing

In [22]:
# Dropping 'Source' and 'ID' columns
data = data.drop(['Source', 'ID'], axis=1)
print(data.head())

                                             Message  Target
0                                 Crust is not good.       0
1          Not tasty and the texture was just nasty.       0
2  Stopped by during the late May bank holiday of...       1
3  The selection on the menu was great and so wer...       1
4     Now I am getting angry and I want my damn pho.       0


In [23]:
# Load SpaCy's English model
spacy_model = spacy.load('en_core_web_sm')

# Function for text preprocessing (stop words removal and lemmatization)
def preprocess_text(text):
    doc = spacy_model(text)
    processed_text = ' '.join([token.lemma_ for token in doc if not token.is_stop])
    return processed_text

# Apply text preprocessing to the 'Message' column
data['Processed_Message'] = data['Message'].apply(preprocess_text)
print(data[['Message', 'Processed_Message']].head())

                                             Message  \
0                                 Crust is not good.   
1          Not tasty and the texture was just nasty.   
2  Stopped by during the late May bank holiday of...   
3  The selection on the menu was great and so wer...   
4     Now I am getting angry and I want my damn pho.   

                                   Processed_Message  
0                                       crust good .  
1                              tasty texture nasty .  
2  stop late bank holiday Rick Steve recommendati...  
3                       selection menu great price .  
4                          get angry want damn pho .  


In [24]:
# Initialize the Tf-idf vectorizer
tfidf_model = TfidfVectorizer()

# Creating feature and target variables
X = data['Processed_Message']
y = data['Target']

# Transform the text data into numerical vectors
X_tfidf = tfidf_model.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (2196, 4153)
Shape of X_test: (549, 4153)


## Classification and Comparison

### Initial Experiment (LinearSVC)

In [25]:
# Initialize LinearSVC
svc_model = LinearSVC(dual=True)

# Define hyperparameters for grid search
hyperparam_grid = {'C': [0.1, 1, 10, 100]}

# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(svc_model, hyperparam_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_parameters = grid_search.best_params_
best_svc = grid_search.best_estimator_

# Predict using the best model
y_pred_svc = best_svc.predict(X_test)

# Classification report for LinearSVC
print("Best Parameters:", best_parameters)
print("\nClassification Report for LinearSVC:")
print(classification_report(y_test, y_pred_svc))

Best Parameters: {'C': 1}

Classification Report for LinearSVC:
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       265
           1       0.80      0.75      0.77       284

    accuracy                           0.77       549
   macro avg       0.77      0.77      0.77       549
weighted avg       0.77      0.77      0.77       549



### Subsequent Experiment (ANN)

In [26]:
# Define the ANN model
ann_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  
])

# Compile the model
ann_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Define a callback to save the best model based on validation loss
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min', verbose=1)

# Train the model with the callback
history = ann_model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.1, callbacks=[checkpoint], verbose=1)

# Evaluate the model on the test set
y_pred_ann = (ann_model.predict(X_test.toarray()) > 0.5).astype(int)

# Classification report for ANN
print("\nClassification Report for ANN:")
print(classification_report(y_test, y_pred_ann))

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.68538, saving model to best_model.keras
Epoch 2/10
Epoch 2: val_loss improved from 0.68538 to 0.61458, saving model to best_model.keras
Epoch 3/10
Epoch 3: val_loss improved from 0.61458 to 0.49227, saving model to best_model.keras
Epoch 4/10
Epoch 4: val_loss improved from 0.49227 to 0.45578, saving model to best_model.keras
Epoch 5/10
Epoch 5: val_loss improved from 0.45578 to 0.44237, saving model to best_model.keras
Epoch 6/10
Epoch 6: val_loss did not improve from 0.44237
Epoch 7/10
Epoch 7: val_loss did not improve from 0.44237
Epoch 8/10
Epoch 8: val_loss did not improve from 0.44237
Epoch 9/10
Epoch 9: val_loss did not improve from 0.44237
Epoch 10/10
Epoch 10: val_loss did not improve from 0.44237

Classification Report for ANN:
              precision    recall  f1-score   support

           0       0.76      0.83      0.79       265
           1       0.82      0.76      0.79       284

    accuracy                        

### Testing the saved best model

In [27]:
# Sample sentences as new data
new_sentences = [
    "The Burrittos Blah!",
    "The food, amazing.",
    "Service is also cute.",
    "I could care less... The interior is just beautiful.",
    "So they performed.",
    "That's right....the red velvet cake.....ohhh this stuff is so good.",
    "- They never brought a salad we asked for.",
    "This hole in the wall has great Mexican street tacos, and friendly staff.",
    "Took an hour to get our food only 4 tables in restaurant my food was Luke warm, Our server was running around like he was totally overwhelmed.",
    "The worst was the salmon sashimi.",
    "Also there are combos like a burger, fries, and beer for 23 which is a decent deal.",
    "This was like the final blow!",
    "I found this place by accident and I could not be happier.",
    "Overall, I like this place a lot.",
    "The only redeeming quality of the restaurant was that it was very inexpensive.",
    "Ample portions and good prices.",
    "Poor service, the waiter made me feel like I was stupid every time he came to the table.",
    "My first visit to Hiro was a delight!",
    "Service sucks.",
    "The shrimp tender and moist."
]


# Load the saved model
loaded_model = load_model('best_model.keras')

# Define and fit a Tf-idf vectorizer on the existing data and use it to transform new data
tfidf_model = TfidfVectorizer()
X_tfidf = tfidf_model.fit_transform(data['Processed_Message'])  # 'data' is your original dataset

new_sentences_tfidf = tfidf_model.transform(new_sentences)

# Make predictions on new data
predictions = (loaded_model.predict(new_sentences_tfidf.toarray()) > 0.5).astype(int)

# Display predictions positive if 1 negative if 0
for sentence, prediction in zip(new_sentences, predictions):
    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"Sentence: {sentence} - Predicted Sentiment: {sentiment}")


Sentence: The Burrittos Blah! - Predicted Sentiment: Negative
Sentence: The food, amazing. - Predicted Sentiment: Positive
Sentence: Service is also cute. - Predicted Sentiment: Positive
Sentence: I could care less... The interior is just beautiful. - Predicted Sentiment: Positive
Sentence: So they performed. - Predicted Sentiment: Negative
Sentence: That's right....the red velvet cake.....ohhh this stuff is so good. - Predicted Sentiment: Positive
Sentence: - They never brought a salad we asked for. - Predicted Sentiment: Positive
Sentence: This hole in the wall has great Mexican street tacos, and friendly staff. - Predicted Sentiment: Positive
Sentence: Took an hour to get our food only 4 tables in restaurant my food was Luke warm, Our server was running around like he was totally overwhelmed. - Predicted Sentiment: Negative
Sentence: The worst was the salmon sashimi. - Predicted Sentiment: Positive
Sentence: Also there are combos like a burger, fries, and beer for 23 which is a dece