# BAG OF WORD FEATURE EXTRACTION



## Importing Libraries and Loading Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = '/content/drive/MyDrive/SEM 5/DS PROJ/DSP/labeled_tweets.csv'
data = pd.read_csv(file_path)

# Mapping sentiment to numerical values
sentiment_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
data['sentiment_numerical'] = data['sentiment'].map(sentiment_mapping)

# Select the text and sentiment columns
text_column = 'cleaned_text'
label_column = 'sentiment_numerical'
texts = data[text_column].astype(str)
labels = data[label_column]

# Display basic stats about the dataset
print("\nDataset Info:")
print(data.info())
print(labels)

print("\nNumber of Samples:", len(data))
print("Number of Positive Labels:", sum(labels == 1 ))
print("Number of Negative Labels:", sum(labels == -1 ))
print("Number of Neutral Labels:", sum(labels == 0 ))



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8850 entries, 0 to 8849
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   conversation_id_str  8850 non-null   float64
 1   favorite_count       8850 non-null   int64  
 2   full_text            8850 non-null   object 
 3   lang                 8850 non-null   object 
 4   reply_count          8850 non-null   int64  
 5   retweet_count        8850 non-null   int64  
 6   smartphone           8850 non-null   object 
 7   Month                8850 non-null   int64  
 8   Year                 8850 non-null   int64  
 9   cleaned_text         8850 non-null   object 
 10  sentiment_scores     8850 non-null   object 
 11  compound             8850 non-null   float64
 12  sentiment            8850 non-null   object 
 13  sentiment_numerical  8850 non-null   int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 968.1+ KB
None
0      -1

Feature extraction is done using Bag of Words (BOW) representation to convert text into numerical data.

## Bag of Words (BOW):

- Converts text into a matrix of token counts.
- Each row represents a document, and each column corresponds to a token.
- The CountVectorizer is used to create the BOW matrix, and the shape of the matrix (rows, features) is printed.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams and bigrams
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))  # (1, 2) means unigrams and bigrams
bow_matrix = bow_vectorizer.fit_transform(texts)

print("BOW with Unigrams and Bigrams Shape:", bow_matrix.shape)


BOW with Unigrams and Bigrams Shape: (8850, 103583)


## Handling Imbalanced Data with SMOTE



```
Number of Samples: 8850
Number of Positive Labels: 4047
Number of Negative Labels: 2194
Number of Neutral Labels: 2609
```
Number of positive labels higher than negative and neutral label, thus it might lead to biased model performance, where the model may favor the majority class (positive) and underperform on the minority classes (negative & neutral)



In [None]:
# Apply SMOTE to balance the classes in the training set
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(bow_matrix, labels)



In [None]:
print("\nNumber of Samples:", len(data))
print("Number of Positive Labels:", sum(y_train_resampled == 1 ))
print("Number of Negative Labels:", sum(y_train_resampled == -1 ))
print("Number of Neutral Labels:", sum(y_train_resampled == 0 ))


Number of Samples: 8850
Number of Positive Labels: 4047
Number of Negative Labels: 4047
Number of Neutral Labels: 4047


## Train-Test Split

In [None]:
# To split dataset to train and tes
from sklearn.model_selection import train_test_split
# Check the class distribution after resampling
from collections import Counter

X_train, X_test, y_train, y_test = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.2, random_state=42
)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)

resampled_distribution = Counter(y_train)
print("Class distribution after resampling:")
print(resampled_distribution)

Training Set Shape: (9712, 103583)
Testing Set Shape: (2429, 103583)
Class distribution after resampling:
Counter({1: 3256, -1: 3246, 0: 3210})


```
Number of Samples: 8850
Number of Positive Labels: 4047
Number of Negative Labels: 2194
Number of Neutral Labels: 2609
```

# MODEL TRAINING & EVALUATION

In [None]:
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train the Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Make predictions
y_pred_nb = nb_model.predict(X_test)

## Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC

# Train the SVM model
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Predictions and decision scores
y_pred_svm = svm_model.predict(X_test)



## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predictions and probabilities
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Define a function to calculate Accuracy and F1-Score
def evaluate_model(model_name, y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')  # Consistent usage of weighted F1-Score
    print(f"\n{model_name} Evaluation:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(classification_report(y_test, y_pred))
    return accuracy, f1


# Evaluate Models
acc_nb, f1_nb = evaluate_model("Naive Bayes", y_test, y_pred_nb)
acc_svm, f1_svm = evaluate_model("Support Vector Machine", y_test, y_pred_svm)
acc_dt, f1_dt = evaluate_model("Decision Tree", y_test, y_pred_dt)

model_results = pd.DataFrame({
    "Model": ["Naive Bayes", "SVM", "Decision Tree"],
    "Accuracy": [acc_nb, acc_svm, acc_dt],
    "F1-Score": [f1_nb, f1_svm, f1_dt]
})

print("\nSummary of Model Performance:")
print(model_results)


Naive Bayes Evaluation:
Accuracy: 0.57
F1-Score: 0.55
              precision    recall  f1-score   support

          -1       0.57      0.68      0.62       801
           0       0.79      0.28      0.41       837
           1       0.52      0.77      0.62       791

    accuracy                           0.57      2429
   macro avg       0.63      0.58      0.55      2429
weighted avg       0.63      0.57      0.55      2429


Support Vector Machine Evaluation:
Accuracy: 0.72
F1-Score: 0.72
              precision    recall  f1-score   support

          -1       0.68      0.76      0.72       801
           0       0.68      0.69      0.69       837
           1       0.82      0.71      0.76       791

    accuracy                           0.72      2429
   macro avg       0.73      0.72      0.72      2429
weighted avg       0.73      0.72      0.72      2429


Decision Tree Evaluation:
Accuracy: 0.65
F1-Score: 0.65
              precision    recall  f1-score   support

     

## Cross Validation and Hyperparameter Tuning on Best Model (SVM)

---



In [None]:
# Define the parameter grid for LinearSVC
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'max_iter': [1000, 5000]  # Maximum number of iterations
}

# Initialize the SVM model
svm_model = LinearSVC(random_state=42)

# Initialize GridSearchCV with F1-score as part of scoring metrics
grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1_weighted',  # Scoring changed to F1-Score for better optimization
    verbose=1
)

# Fit GridSearchCV
grid_search_svm.fit(X_train, y_train)

# Display the best parameters and best F1-Score
print("Best Parameters for SVM:", grid_search_svm.best_params_)
print("Best Cross-Validated F1-Score: {:.2f}%".format(grid_search_svm.best_score_ * 100))

# Evaluate the best model on the test set
best_svm_model = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm_model.predict(X_test)

# Final Model Evaluation
print("\nTest Set Evaluation with Tuned SVM:")
print(classification_report(y_test, y_pred_best_svm))
test_accuracy = accuracy_score(y_test, y_pred_best_svm)
test_f1_score = f1_score(y_test, y_pred_best_svm, average='weighted')

# Print Accuracy and F1-Score
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(f"Test Set F1-Score: {test_f1_score:.2f}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best Parameters for SVM: {'C': 0.1, 'max_iter': 1000}
Best Cross-Validated F1-Score: 71.91%

Test Set Evaluation with Tuned SVM:
              precision    recall  f1-score   support

          -1       0.71      0.67      0.69       801
           0       0.66      0.76      0.71       837
           1       0.81      0.74      0.77       791

    accuracy                           0.72      2429
   macro avg       0.73      0.72      0.72      2429
weighted avg       0.73      0.72      0.72      2429

Test Set Accuracy: 0.72
Test Set F1-Score: 0.72


## Testing the sentiment prediction using Best Model and TFIDF Vectorizer

In [None]:
!pip install contractions
import re # Regular, to peform text cleaning (remove urls, special characters and extra spaces)
import nltk # NLTK is used for NLP process mainly in text preprocessing (Tokenization, stopword removal, lemmatization, POS taagging)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from contractions import fix #To expand contracted words like "can't" to "cannot".

# Download necessary NLTK data
nltk.download('punkt_tab') # for tokenizing sentences or words
nltk.download('punkt')
nltk.download('wordnet') # For Wordnet, used in lemmatization to access synonyms, definations, and for lemmatization
nltk.download('omw-1.4')  # WordNet multilingual extensions, to support multilingual
nltk.download('averaged_perceptron_tagger_eng') # For part-of-speech tagging
nltk.download('stopwords') # For stopword removal during text preprocessing

# List of custom stopwords to retain from removing, including "no" and "not"
default_stopwords = set(stopwords.words('english'))
custom_stopwords = default_stopwords - {'no', 'not', 'nor', 'none', 'never', 'neither', 'without', 'against',
                                        'but', 'however', 'though', 'although',
                                        'because', 'since', 'due to', 'with'}


# Words that should not be lemmatized
non_lemmatizable_words = {'iphone16', 'iphone16plus', 'iphone16pro', 'iphone16promax', "ios", 'iphone15', 'iphone15plus', 'iphone15pro', 'iphone15promax',
                          'samsunggalaxy23', 'samsunggalaxys23plus', 'samsunggalaxys23ultra', 'samsunggalaxy24', 'samsunggalaxys24plus', 'samsunggalaxys24ultra',
                          'ios17', 'ios18', 'dynamic island', 'a17bionic', 'a18chip', 'usb-c', 'lightning port', 'pro motion', 'ceramic shield',
                          'snapdragon', 'exynos', '120hz', 'amozed', 'one ui',
                          '5g', 'refresh rate', 'fast charging', 'screen size'
                          }

# Function to map POS tags to WordNet tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

# Function for text preprocessing
def preprocess_text(text):
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Expand contractions (assuming `fix()` function is defined elsewhere)
    text = fix(text)
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove special characters and punctuation, but keep numbers and dots
    text = re.sub(r'[^a-zA-Z0-9.\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove redundant whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize and POS tag
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()

    # Enhanced: Edge case handling for empty text
    if not tokens:
        return ''  # Return an empty string if no valid tokens remain

    # Processing tokens with enhanced handling
    processed_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        if word.isalpha() and word not in custom_stopwords and word not in non_lemmatizable_words
        else word
        for word, tag in pos_tags
    ]

    return ' '.join(processed_tokens)


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Function to predict sentiment for new tweets with preprocessing
def predict_tweet_sentiment(tweet_text):
    cleaned_text = preprocess_text(tweet_text)
    tweet_vectorized = bow_vectorizer.transform([cleaned_text])
    prediction = best_svm_model.predict(tweet_vectorized)
    sentiment_mapping_reverse = {1: 'Positive', 0: 'Neutral', -1: 'Negative'}
    return sentiment_mapping_reverse[prediction[0]]

# Example predictions with preprocessing
new_tweet_1 = "iphone has no security and very ugly design"
new_tweet_2 = "iphone15 is affordable"
new_tweet_3 = "iphone is expensive"

predicted_sentiment_1 = predict_tweet_sentiment(new_tweet_1)
predicted_sentiment_2 = predict_tweet_sentiment(new_tweet_2)
predicted_sentiment_3 = predict_tweet_sentiment(new_tweet_3)

print("\nNew Tweet Sentiment Predictions:")
print(f"Tweet: {preprocess_text(new_tweet_1)}")
print(f"Predicted Sentiment: {predicted_sentiment_1}")
print(f"\nTweet: {preprocess_text(new_tweet_2)}")
print(f"Predicted Sentiment: {predicted_sentiment_2}")
print(f"\nTweet: {preprocess_text(new_tweet_3)}")
print(f"Predicted Sentiment: {predicted_sentiment_3}")


New Tweet Sentiment Predictions:
Tweet: iphone has no security and very ugly design
Predicted Sentiment: Negative

Tweet: iphone15 is affordable
Predicted Sentiment: Negative

Tweet: iphone is expensive
Predicted Sentiment: Neutral


## Model Saving and Loading

In [None]:
import joblib
# Save the model and vectorizer
joblib.dump(best_svm_model, 'svm_model.pkl')
joblib.dump(bow_vectorizer, 'vectorizer.pkl')
print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
